mirror of
https://github.com/xmrig/xmrig.git
synced 2026-04-18 05:22:28 -04:00
Move files.
This commit is contained in:
102
src/crypto/cn/Asm.cpp
Normal file
102
src/crypto/cn/Asm.cpp
Normal file
@@ -0,0 +1,102 @@
|
||||
/* XMRig
|
||||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
* Copyright 2018 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
|
||||
#ifdef _MSC_VER
|
||||
# define strncasecmp _strnicmp
|
||||
# define strcasecmp _stricmp
|
||||
#endif
|
||||
|
||||
|
||||
#include "crypto/cn/Asm.h"
|
||||
#include "rapidjson/document.h"
|
||||
|
||||
|
||||
/*
 * Textual names of the assembly back-ends.
 *
 * The table is indexed by the numeric value of xmrig::Assembly: parse()
 * converts a matching index back into an Assembly and toString() indexes the
 * table with one. Both the strings and the pointers are const so the table
 * cannot be modified at run time.
 */
static const char *const asmNames[] = {
    "none",
    "auto",
    "intel",
    "ryzen",
    "bulldozer"
};
|
||||
|
||||
|
||||
/**
 * Convert an assembly back-end name into its xmrig::Assembly value.
 *
 * The comparison against the entries of asmNames is case-insensitive.
 *
 * @param assembly      NUL-terminated name; must not be null (asserted in
 *                      debug builds, tolerated in release builds).
 * @param defaultValue  value returned when the name is null or unknown.
 * @return the Assembly whose numeric value equals the index of the matching
 *         entry in asmNames, or defaultValue.
 */
xmrig::Assembly xmrig::Asm::parse(const char *assembly, Assembly defaultValue)
{
    constexpr size_t size = sizeof(asmNames) / sizeof(asmNames[0]);

    // Checked at compile time so a mismatch between the enum and the name
    // table cannot slip through in builds where assert() is disabled.
    static_assert(static_cast<size_t>(ASM_MAX) == size, "asmNames must contain exactly ASM_MAX entries");

    assert(assembly != nullptr);
    if (assembly == nullptr) {
        return defaultValue;
    }

    for (size_t i = 0; i < size; i++) {
        if (strcasecmp(assembly, asmNames[i]) == 0) {
            return static_cast<Assembly>(i);
        }
    }

    return defaultValue;
}
|
||||
|
||||
|
||||
/**
 * Convert a JSON value into an xmrig::Assembly.
 *
 * A string is looked up by name, a boolean is mapped through parse(bool),
 * and any other JSON type yields defaultValue.
 *
 * @param value         JSON value to interpret.
 * @param defaultValue  result for unsupported types and unknown names.
 */
xmrig::Assembly xmrig::Asm::parse(const rapidjson::Value &value, Assembly defaultValue)
{
    if (value.IsString()) {
        return parse(value.GetString(), defaultValue);
    }

    return value.IsBool() ? parse(value.GetBool()) : defaultValue;
}
|
||||
|
||||
|
||||
/**
 * Return the textual name of an assembly back-end.
 *
 * @param assembly  value to translate; expected to be a valid index into
 *                  asmNames (asserted in debug builds).
 * @return pointer to a static string, or nullptr when the value lies outside
 *         the name table (previously this read past the end of the array).
 */
const char *xmrig::Asm::toString(Assembly assembly)
{
    constexpr size_t count = sizeof(asmNames) / sizeof(asmNames[0]);
    const size_t index     = static_cast<size_t>(assembly);

    assert(index < count);

    return index < count ? asmNames[index] : nullptr;
}
|
||||
|
||||
|
||||
/**
 * Serialize an xmrig::Assembly as a JSON value.
 *
 * ASM_NONE becomes `false`, ASM_AUTO becomes `true`, and every other value
 * becomes a string that references the static name returned by toString()
 * (StringRef does not copy the characters).
 */
rapidjson::Value xmrig::Asm::toJSON(Assembly assembly)
{
    using namespace rapidjson;

    switch (assembly) {
    case ASM_NONE:
        return Value(false);

    case ASM_AUTO:
        return Value(true);

    default:
        return Value(StringRef(toString(assembly)));
    }
}
|
||||
50
src/crypto/cn/Asm.h
Normal file
50
src/crypto/cn/Asm.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/* XMRig
|
||||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_ASM_H
|
||||
#define XMRIG_ASM_H
|
||||
|
||||
|
||||
#include "common/xmrig.h"
|
||||
#include "rapidjson/fwd.h"
|
||||
|
||||
|
||||
namespace xmrig {
|
||||
|
||||
|
||||
class Asm
|
||||
{
|
||||
public:
|
||||
static Assembly parse(const char *assembly, Assembly defaultValue = ASM_AUTO);
|
||||
static Assembly parse(const rapidjson::Value &value, Assembly defaultValue = ASM_AUTO);
|
||||
static const char *toString(Assembly assembly);
|
||||
static rapidjson::Value toJSON(Assembly assembly);
|
||||
|
||||
inline static Assembly parse(bool enable) { return enable ? ASM_AUTO : ASM_NONE; }
|
||||
};
|
||||
|
||||
|
||||
} /* namespace xmrig */
|
||||
|
||||
|
||||
#endif /* XMRIG_ASM_H */
|
||||
64
src/crypto/cn/CryptoNight.h
Normal file
64
src/crypto/cn/CryptoNight.h
Normal file
@@ -0,0 +1,64 @@
|
||||
/* XMRig
|
||||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
||||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_CRYPTONIGHT_H
|
||||
#define XMRIG_CRYPTONIGHT_H
|
||||
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// ABI_ATTRIBUTE is appended to the main-loop function pointer type declared below.
// It expands to nothing when building with MSVC or when XMRIG_ARM is defined, and to
// the GCC-style __attribute__((ms_abi)) in every other configuration.
#if defined _MSC_VER || defined XMRIG_ARM
|
||||
# define ABI_ATTRIBUTE
|
||||
#else
|
||||
# define ABI_ATTRIBUTE __attribute__((ms_abi))
|
||||
#endif
|
||||
|
||||
|
||||
// Forward declaration so the function pointer type can mention the context
// before the struct itself is defined.
struct cryptonight_ctx;
|
||||
// Pointer to a main-loop routine that receives an array of context pointers.
// NOTE(review): presumably targets generated/hand-written assembly (the struct stores one
// in `generated_code`) - confirm against the code that assigns it.
typedef void(*cn_mainloop_fun_ms_abi)(cryptonight_ctx**) ABI_ATTRIBUTE;
|
||||
|
||||
|
||||
/*
 * Identifies the (variant, height) pair associated with a piece of
 * generated code; stored in cryptonight_ctx::generated_code_data.
 */
struct cryptonight_r_data {
    int variant;
    uint64_t height;

    /* True only when both the variant and the height equal the stored ones. */
    bool match(const int v, const uint64_t h) const
    {
        if (v != variant) {
            return false;
        }

        return h == height;
    }
};
|
||||
|
||||
|
||||
// Per-hash working context handed to the hashing routines as `cryptonight_ctx **`.
// NOTE(review): field order, sizes and alignment are presumably relied upon by code outside
// this header (e.g. routines reached through cn_mainloop_fun_ms_abi) - confirm before changing.
struct cryptonight_ctx {
|
||||
// 16-byte aligned hash state; the hashing routines write the Keccak output here.
alignas(16) uint8_t state[224];
|
||||
// Pointer to the scratchpad buffer; the pointer member itself is 16-byte aligned.
alignas(16) uint8_t *memory;
|
||||
|
||||
// Not referenced anywhere in this header. NOTE(review): looks like layout padding - confirm.
uint8_t unused[40];
|
||||
// NOTE(review): presumably points at the software AES lookup table - confirm where it is set.
const uint32_t *saes_table;
|
||||
|
||||
// Main-loop routine associated with this context.
cn_mainloop_fun_ms_abi generated_code;
|
||||
// Variant/height pair describing what `generated_code` was produced for.
cryptonight_r_data generated_code_data;
|
||||
};
|
||||
|
||||
|
||||
#endif /* XMRIG_CRYPTONIGHT_H */
|
||||
844
src/crypto/cn/CryptoNight_arm.h
Normal file
844
src/crypto/cn/CryptoNight_arm.h
Normal file
@@ -0,0 +1,844 @@
|
||||
/* XMRig
|
||||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
* Copyright 2016 Imran Yusuff <https://github.com/imranyusuff>
|
||||
* Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
||||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_CRYPTONIGHT_ARM_H
|
||||
#define XMRIG_CRYPTONIGHT_ARM_H
|
||||
|
||||
|
||||
#include "common/crypto/keccak.h"
|
||||
#include "crypto/common/portable/mm_malloc.h"
|
||||
#include "crypto/cn/CryptoNight_constants.h"
|
||||
#include "crypto/cn/CryptoNight_monero.h"
|
||||
#include "crypto/cn/CryptoNight.h"
|
||||
#include "crypto/cn/soft_aes.h"
|
||||
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "crypto/cn/c_groestl.h"
|
||||
#include "crypto/cn/c_blake256.h"
|
||||
#include "crypto/cn/c_jh.h"
|
||||
#include "crypto/cn/c_skein.h"
|
||||
}
|
||||
|
||||
|
||||
/*
 * Final-hash adapter with the common (input, len, output) signature used by
 * extra_hashes; forwards to blake256_hash(), which takes the output first.
 */
static inline void do_blake_hash(const uint8_t *input, size_t len, uint8_t *output)
{
    blake256_hash(output, input, len);
}
|
||||
|
||||
|
||||
/*
 * Final-hash adapter with the common (input, len, output) signature used by
 * extra_hashes; groestl() receives the length multiplied by 8
 * (presumably a bit count - confirm against c_groestl.h).
 */
static inline void do_groestl_hash(const uint8_t *input, size_t len, uint8_t *output)
{
    const size_t scaled_len = len * 8;

    groestl(input, scaled_len, output);
}
|
||||
|
||||
|
||||
/*
 * Final-hash adapter with the common (input, len, output) signature used by
 * extra_hashes; jh_hash() receives 256 as its first argument and the length
 * multiplied by 8 (presumably digest size and input size in bits - confirm
 * against c_jh.h).
 */
static inline void do_jh_hash(const uint8_t *input, size_t len, uint8_t *output)
{
    const size_t scaled_len = 8 * len;

    jh_hash(32 * 8, input, scaled_len, output);
}
|
||||
|
||||
|
||||
/*
 * Final-hash adapter with the common (input, len, output) signature used by
 * extra_hashes.
 *
 * xmr_skein() is called without a length, so `len` is deliberately ignored.
 * NOTE(review): xmr_skein() presumably assumes a fixed input size; the only
 * caller in this header passes 200 - confirm before calling with other sizes.
 */
static inline void do_skein_hash(const uint8_t *input, size_t len, uint8_t *output)
{
    (void) len; // kept only for signature compatibility; silences -Wunused-parameter

    xmr_skein(input, output);
}
|
||||
|
||||
|
||||
// Table of the four final-hash adapters in the order blake, groestl, jh, skein.
// cryptonight_single_hash() picks an entry with the two low bits of the first state byte
// and calls it with the state, a length of 200 and the output buffer.
void (* const extra_hashes[4])(const uint8_t *, size_t, uint8_t *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
|
||||
|
||||
|
||||
/*
 * NEON stand-in for the SSE2 intrinsic of the same name: builds a 128-bit
 * vector whose first 64-bit half is created from `b` and whose second half is
 * created from `a`.
 */
static inline __attribute__((always_inline)) __m128i _mm_set_epi64x(const uint64_t a, const uint64_t b)
{
    const auto first_half  = vcreate_u64(b);
    const auto second_half = vcreate_u64(a);

    return vcombine_u64(first_half, second_half);
}
|
||||
|
||||
|
||||
#if __ARM_FEATURE_CRYPTO
|
||||
static inline __attribute__((always_inline)) __m128i _mm_aesenc_si128(__m128i v, __m128i rkey)
|
||||
{
|
||||
alignas(16) const __m128i zero = { 0 };
|
||||
return veorq_u8(vaesmcq_u8(vaeseq_u8(v, zero)), rkey );
|
||||
}
|
||||
#else
|
||||
static inline __attribute__((always_inline)) __m128i _mm_aesenc_si128(__m128i v, __m128i rkey)
|
||||
{
|
||||
alignas(16) const __m128i zero = { 0 };
|
||||
return zero;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* this one was not implemented yet so here it is */
|
||||
static inline __attribute__((always_inline)) uint64_t _mm_cvtsi128_si64(__m128i a)
|
||||
{
|
||||
return vgetq_lane_u64(a, 0);
|
||||
}
|
||||
|
||||
|
||||
#if defined (__arm64__) || defined (__aarch64__)
/*
 * 64 x 64 -> 128 bit unsigned multiplication using the compiler's 128-bit
 * integer type. Returns the low 64 bits and stores the high 64 bits in *hi.
 */
static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi)
{
    const unsigned __int128 product = static_cast<unsigned __int128>(a) * static_cast<unsigned __int128>(b);

    *hi = static_cast<uint64_t>(product >> 64);

    return static_cast<uint64_t>(product);
}
#else
/*
 * 64 x 64 -> 128 bit unsigned multiplication built from 32-bit halves.
 * Returns the low 64 bits and stores the high 64 bits in *product_hi.
 *
 * With multiplier = xh * 2^32 + xl and multiplicand = yh * 2^32 + yl:
 *   product = xh*yh * 2^64 + (xh*yl + xl*yh) * 2^32 + xl*yl
 */
static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *product_hi)
{
    const uint64_t xh = multiplier >> 32;
    const uint64_t xl = multiplier & 0xFFFFFFFF;
    const uint64_t yh = multiplicand >> 32;
    const uint64_t yl = multiplicand & 0xFFFFFFFF;

    const uint64_t high_low = xh * yl;
    const uint64_t low_low  = xl * yl;

    // The sum of the two cross terms may wrap; remember the lost bit.
    const uint64_t cross       = high_low + (xl * yh);
    const uint64_t cross_carry = (cross < high_low) ? 1 : 0;

    // Adding the shifted cross term to the low product may wrap as well.
    const uint64_t product_lo = low_low + (cross << 32);
    const uint64_t low_carry  = (product_lo < low_low) ? 1 : 0;

    *product_hi = (xh * yh) + (cross >> 32) + (cross_carry << 32) + low_carry;

    return product_lo;
}
#endif
|
||||
|
||||
|
||||
// This will shift and xor tmp1 into itself as 4 32-bit vals such as
|
||||
// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
|
||||
static inline __m128i sl_xor(__m128i tmp1)
|
||||
{
|
||||
__m128i tmp4;
|
||||
tmp4 = _mm_slli_si128(tmp1, 0x04);
|
||||
tmp1 = _mm_xor_si128(tmp1, tmp4);
|
||||
tmp4 = _mm_slli_si128(tmp4, 0x04);
|
||||
tmp1 = _mm_xor_si128(tmp1, tmp4);
|
||||
tmp4 = _mm_slli_si128(tmp4, 0x04);
|
||||
tmp1 = _mm_xor_si128(tmp1, tmp4);
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
|
||||
template<uint8_t rcon>
|
||||
static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2)
|
||||
{
|
||||
__m128i xout1 = soft_aeskeygenassist<rcon>(*xout2);
|
||||
xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem
|
||||
*xout0 = sl_xor(*xout0);
|
||||
*xout0 = _mm_xor_si128(*xout0, xout1);
|
||||
xout1 = soft_aeskeygenassist<0x00>(*xout0);
|
||||
xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem
|
||||
*xout2 = sl_xor(*xout2);
|
||||
*xout2 = _mm_xor_si128(*xout2, xout1);
|
||||
}
|
||||
|
||||
|
||||
template<bool SOFT_AES>
|
||||
static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
|
||||
{
|
||||
__m128i xout0 = _mm_load_si128(memory);
|
||||
__m128i xout2 = _mm_load_si128(memory + 1);
|
||||
*k0 = xout0;
|
||||
*k1 = xout2;
|
||||
|
||||
soft_aes_genkey_sub<0x01>(&xout0, &xout2);
|
||||
*k2 = xout0;
|
||||
*k3 = xout2;
|
||||
|
||||
soft_aes_genkey_sub<0x02>(&xout0, &xout2);
|
||||
*k4 = xout0;
|
||||
*k5 = xout2;
|
||||
|
||||
soft_aes_genkey_sub<0x04>(&xout0, &xout2);
|
||||
*k6 = xout0;
|
||||
*k7 = xout2;
|
||||
|
||||
soft_aes_genkey_sub<0x08>(&xout0, &xout2);
|
||||
*k8 = xout0;
|
||||
*k9 = xout2;
|
||||
}
|
||||
|
||||
|
||||
template<bool SOFT_AES>
|
||||
static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
|
||||
{
|
||||
if (SOFT_AES) {
|
||||
*x0 = soft_aesenc((uint32_t*)x0, key);
|
||||
*x1 = soft_aesenc((uint32_t*)x1, key);
|
||||
*x2 = soft_aesenc((uint32_t*)x2, key);
|
||||
*x3 = soft_aesenc((uint32_t*)x3, key);
|
||||
*x4 = soft_aesenc((uint32_t*)x4, key);
|
||||
*x5 = soft_aesenc((uint32_t*)x5, key);
|
||||
*x6 = soft_aesenc((uint32_t*)x6, key);
|
||||
*x7 = soft_aesenc((uint32_t*)x7, key);
|
||||
}
|
||||
else {
|
||||
*x0 = _mm_aesenc_si128(*x0, key);
|
||||
*x1 = _mm_aesenc_si128(*x1, key);
|
||||
*x2 = _mm_aesenc_si128(*x2, key);
|
||||
*x3 = _mm_aesenc_si128(*x3, key);
|
||||
*x4 = _mm_aesenc_si128(*x4, key);
|
||||
*x5 = _mm_aesenc_si128(*x5, key);
|
||||
*x6 = _mm_aesenc_si128(*x6, key);
|
||||
*x7 = _mm_aesenc_si128(*x7, key);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
 * XORs every block with its successor, in place: x0 ^= x1, x1 ^= x2, ...,
 * x6 ^= x7, and finally x7 ^= the ORIGINAL value of x0.
 */
inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7)
{
    const __m128i original_x0 = x0; // x0 is overwritten first but needed last

    x0 = _mm_xor_si128(x0, x1);
    x1 = _mm_xor_si128(x1, x2);
    x2 = _mm_xor_si128(x2, x3);
    x3 = _mm_xor_si128(x3, x4);
    x4 = _mm_xor_si128(x4, x5);
    x5 = _mm_xor_si128(x5, x6);
    x6 = _mm_xor_si128(x6, x7);
    x7 = _mm_xor_si128(x7, original_x0);
}
|
||||
|
||||
|
||||
template<xmrig::Algo ALGO, size_t MEM, bool SOFT_AES>
|
||||
static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
|
||||
{
|
||||
__m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
|
||||
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
|
||||
|
||||
aes_genkey<SOFT_AES>(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
|
||||
|
||||
xin0 = _mm_load_si128(input + 4);
|
||||
xin1 = _mm_load_si128(input + 5);
|
||||
xin2 = _mm_load_si128(input + 6);
|
||||
xin3 = _mm_load_si128(input + 7);
|
||||
xin4 = _mm_load_si128(input + 8);
|
||||
xin5 = _mm_load_si128(input + 9);
|
||||
xin6 = _mm_load_si128(input + 10);
|
||||
xin7 = _mm_load_si128(input + 11);
|
||||
|
||||
if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
|
||||
for (size_t i = 0; i < 16; i++) {
|
||||
aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
|
||||
mix_and_propagate(xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
|
||||
aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
|
||||
_mm_store_si128(output + i + 0, xin0);
|
||||
_mm_store_si128(output + i + 1, xin1);
|
||||
_mm_store_si128(output + i + 2, xin2);
|
||||
_mm_store_si128(output + i + 3, xin3);
|
||||
_mm_store_si128(output + i + 4, xin4);
|
||||
_mm_store_si128(output + i + 5, xin5);
|
||||
_mm_store_si128(output + i + 6, xin6);
|
||||
_mm_store_si128(output + i + 7, xin7);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifndef XMRIG_NO_CN_GPU
|
||||
template<xmrig::Algo ALGO, size_t MEM>
|
||||
void cn_explode_scratchpad_gpu(const uint8_t *input, uint8_t *output)
|
||||
{
|
||||
constexpr size_t hash_size = 200; // 25x8 bytes
|
||||
alignas(16) uint64_t hash[25];
|
||||
|
||||
for (uint64_t i = 0; i < MEM / 512; i++)
|
||||
{
|
||||
memcpy(hash, input, hash_size);
|
||||
hash[0] ^= i;
|
||||
|
||||
xmrig::keccakf(hash, 24);
|
||||
memcpy(output, hash, 160);
|
||||
output += 160;
|
||||
|
||||
xmrig::keccakf(hash, 24);
|
||||
memcpy(output, hash, 176);
|
||||
output += 176;
|
||||
|
||||
xmrig::keccakf(hash, 24);
|
||||
memcpy(output, hash, 176);
|
||||
output += 176;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
template<xmrig::Algo ALGO, size_t MEM, bool SOFT_AES>
|
||||
static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
|
||||
{
|
||||
__m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
|
||||
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
|
||||
|
||||
aes_genkey<SOFT_AES>(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
|
||||
|
||||
xout0 = _mm_load_si128(output + 4);
|
||||
xout1 = _mm_load_si128(output + 5);
|
||||
xout2 = _mm_load_si128(output + 6);
|
||||
xout3 = _mm_load_si128(output + 7);
|
||||
xout4 = _mm_load_si128(output + 8);
|
||||
xout5 = _mm_load_si128(output + 9);
|
||||
xout6 = _mm_load_si128(output + 10);
|
||||
xout7 = _mm_load_si128(output + 11);
|
||||
|
||||
for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
|
||||
{
|
||||
xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
|
||||
xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
|
||||
xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
|
||||
xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
|
||||
xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
|
||||
xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
|
||||
xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
|
||||
xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
|
||||
|
||||
aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
|
||||
if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
|
||||
mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
|
||||
}
|
||||
}
|
||||
|
||||
if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
|
||||
for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
|
||||
xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
|
||||
xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
|
||||
xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
|
||||
xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
|
||||
xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
|
||||
xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
|
||||
xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
|
||||
xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
|
||||
|
||||
aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
|
||||
mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < 16; i++) {
|
||||
aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
|
||||
mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
|
||||
}
|
||||
}
|
||||
|
||||
_mm_store_si128(output + 4, xout0);
|
||||
_mm_store_si128(output + 5, xout1);
|
||||
_mm_store_si128(output + 6, xout2);
|
||||
_mm_store_si128(output + 7, xout3);
|
||||
_mm_store_si128(output + 8, xout4);
|
||||
_mm_store_si128(output + 9, xout5);
|
||||
_mm_store_si128(output + 10, xout6);
|
||||
_mm_store_si128(output + 11, xout7);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Table-driven AES-like round used by cryptonight_single_hash() for
 * VARIANT_TUBE.
 *
 * The input is inverted bitwise, then the four key words are updated one
 * after another from saes_table lookups. After each of the first three key
 * words is computed it is XORed back into the matching input word, so later
 * lookups see the modified bytes - the statement order below is significant.
 *
 * @return the four updated key words as one vector.
 */
static inline __m128i aes_round_tweak_div(const __m128i &in, const __m128i &key)
{
    alignas(16) uint32_t round_key[4];
    alignas(16) uint32_t words[4];

    const __m128i all_ones = _mm_set_epi64x(0xffffffffffffffff, 0xffffffffffffffff);

    _mm_store_si128(reinterpret_cast<__m128i*>(round_key), key);
    _mm_store_si128(reinterpret_cast<__m128i*>(words), _mm_xor_si128(in, all_ones));

    // Byte `index` of word `word`, read from the CURRENT contents of `words`.
    const auto byte_at = [&words](const size_t word, const size_t index) -> unsigned char {
        return reinterpret_cast<const unsigned char*>(&words[word])[index];
    };

    round_key[0] ^= saes_table[0][byte_at(0, 0)] ^ saes_table[1][byte_at(1, 1)] ^ saes_table[2][byte_at(2, 2)] ^ saes_table[3][byte_at(3, 3)];
    words[0] ^= round_key[0];

    round_key[1] ^= saes_table[0][byte_at(1, 0)] ^ saes_table[1][byte_at(2, 1)] ^ saes_table[2][byte_at(3, 2)] ^ saes_table[3][byte_at(0, 3)];
    words[1] ^= round_key[1];

    round_key[2] ^= saes_table[0][byte_at(2, 0)] ^ saes_table[1][byte_at(3, 1)] ^ saes_table[2][byte_at(0, 2)] ^ saes_table[3][byte_at(1, 3)];
    words[2] ^= round_key[2];

    round_key[3] ^= saes_table[0][byte_at(3, 0)] ^ saes_table[1][byte_at(0, 1)] ^ saes_table[2][byte_at(1, 2)] ^ saes_table[3][byte_at(2, 3)];

    return _mm_load_si128(reinterpret_cast<__m128i*>(round_key));
}
|
||||
|
||||
|
||||
template<xmrig::Variant VARIANT, xmrig::Variant BASE>
|
||||
static inline void cryptonight_monero_tweak(const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i& cx)
|
||||
{
|
||||
uint64_t* mem_out = (uint64_t*)&l[idx];
|
||||
|
||||
if (BASE == xmrig::VARIANT_2) {
|
||||
VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0));
|
||||
_mm_store_si128((__m128i *)mem_out, _mm_xor_si128(bx0, cx));
|
||||
} else {
|
||||
__m128i tmp = _mm_xor_si128(bx0, cx);
|
||||
mem_out[0] = _mm_cvtsi128_si64(tmp);
|
||||
|
||||
uint64_t vh = vgetq_lane_u64(tmp, 1);
|
||||
|
||||
uint8_t x = vh >> 24;
|
||||
static const uint16_t table = 0x7531;
|
||||
const uint8_t index = (((x >> (VARIANT == xmrig::VARIANT_XTL ? 4 : 3)) & 6) | (x & 1)) << 1;
|
||||
vh ^= ((table >> index) & 0x3) << 28;
|
||||
|
||||
mem_out[1] = vh;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
|
||||
inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
|
||||
{
|
||||
constexpr size_t MASK = xmrig::cn_select_mask<ALGO>();
|
||||
constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();
|
||||
constexpr size_t MEM = xmrig::cn_select_memory<ALGO>();
|
||||
constexpr xmrig::Variant BASE = xmrig::cn_base_variant<VARIANT>();
|
||||
|
||||
if (BASE == xmrig::VARIANT_1 && size < 43) {
|
||||
memset(output, 0, 32);
|
||||
return;
|
||||
}
|
||||
|
||||
xmrig::keccak(input, size, ctx[0]->state);
|
||||
|
||||
cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
|
||||
|
||||
const uint8_t* l0 = ctx[0]->memory;
|
||||
uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx[0]->state);
|
||||
|
||||
VARIANT1_INIT(0);
|
||||
VARIANT2_INIT(0);
|
||||
VARIANT4_RANDOM_MATH_INIT(0);
|
||||
|
||||
uint64_t al0 = h0[0] ^ h0[4];
|
||||
uint64_t ah0 = h0[1] ^ h0[5];
|
||||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
|
||||
__m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
|
||||
|
||||
uint64_t idx0 = al0;
|
||||
|
||||
for (size_t i = 0; i < ITERATIONS; i++) {
|
||||
__m128i cx;
|
||||
if (VARIANT == xmrig::VARIANT_TUBE || !SOFT_AES) {
|
||||
cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
|
||||
}
|
||||
|
||||
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
|
||||
if (VARIANT == xmrig::VARIANT_TUBE) {
|
||||
cx = aes_round_tweak_div(cx, ax0);
|
||||
}
|
||||
else if (SOFT_AES) {
|
||||
cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0);
|
||||
}
|
||||
else {
|
||||
cx = _mm_aesenc_si128(cx, ax0);
|
||||
}
|
||||
|
||||
if (BASE == xmrig::VARIANT_1 || BASE == xmrig::VARIANT_2) {
|
||||
cryptonight_monero_tweak<VARIANT, BASE>(l0, idx0 & MASK, ax0, bx0, bx1, cx);
|
||||
} else {
|
||||
_mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
|
||||
}
|
||||
|
||||
idx0 = _mm_cvtsi128_si64(cx);
|
||||
|
||||
uint64_t hi, lo, cl, ch;
|
||||
cl = ((uint64_t*) &l0[idx0 & MASK])[0];
|
||||
ch = ((uint64_t*) &l0[idx0 & MASK])[1];
|
||||
|
||||
if (BASE == xmrig::VARIANT_2) {
|
||||
if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) {
|
||||
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
|
||||
if (VARIANT == xmrig::VARIANT_4) {
|
||||
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
|
||||
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
|
||||
}
|
||||
} else {
|
||||
VARIANT2_INTEGER_MATH(0, cl, cx);
|
||||
}
|
||||
}
|
||||
|
||||
lo = __umul128(idx0, cl, &hi);
|
||||
|
||||
if (BASE == xmrig::VARIANT_2) {
|
||||
if (VARIANT == xmrig::VARIANT_4) {
|
||||
VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx0, bx1, cx, 0);
|
||||
} else {
|
||||
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0));
|
||||
}
|
||||
}
|
||||
|
||||
al0 += hi;
|
||||
ah0 += lo;
|
||||
|
||||
((uint64_t*)&l0[idx0 & MASK])[0] = al0;
|
||||
|
||||
if (BASE == xmrig::VARIANT_1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) {
|
||||
((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0 ^ al0;
|
||||
} else if (BASE == xmrig::VARIANT_1) {
|
||||
((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0;
|
||||
} else {
|
||||
((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
|
||||
}
|
||||
|
||||
al0 ^= cl;
|
||||
ah0 ^= ch;
|
||||
idx0 = al0;
|
||||
|
||||
if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
|
||||
const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t *>(&l0[idx0 & MASK]));
|
||||
const int64_t n = vgetq_lane_s64(x, 0);
|
||||
const int32_t d = vgetq_lane_s32(x, 2);
|
||||
const int64_t q = n / (d | 0x5);
|
||||
|
||||
((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
|
||||
|
||||
if (VARIANT == xmrig::VARIANT_XHV) {
|
||||
idx0 = (~d) ^ q;
|
||||
}
|
||||
else {
|
||||
idx0 = d ^ q;
|
||||
}
|
||||
}
|
||||
|
||||
if (BASE == xmrig::VARIANT_2) {
|
||||
bx1 = bx0;
|
||||
}
|
||||
|
||||
bx0 = cx;
|
||||
}
|
||||
|
||||
cn_implode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
|
||||
|
||||
xmrig::keccakf(h0, 24);
|
||||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
|
||||
}
|
||||
|
||||
|
||||
#ifndef XMRIG_NO_CN_GPU
|
||||
// Forward declaration of the CryptoNight-GPU inner loop used by
// cryptonight_single_hash_gpu() below. Only the declaration appears here;
// callers pass the hash state as `spad` and the scratchpad as `lpad`.
template<size_t ITER, uint32_t MASK>
|
||||
void cn_gpu_inner_arm(const uint8_t *spad, uint8_t *lpad);
|
||||
|
||||
|
||||
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
|
||||
inline void cryptonight_single_hash_gpu(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
|
||||
{
|
||||
constexpr size_t MASK = xmrig::CRYPTONIGHT_GPU_MASK;
|
||||
constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();
|
||||
constexpr size_t MEM = xmrig::cn_select_memory<ALGO>();
|
||||
|
||||
static_assert(MASK > 0 && ITERATIONS > 0 && MEM > 0, "unsupported algorithm/variant");
|
||||
|
||||
xmrig::keccak(input, size, ctx[0]->state);
|
||||
cn_explode_scratchpad_gpu<ALGO, MEM>(ctx[0]->state, ctx[0]->memory);
|
||||
|
||||
fesetround(FE_TONEAREST);
|
||||
|
||||
cn_gpu_inner_arm<ITERATIONS, MASK>(ctx[0]->state, ctx[0]->memory);
|
||||
|
||||
cn_implode_scratchpad<xmrig::CRYPTONIGHT_HEAVY, MEM, SOFT_AES>((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
|
||||
|
||||
xmrig::keccakf((uint64_t*) ctx[0]->state, 24);
|
||||
memcpy(output, ctx[0]->state, 32);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
|
||||
inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx, uint64_t height)
|
||||
{
|
||||
constexpr size_t MASK = xmrig::cn_select_mask<ALGO>();
|
||||
constexpr size_t ITERATIONS = xmrig::cn_select_iter<ALGO, VARIANT>();
|
||||
constexpr size_t MEM = xmrig::cn_select_memory<ALGO>();
|
||||
constexpr xmrig::Variant BASE = xmrig::cn_base_variant<VARIANT>();
|
||||
|
||||
if (BASE == xmrig::VARIANT_1 && size < 43) {
|
||||
memset(output, 0, 64);
|
||||
return;
|
||||
}
|
||||
|
||||
xmrig::keccak(input, size, ctx[0]->state);
|
||||
xmrig::keccak(input + size, size, ctx[1]->state);
|
||||
|
||||
const uint8_t* l0 = ctx[0]->memory;
|
||||
const uint8_t* l1 = ctx[1]->memory;
|
||||
uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx[0]->state);
|
||||
uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx[1]->state);
|
||||
|
||||
VARIANT1_INIT(0);
|
||||
VARIANT1_INIT(1);
|
||||
VARIANT2_INIT(0);
|
||||
VARIANT2_INIT(1);
|
||||
VARIANT4_RANDOM_MATH_INIT(0);
|
||||
VARIANT4_RANDOM_MATH_INIT(1);
|
||||
|
||||
cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
|
||||
cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
|
||||
|
||||
uint64_t al0 = h0[0] ^ h0[4];
|
||||
uint64_t al1 = h1[0] ^ h1[4];
|
||||
uint64_t ah0 = h0[1] ^ h0[5];
|
||||
uint64_t ah1 = h1[1] ^ h1[5];
|
||||
|
||||
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
|
||||
__m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
|
||||
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
|
||||
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
|
||||
|
||||
uint64_t idx0 = al0;
|
||||
uint64_t idx1 = al1;
|
||||
|
||||
for (size_t i = 0; i < ITERATIONS; i++) {
|
||||
__m128i cx0, cx1;
|
||||
if (VARIANT == xmrig::VARIANT_TUBE || !SOFT_AES) {
|
||||
cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
|
||||
cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
|
||||
}
|
||||
|
||||
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
|
||||
const __m128i ax1 = _mm_set_epi64x(ah1, al1);
|
||||
if (VARIANT == xmrig::VARIANT_TUBE) {
|
||||
cx0 = aes_round_tweak_div(cx0, ax0);
|
||||
cx1 = aes_round_tweak_div(cx1, ax1);
|
||||
}
|
||||
else if (SOFT_AES) {
|
||||
cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0);
|
||||
cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1);
|
||||
}
|
||||
else {
|
||||
cx0 = _mm_aesenc_si128(cx0, ax0);
|
||||
cx1 = _mm_aesenc_si128(cx1, ax1);
|
||||
}
|
||||
|
||||
if (BASE == xmrig::VARIANT_1 || (BASE == xmrig::VARIANT_2)) {
|
||||
cryptonight_monero_tweak<VARIANT, BASE>(l0, idx0 & MASK, ax0, bx00, bx01, cx0);
|
||||
cryptonight_monero_tweak<VARIANT, BASE>(l1, idx1 & MASK, ax1, bx10, bx11, cx1);
|
||||
} else {
|
||||
_mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
|
||||
_mm_store_si128((__m128i *) &l1[idx1 & MASK], _mm_xor_si128(bx10, cx1));
|
||||
}
|
||||
|
||||
idx0 = _mm_cvtsi128_si64(cx0);
|
||||
idx1 = _mm_cvtsi128_si64(cx1);
|
||||
|
||||
uint64_t hi, lo, cl, ch;
|
||||
cl = ((uint64_t*) &l0[idx0 & MASK])[0];
|
||||
ch = ((uint64_t*) &l0[idx0 & MASK])[1];
|
||||
|
||||
if (BASE == xmrig::VARIANT_2) {
|
||||
if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) {
|
||||
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
|
||||
if (VARIANT == xmrig::VARIANT_4) {
|
||||
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
|
||||
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
|
||||
}
|
||||
} else {
|
||||
VARIANT2_INTEGER_MATH(0, cl, cx0);
|
||||
}
|
||||
}
|
||||
|
||||
lo = __umul128(idx0, cl, &hi);
|
||||
|
||||
if (BASE == xmrig::VARIANT_2) {
|
||||
if (VARIANT == xmrig::VARIANT_4) {
|
||||
VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx00, bx01, cx0, 0);
|
||||
} else {
|
||||
VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0));
|
||||
}
|
||||
}
|
||||
|
||||
al0 += hi;
|
||||
ah0 += lo;
|
||||
|
||||
((uint64_t*)&l0[idx0 & MASK])[0] = al0;
|
||||
|
||||
if (BASE == xmrig::VARIANT_1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) {
|
||||
((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0 ^ al0;
|
||||
} else if (BASE == xmrig::VARIANT_1) {
|
||||
((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0;
|
||||
} else {
|
||||
((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
|
||||
}
|
||||
|
||||
al0 ^= cl;
|
||||
ah0 ^= ch;
|
||||
idx0 = al0;
|
||||
|
||||
if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
|
||||
const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t *>(&l0[idx0 & MASK]));
|
||||
const int64_t n = vgetq_lane_s64(x, 0);
|
||||
const int32_t d = vgetq_lane_s32(x, 2);
|
||||
const int64_t q = n / (d | 0x5);
|
||||
|
||||
((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
|
||||
|
||||
if (VARIANT == xmrig::VARIANT_XHV) {
|
||||
idx0 = (~d) ^ q;
|
||||
}
|
||||
else {
|
||||
idx0 = d ^ q;
|
||||
}
|
||||
}
|
||||
|
||||
cl = ((uint64_t*) &l1[idx1 & MASK])[0];
|
||||
ch = ((uint64_t*) &l1[idx1 & MASK])[1];
|
||||
|
||||
if (BASE == xmrig::VARIANT_2) {
|
||||
if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) {
|
||||
VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
|
||||
if (VARIANT == xmrig::VARIANT_4) {
|
||||
al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32);
|
||||
ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32);
|
||||
}
|
||||
} else {
|
||||
VARIANT2_INTEGER_MATH(1, cl, cx1);
|
||||
}
|
||||
}
|
||||
|
||||
lo = __umul128(idx1, cl, &hi);
|
||||
|
||||
if (BASE == xmrig::VARIANT_2) {
|
||||
if (VARIANT == xmrig::VARIANT_4) {
|
||||
VARIANT2_SHUFFLE(l1, idx1 & MASK, ax1, bx10, bx11, cx1, 0);
|
||||
} else {
|
||||
VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0));
|
||||
}
|
||||
}
|
||||
|
||||
al1 += hi;
|
||||
ah1 += lo;
|
||||
|
||||
((uint64_t*)&l1[idx1 & MASK])[0] = al1;
|
||||
|
||||
if (BASE == xmrig::VARIANT_1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) {
|
||||
((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1 ^ al1;
|
||||
} else if (BASE == xmrig::VARIANT_1) {
|
||||
((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1;
|
||||
} else {
|
||||
((uint64_t*)&l1[idx1 & MASK])[1] = ah1;
|
||||
}
|
||||
|
||||
al1 ^= cl;
|
||||
ah1 ^= ch;
|
||||
idx1 = al1;
|
||||
|
||||
if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
|
||||
const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t *>(&l1[idx1 & MASK]));
|
||||
const int64_t n = vgetq_lane_s64(x, 0);
|
||||
const int32_t d = vgetq_lane_s32(x, 2);
|
||||
const int64_t q = n / (d | 0x5);
|
||||
|
||||
((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
|
||||
|
||||
if (VARIANT == xmrig::VARIANT_XHV) {
|
||||
idx1 = (~d) ^ q;
|
||||
}
|
||||
else {
|
||||
idx1 = d ^ q;
|
||||
}
|
||||
}
|
||||
if (BASE == xmrig::VARIANT_2) {
|
||||
bx01 = bx00;
|
||||
bx11 = bx10;
|
||||
}
|
||||
bx00 = cx0;
|
||||
bx10 = cx1;
|
||||
}
|
||||
|
||||
cn_implode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
|
||||
cn_implode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
|
||||
|
||||
xmrig::keccakf(h0, 24);
|
||||
xmrig::keccakf(h1, 24);
|
||||
|
||||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
|
||||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
|
||||
}
|
||||
|
||||
|
||||
// Three-way hash entry point. The body is empty in this header: a call returns
// immediately without reading `input` or writing `output`.
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
|
||||
inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx, uint64_t height)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
// Four-way hash entry point. The body is empty in this header: a call returns
// immediately without reading `input` or writing `output`.
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
|
||||
inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx, uint64_t height)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
// Five-way hash entry point. The body is empty in this header: a call returns
// immediately without reading `input` or writing `output`.
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
|
||||
inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx, uint64_t height)
|
||||
{
|
||||
}
|
||||
|
||||
#endif /* __CRYPTONIGHT_ARM_H__ */
|
||||
251
src/crypto/cn/CryptoNight_constants.h
Normal file
251
src/crypto/cn/CryptoNight_constants.h
Normal file
@@ -0,0 +1,251 @@
|
||||
/* XMRig
|
||||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
* Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
||||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_CRYPTONIGHT_CONSTANTS_H
|
||||
#define XMRIG_CRYPTONIGHT_CONSTANTS_H
|
||||
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
#include "common/xmrig.h"
|
||||
|
||||
|
||||
namespace xmrig
|
||||
{
|
||||
|
||||
// Per-algorithm parameters returned by the cn_select_* helpers below:
//   *_MEMORY - memory size (presumably bytes; confirm against the allocator)
//   *_MASK   - mask applied to scratchpad indices
//   *_ITER   - main-loop iteration count

// CryptoNight (2 MiB family) and its variant-specific iteration counts.
constexpr size_t   CRYPTONIGHT_MEMORY       = 2 * 1024 * 1024;
constexpr uint32_t CRYPTONIGHT_MASK         = 0x1FFFF0;
constexpr uint32_t CRYPTONIGHT_ITER         = 0x80000;
constexpr uint32_t CRYPTONIGHT_HALF_ITER    = 0x40000;
constexpr uint32_t CRYPTONIGHT_XAO_ITER     = 0x100000;
constexpr uint32_t CRYPTONIGHT_DOUBLE_ITER  = 0x100000;
constexpr uint32_t CRYPTONIGHT_WALTZ_ITER   = 0x60000;
constexpr uint32_t CRYPTONIGHT_ZLS_ITER     = 0x60000;

// CryptoNight-GPU.
constexpr uint32_t CRYPTONIGHT_GPU_ITER     = 0xC000;
constexpr uint32_t CRYPTONIGHT_GPU_MASK     = 0x1FFFC0;

// CryptoNight-Lite.
constexpr size_t   CRYPTONIGHT_LITE_MEMORY  = 1 * 1024 * 1024;
constexpr uint32_t CRYPTONIGHT_LITE_MASK    = 0xFFFF0;
constexpr uint32_t CRYPTONIGHT_LITE_ITER    = 0x40000;

// CryptoNight-Heavy.
constexpr size_t   CRYPTONIGHT_HEAVY_MEMORY = 4 * 1024 * 1024;
constexpr uint32_t CRYPTONIGHT_HEAVY_MASK   = 0x3FFFF0;
constexpr uint32_t CRYPTONIGHT_HEAVY_ITER   = 0x40000;

// CryptoNight-Pico.
constexpr size_t   CRYPTONIGHT_PICO_MEMORY  = 256 * 1024;
constexpr uint32_t CRYPTONIGHT_PICO_MASK    = 0x1FFF0;
constexpr uint32_t CRYPTONIGHT_PICO_ITER    = 0x40000;
constexpr uint32_t CRYPTONIGHT_TRTL_ITER    = 0x10000;
|
||||
|
||||
|
||||
template<Algo ALGO> inline constexpr size_t cn_select_memory() { return 0; }
|
||||
template<> inline constexpr size_t cn_select_memory<CRYPTONIGHT>() { return CRYPTONIGHT_MEMORY; }
|
||||
template<> inline constexpr size_t cn_select_memory<CRYPTONIGHT_LITE>() { return CRYPTONIGHT_LITE_MEMORY; }
|
||||
template<> inline constexpr size_t cn_select_memory<CRYPTONIGHT_HEAVY>() { return CRYPTONIGHT_HEAVY_MEMORY; }
|
||||
template<> inline constexpr size_t cn_select_memory<CRYPTONIGHT_PICO>() { return CRYPTONIGHT_PICO_MEMORY; }
|
||||
|
||||
|
||||
// Run-time counterpart of cn_select_memory<ALGO>(): maps an algorithm to its
// memory size, or 0 when the algorithm is not recognised.
inline size_t cn_select_memory(Algo algorithm)
{
    if (algorithm == CRYPTONIGHT) {
        return CRYPTONIGHT_MEMORY;
    }

    if (algorithm == CRYPTONIGHT_LITE) {
        return CRYPTONIGHT_LITE_MEMORY;
    }

    if (algorithm == CRYPTONIGHT_HEAVY) {
        return CRYPTONIGHT_HEAVY_MEMORY;
    }

    if (algorithm == CRYPTONIGHT_PICO) {
        return CRYPTONIGHT_PICO_MEMORY;
    }

    return 0;
}
|
||||
|
||||
|
||||
template<Algo ALGO> inline constexpr uint32_t cn_select_mask() { return 0; }
|
||||
template<> inline constexpr uint32_t cn_select_mask<CRYPTONIGHT>() { return CRYPTONIGHT_MASK; }
|
||||
template<> inline constexpr uint32_t cn_select_mask<CRYPTONIGHT_LITE>() { return CRYPTONIGHT_LITE_MASK; }
|
||||
template<> inline constexpr uint32_t cn_select_mask<CRYPTONIGHT_HEAVY>() { return CRYPTONIGHT_HEAVY_MASK; }
|
||||
template<> inline constexpr uint32_t cn_select_mask<CRYPTONIGHT_PICO>() { return CRYPTONIGHT_PICO_MASK; }
|
||||
|
||||
|
||||
// Run-time counterpart of cn_select_mask<ALGO>(): maps an algorithm to its
// scratchpad index mask, or 0 when the algorithm is not recognised.
inline uint32_t cn_select_mask(Algo algorithm)
{
    if (algorithm == CRYPTONIGHT) {
        return CRYPTONIGHT_MASK;
    }

    if (algorithm == CRYPTONIGHT_LITE) {
        return CRYPTONIGHT_LITE_MASK;
    }

    if (algorithm == CRYPTONIGHT_HEAVY) {
        return CRYPTONIGHT_HEAVY_MASK;
    }

    if (algorithm == CRYPTONIGHT_PICO) {
        return CRYPTONIGHT_PICO_MASK;
    }

    return 0;
}
|
||||
|
||||
|
||||
template<Algo ALGO, Variant variant> inline constexpr uint32_t cn_select_iter() { return 0; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_0>() { return CRYPTONIGHT_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_1>() { return CRYPTONIGHT_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_2>() { return CRYPTONIGHT_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_WOW>() { return CRYPTONIGHT_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_4>() { return CRYPTONIGHT_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_XTL>() { return CRYPTONIGHT_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_HALF>() { return CRYPTONIGHT_HALF_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_MSR>() { return CRYPTONIGHT_HALF_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_XAO>() { return CRYPTONIGHT_XAO_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_RTO>() { return CRYPTONIGHT_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_GPU>() { return CRYPTONIGHT_GPU_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_RWZ>() { return CRYPTONIGHT_WALTZ_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_ZLS>() { return CRYPTONIGHT_ZLS_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_DOUBLE>() { return CRYPTONIGHT_DOUBLE_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT_LITE, VARIANT_0>() { return CRYPTONIGHT_LITE_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT_LITE, VARIANT_1>() { return CRYPTONIGHT_LITE_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT_HEAVY, VARIANT_0>() { return CRYPTONIGHT_HEAVY_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT_HEAVY, VARIANT_XHV>() { return CRYPTONIGHT_HEAVY_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT_HEAVY, VARIANT_TUBE>() { return CRYPTONIGHT_HEAVY_ITER; }
|
||||
template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT_PICO, VARIANT_TRTL>() { return CRYPTONIGHT_TRTL_ITER; }
|
||||
|
||||
|
||||
// Run-time counterpart of cn_select_iter<ALGO, variant>().
//
// Variants with their own iteration count are resolved first; everything else
// falls back to the default count of the algorithm. Returns 0 when neither the
// variant nor the algorithm is recognised.
//
// The variant table mirrors the compile-time specializations: VARIANT_XAO uses
// CRYPTONIGHT_XAO_ITER, while VARIANT_RTO has no special count and resolves to
// the algorithm default (CRYPTONIGHT_ITER for CRYPTONIGHT). The table
// previously listed VARIANT_RTO in place of VARIANT_XAO, so both variants
// returned a value that disagreed with the template version.
inline uint32_t cn_select_iter(Algo algorithm, Variant variant)
{
    switch (variant) {
    case VARIANT_MSR:
    case VARIANT_HALF:
        return CRYPTONIGHT_HALF_ITER;

    case VARIANT_GPU:
        return CRYPTONIGHT_GPU_ITER;

    case VARIANT_XAO:
        return CRYPTONIGHT_XAO_ITER;

    case VARIANT_DOUBLE:
        return CRYPTONIGHT_DOUBLE_ITER;

    case VARIANT_TRTL:
        return CRYPTONIGHT_TRTL_ITER;

    case VARIANT_RWZ:
        return CRYPTONIGHT_WALTZ_ITER;

    case VARIANT_ZLS:
        return CRYPTONIGHT_ZLS_ITER;

    default:
        break;
    }

    switch (algorithm) {
    case CRYPTONIGHT:
        return CRYPTONIGHT_ITER;

    case CRYPTONIGHT_LITE:
        return CRYPTONIGHT_LITE_ITER;

    case CRYPTONIGHT_HEAVY:
        return CRYPTONIGHT_HEAVY_ITER;

    case CRYPTONIGHT_PICO:
        return CRYPTONIGHT_TRTL_ITER;

    default:
        break;
    }

    return 0;
}
|
||||
|
||||
|
||||
// Compile-time mapping from a variant to the base variant whose code path it
// shares (VARIANT_0, VARIANT_1, VARIANT_2 or VARIANT_GPU). Variants that are
// not listed resolve to VARIANT_0.
template<Variant variant> inline constexpr Variant cn_base_variant()
{
    return (variant == VARIANT_1   || variant == VARIANT_TUBE || variant == VARIANT_XTL ||
            variant == VARIANT_MSR || variant == VARIANT_RTO)                              ? VARIANT_1
         : (variant == VARIANT_2   || variant == VARIANT_HALF || variant == VARIANT_TRTL ||
            variant == VARIANT_WOW || variant == VARIANT_4    || variant == VARIANT_RWZ  ||
            variant == VARIANT_ZLS || variant == VARIANT_DOUBLE)                           ? VARIANT_2
         : variant == VARIANT_GPU                                                          ? VARIANT_GPU
         : VARIANT_0;   // VARIANT_0, VARIANT_XHV, VARIANT_XAO and anything unlisted
}
|
||||
|
||||
|
||||
// Run-time mapping from a variant to its base variant. Unlike the template
// version, any variant not named here resolves to VARIANT_2.
inline Variant cn_base_variant(Variant variant)
{
    if (variant == VARIANT_0 || variant == VARIANT_XHV || variant == VARIANT_XAO) {
        return VARIANT_0;
    }

    if (variant == VARIANT_1   || variant == VARIANT_TUBE || variant == VARIANT_XTL ||
        variant == VARIANT_MSR || variant == VARIANT_RTO) {
        return VARIANT_1;
    }

    if (variant == VARIANT_GPU) {
        return VARIANT_GPU;
    }

    return VARIANT_2;
}
|
||||
|
||||
|
||||
template<Variant variant> inline constexpr bool cn_is_cryptonight_r() { return false; }
|
||||
template<> inline constexpr bool cn_is_cryptonight_r<VARIANT_WOW>() { return true; }
|
||||
template<> inline constexpr bool cn_is_cryptonight_r<VARIANT_4>() { return true; }
|
||||
|
||||
} /* namespace xmrig */
|
||||
|
||||
|
||||
#endif /* XMRIG_CRYPTONIGHT_CONSTANTS_H */
|
||||
206
src/crypto/cn/CryptoNight_monero.h
Normal file
206
src/crypto/cn/CryptoNight_monero.h
Normal file
@@ -0,0 +1,206 @@
|
||||
/* XMRig
|
||||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
||||
* Copyright 2018 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_CRYPTONIGHT_MONERO_H
|
||||
#define XMRIG_CRYPTONIGHT_MONERO_H
|
||||
|
||||
#include <fenv.h>
|
||||
#include <math.h>
|
||||
|
||||
// VARIANT ALTERATIONS
|
||||
// VARIANT1_INIT(part): declares tweak1_2_<part>. For VARIANT_1 bases it is the
// 8 bytes at input + 35 + part * size XORed with the 64-bit word at index 24
// of ctx[part]->state; otherwise it stays 0.
// The ARM flavour copies the input bytes with memcpy instead of dereferencing
// a casted pointer.
#ifdef XMRIG_ARM
#   define VARIANT1_INIT(part)                                                                   \
    uint64_t tweak1_2_##part = 0;                                                                \
    if (BASE == xmrig::VARIANT_1) {                                                              \
        memcpy(&tweak1_2_##part, input + 35 + part * size, sizeof tweak1_2_##part);              \
        tweak1_2_##part ^= reinterpret_cast<const uint64_t*>(ctx[part]->state)[24];              \
    }
#else
#   define VARIANT1_INIT(part)                                                                   \
    uint64_t tweak1_2_##part = 0;                                                                \
    if (BASE == xmrig::VARIANT_1) {                                                              \
        tweak1_2_##part = (*reinterpret_cast<const uint64_t*>(input + 35 + part * size) ^        \
                           reinterpret_cast<const uint64_t*>(ctx[part]->state)[24]);             \
    }
#endif
|
||||
|
||||
// VARIANT1_1(p): for VARIANT_1 bases, rewrites byte 11 of the block at `p`.
// Bits 0, 4 and 5 of that byte form an even shift (0..14) into the constant
// 0x75310; the two bits selected by the 0x30 mask are XORed back into it.
#define VARIANT1_1(p)                                                                            \
    if (BASE == xmrig::VARIANT_1) {                                                              \
        static const uint32_t v1_lut = 0x75310;                                                  \
        const uint8_t v1_byte  = reinterpret_cast<const uint8_t*>(p)[11];                        \
        const uint8_t v1_shift = (((v1_byte >> 3) & 6) | (v1_byte & 1)) << 1;                    \
        ((uint8_t*)(p))[11] = v1_byte ^ ((v1_lut >> v1_shift) & 0x30);                           \
    }
|
||||
|
||||
// VARIANT1_2(p, part): for VARIANT_1 bases, XORs tweak1_2_<part> (declared by
// VARIANT1_INIT) into `p`; a no-op for every other base.
#define VARIANT1_2(p, part) if (BASE == xmrig::VARIANT_1) { (p) ^= tweak1_2_##part; }
|
||||
|
||||
|
||||
#ifndef XMRIG_ARM
|
||||
// VARIANT2_INIT(part), non-ARM flavour: seeds the per-lane division and square
// root accumulators from state words 12 and 13 (h<part>[12], h<part>[13]),
// kept in the low 64 bits of two __m128i values.
#   define VARIANT2_INIT(part)                                                                   \
    __m128i division_result_xmm_##part = _mm_cvtsi64_si128(h##part[12]);                         \
    __m128i sqrt_result_xmm_##part     = _mm_cvtsi64_si128(h##part[13]);
|
||||
|
||||
// VARIANT2_SET_ROUNDING_MODE(): for VARIANT_2 bases, switches the floating
// point rounding mode to round-down; does nothing for other bases.
// MSVC goes through _control87, every other compiler through fesetround.
#ifndef _MSC_VER
#   define VARIANT2_SET_ROUNDING_MODE() if (BASE == xmrig::VARIANT_2) { fesetround(FE_DOWNWARD); }
#else
#   define VARIANT2_SET_ROUNDING_MODE() if (BASE == xmrig::VARIANT_2) { _control87(RC_DOWN, MCW_RC); }
#endif
|
||||
|
||||
// VARIANT2_INTEGER_MATH(part, cl, cx), non-ARM flavour.
// 1. Mixes the previous division / sqrt results of this lane into `cl`.
// 2. Divides the high 64 bits of `cx` by an odd 32-bit divisor derived from
//    the low 64 bits of `cx` and the previous sqrt result (quotient in the low
//    half, remainder in the high half).
// 3. Stores the new division result and hands (low(cx) + division result) to
//    int_sqrt_v2 for the new sqrt result.
#   define VARIANT2_INTEGER_MATH(part, cl, cx)                                                                    \
    do {                                                                                                          \
        const uint64_t v2_sqrt = static_cast<uint64_t>(_mm_cvtsi128_si64(sqrt_result_xmm_##part));               \
        const uint64_t v2_lo   = _mm_cvtsi128_si64(cx);                                                           \
        cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm_##part)) ^ (v2_sqrt << 32);            \
        const uint32_t v2_div  = static_cast<uint32_t>(v2_lo + (v2_sqrt << 1)) | 0x80000001UL;                    \
        const uint64_t v2_hi   = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8));                                        \
        const uint64_t v2_res  = static_cast<uint32_t>(v2_hi / v2_div) + ((v2_hi % v2_div) << 32);                \
        division_result_xmm_##part = _mm_cvtsi64_si128(static_cast<int64_t>(v2_res));                             \
        sqrt_result_xmm_##part     = int_sqrt_v2(v2_lo + v2_res);                                                 \
    } while (0)
|
||||
|
||||
// VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c, reverse), non-ARM flavour.
// Loads the three 16-byte neighbours of `offset` (offset ^ 0x10 / 0x20 / 0x30;
// `reverse` swaps which of the 0x10 / 0x30 neighbours is read first), adds
// _b1, _b and _a to them and writes them back rotated by one position.
// For VARIANT_4 all three loaded chunks are additionally XORed into `_c`.
#   define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c, reverse)                                                    \
    do {                                                                                                                   \
        const __m128i v2_first  = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ (reverse ? 0x30 : 0x10))));        \
        const __m128i v2_second = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)));                            \
        const __m128i v2_third  = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ (reverse ? 0x10 : 0x30))));        \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(v2_third,  _b1));                      \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(v2_first,  _b));                       \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(v2_second, _a));                       \
        if (VARIANT == xmrig::VARIANT_4) {                                                                                 \
            _c = _mm_xor_si128(_mm_xor_si128(_c, v2_third), _mm_xor_si128(v2_first, v2_second));                          \
        }                                                                                                                  \
    } while (0)
|
||||
|
||||
// VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo, reverse), non-ARM flavour.
// Like VARIANT2_SHUFFLE, but the chunk at offset ^ 0x10 is first XORed with
// (hi, lo), and hi / lo are then XORed with the two 64-bit words at
// offset ^ 0x20. `reverse` selects which of the first / third chunk is written
// to the 0x10 and 0x20 neighbours.
#   define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo, reverse)                                               \
    do {                                                                                                                   \
        const __m128i v2_first  = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \
        const __m128i v2_second = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)));                            \
        hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0];                                                            \
        lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1];                                                            \
        const __m128i v2_third  = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)));                            \
        if (!(reverse)) {                                                                                                  \
            _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(v2_third, _b1));                   \
            _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(v2_first, _b));                    \
        } else {                                                                                                           \
            _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(v2_first, _b1));                   \
            _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(v2_third, _b));                    \
        }                                                                                                                  \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(v2_second, _a));                       \
    } while (0)
|
||||
|
||||
#else
|
||||
// VARIANT2_INIT(part), ARM flavour: seeds the per-lane division and square
// root accumulators from state words 12 and 13 as plain 64-bit integers.
#   define VARIANT2_INIT(part)                                                                   \
    uint64_t division_result_##part = h##part[12];                                               \
    uint64_t sqrt_result_##part     = h##part[13];
|
||||
|
||||
// VARIANT2_INTEGER_MATH(part, cl, cx), ARM flavour.
// Same division step as the non-ARM macro, but the square root is computed
// inline: a double-precision sqrt gives a first estimate, which the last four
// statements adjust by at most one in either direction using integer math.
#   define VARIANT2_INTEGER_MATH(part, cl, cx)                                                                         \
    do {                                                                                                               \
        const uint64_t v2_lo  = _mm_cvtsi128_si64(cx);                                                                 \
        cl ^= division_result_##part ^ (sqrt_result_##part << 32);                                                     \
        const uint32_t v2_div = static_cast<uint32_t>(v2_lo + (sqrt_result_##part << 1)) | 0x80000001UL;               \
        const uint64_t v2_hi  = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8));                                              \
        division_result_##part = static_cast<uint32_t>(v2_hi / v2_div) + ((v2_hi % v2_div) << 32);                     \
        const uint64_t v2_in  = v2_lo + division_result_##part;                                                        \
        sqrt_result_##part = sqrt(v2_in + 18446744073709551616.0) * 2.0 - 8589934592.0;                                \
        const uint64_t v2_half = sqrt_result_##part >> 1;                                                              \
        const uint64_t v2_odd  = sqrt_result_##part & 1;                                                               \
        const uint64_t v2_sq   = (uint64_t)(v2_half) * (v2_half + v2_odd) + (sqrt_result_##part << 32);                \
        sqrt_result_##part += ((v2_sq + v2_odd > v2_in) ? -1 : 0) + ((v2_sq + (1ULL << 32) < v2_in - v2_half) ? 1 : 0); \
    } while (0)
|
||||
|
||||
// VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c, reverse), ARM (NEON) flavour.
// Loads the three 16-byte neighbours of `offset`, adds _b1, _b and _a to them
// and writes them back rotated by one position; for VARIANT_4 the three loaded
// chunks are additionally XORed into `_c`.
#   define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c, reverse)                                                    \
    do {                                                                                                                   \
        const uint64x2_t v2_first  = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ (reverse ? 0x30 : 0x10))));          \
        const uint64x2_t v2_second = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)));                              \
        const uint64x2_t v2_third  = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ (reverse ? 0x10 : 0x30))));          \
        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(v2_third,  vreinterpretq_u64_u8(_b1)));          \
        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(v2_first,  vreinterpretq_u64_u8(_b)));           \
        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(v2_second, vreinterpretq_u64_u8(_a)));           \
        if (VARIANT == xmrig::VARIANT_4) {                                                                                 \
            _c = veorq_u64(veorq_u64(_c, v2_third), veorq_u64(v2_first, v2_second));                                      \
        }                                                                                                                  \
    } while (0)
|
||||
|
||||
// VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo, reverse), ARM (NEON) flavour.
// The chunk at offset ^ 0x10 is XORed with (hi, lo) before use, and hi / lo are
// then XORed with the two 64-bit words at offset ^ 0x20. `reverse` selects
// which of the first / third chunk is written to the 0x10 and 0x20 neighbours.
#   define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo, reverse)                                               \
    do {                                                                                                                   \
        const uint64x2_t v2_first  = veorq_u64(vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10))), vcombine_u64(vcreate_u64(hi), vcreate_u64(lo))); \
        const uint64x2_t v2_second = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)));                              \
        hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0];                                                            \
        lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1];                                                            \
        const uint64x2_t v2_third  = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)));                              \
        if (!(reverse)) {                                                                                                  \
            vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(v2_third, vreinterpretq_u64_u8(_b1)));       \
            vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(v2_first, vreinterpretq_u64_u8(_b)));        \
        } else {                                                                                                           \
            vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(v2_first, vreinterpretq_u64_u8(_b1)));       \
            vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(v2_third, vreinterpretq_u64_u8(_b)));        \
        }                                                                                                                  \
        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(v2_second, vreinterpretq_u64_u8(_a)));           \
    } while (0)
|
||||
#endif
|
||||
|
||||
// Identity byte-order helpers: both macros expand to their argument unchanged.
// The argument is parenthesized so that an expression argument keeps its
// grouping when the macro is used inside a larger expression, e.g.
// SWAP32LE(a + b) * 2 or SWAP64LE(c ? x : y) + 1.
#define SWAP32LE(x) (x)
#define SWAP64LE(x) (x)

// Adapts the (data, length, hash) argument order to blake256_hash(hash, data, length).
#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length))
|
||||
|
||||
// NOINLINE: compiler-specific "never inline this function" marker. Expands to
// nothing on compilers that are neither GNU-compatible nor MSVC. An earlier
// definition of NOINLINE is respected.
// The MSVC branch uses defined(_MSC_VER) so the check does not depend on an
// undefined identifier evaluating to 0 in #elif.
#ifndef NOINLINE
#   if defined(__GNUC__)
#       define NOINLINE __attribute__ ((noinline))
#   elif defined(_MSC_VER)
#       define NOINLINE __declspec(noinline)
#   else
#       define NOINLINE
#   endif
#endif
|
||||
|
||||
#include "common/xmrig.h"
|
||||
#include "crypto/cn/r/variant4_random_math.h"
|
||||
|
||||
// VARIANT4_RANDOM_MATH_INIT(part): declares the per-lane register file
// r<part>[9] and instruction buffer code<part>[256]. For VARIANT_WOW and
// VARIANT_4 the first four registers are seeded from the low / high halves of
// state words 12 and 13. v4_random_math_init<VARIANT> is invoked for every
// variant, with `height` taken from the enclosing function.
#define VARIANT4_RANDOM_MATH_INIT(part)                                                          \
    uint32_t r##part[9];                                                                         \
    struct V4_Instruction code##part[256];                                                       \
    if ((VARIANT == xmrig::VARIANT_4) || (VARIANT == xmrig::VARIANT_WOW)) {                      \
        r##part[0] = (uint32_t)(h##part[12]);                                                    \
        r##part[1] = (uint32_t)(h##part[12] >> 32);                                              \
        r##part[2] = (uint32_t)(h##part[13]);                                                    \
        r##part[3] = (uint32_t)(h##part[13] >> 32);                                              \
    }                                                                                            \
    v4_random_math_init<VARIANT>(code##part, height);
|
||||
|
||||
// VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1): active only for
// VARIANT_WOW and VARIANT_4. Mixes r[0..3] into `cl`, loads r[4..8] from the
// low 32 bits of al, ah, bx0, bx1 and from the low 32 bits of the upper half
// of bx1, then runs the generated program code<part> over r<part>.
#define VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1)                                         \
    if ((VARIANT == xmrig::VARIANT_4) || (VARIANT == xmrig::VARIANT_WOW)) {                      \
        cl ^= (r##part[0] + r##part[1]) | ((uint64_t)(r##part[2] + r##part[3]) << 32);           \
        r##part[4] = static_cast<uint32_t>(al);                                                  \
        r##part[5] = static_cast<uint32_t>(ah);                                                  \
        r##part[6] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx0));                              \
        r##part[7] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx1));                              \
        r##part[8] = static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8)));           \
        v4_random_math(code##part, r##part);                                                     \
    }
|
||||
|
||||
#endif /* XMRIG_CRYPTONIGHT_MONERO_H */
|
||||
388
src/crypto/cn/CryptoNight_test.h
Normal file
388
src/crypto/cn/CryptoNight_test.h
Normal file
@@ -0,0 +1,388 @@
|
||||
/* XMRig
|
||||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
||||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_CRYPTONIGHT_TEST_H
|
||||
#define XMRIG_CRYPTONIGHT_TEST_H
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
const static uint8_t test_input[380] = {
|
||||
0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00,
|
||||
0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B,
|
||||
0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62,
|
||||
0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92,
|
||||
0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01,
|
||||
0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19,
|
||||
0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9,
|
||||
0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F,
|
||||
0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46,
|
||||
0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02,
|
||||
0x07, 0x07, 0xB4, 0x87, 0xD0, 0xD6, 0x05, 0x26, 0xE0, 0xC6, 0xDD, 0x9B, 0xC7, 0x18, 0xC3, 0xCF,
|
||||
0x52, 0x04, 0xBD, 0x4F, 0x9B, 0x27, 0xF6, 0x73, 0xB9, 0x3F, 0xEF, 0x7B, 0xB2, 0xF7, 0x2B, 0xBB,
|
||||
0x3F, 0x3E, 0x9C, 0x3E, 0x9D, 0x33, 0x1E, 0xDE, 0xAD, 0xBE, 0xEF, 0x4E, 0x00, 0x91, 0x81, 0x29,
|
||||
0x74, 0xB2, 0x70, 0xE7, 0x6D, 0xD2, 0x2A, 0x5F, 0x52, 0x04, 0x93, 0xE6, 0x18, 0x89, 0x40, 0xD8,
|
||||
0xC6, 0xE3, 0x90, 0x6E, 0xAA, 0x6A, 0xB7, 0xE2, 0x08, 0x7E, 0x78, 0x0E,
|
||||
0x01, 0x00, 0xEE, 0xB2, 0xD1, 0xD6, 0x05, 0xFF, 0x27, 0x7F, 0x26, 0xDB, 0xAA, 0xB2, 0xC9, 0x26,
|
||||
0x30, 0xC6, 0xCF, 0x11, 0x64, 0xEA, 0x6C, 0x8A, 0xE0, 0x98, 0x01, 0xF8, 0x75, 0x4B, 0x49, 0xAF,
|
||||
0x79, 0x70, 0xAE, 0xEE, 0xA7, 0x62, 0x2C, 0x00, 0x00, 0x00, 0x00, 0x47, 0x8C, 0x63, 0xE7, 0xD8,
|
||||
0x40, 0x02, 0x3C, 0xDA, 0xEA, 0x92, 0x52, 0x53, 0xAC, 0xFD, 0xC7, 0x8A, 0x4C, 0x31, 0xB2, 0xF2,
|
||||
0xEC, 0x72, 0x7B, 0xFF, 0xCE, 0xC0, 0xE7, 0x12, 0xD4, 0xE9, 0x2A, 0x01,
|
||||
0x07, 0x07, 0xA9, 0xB7, 0xD1, 0xD6, 0x05, 0x3F, 0x0D, 0x5E, 0xFD, 0xC7, 0x03, 0xFC, 0xFC, 0xD2,
|
||||
0xCE, 0xBC, 0x44, 0xD8, 0xAB, 0x44, 0xA6, 0xA0, 0x3A, 0xE4, 0x4D, 0x8F, 0x15, 0xAF, 0x62, 0x17,
|
||||
0xD1, 0xE0, 0x92, 0x85, 0xE4, 0x73, 0xF9, 0x00, 0x00, 0x00, 0xA0, 0xFC, 0x09, 0xDE, 0xAB, 0xF5,
|
||||
0x8B, 0x6F, 0x1D, 0xCA, 0xA8, 0xBA, 0xAC, 0x74, 0xDD, 0x74, 0x19, 0xD5, 0xD6, 0x10, 0xEC, 0x38,
|
||||
0xCF, 0x50, 0x29, 0x6A, 0x07, 0x0B, 0x93, 0x8F, 0x8F, 0xA8, 0x10, 0x04
|
||||
};
|
||||
|
||||
|
||||
// One input record of the cn_r_test_input table. The table has ten entries,
// which appears to pair with the ten 32-byte rows of test_output_wow and
// test_output_r — TODO confirm against the self-test code.
// NOTE(review): size_t is used here, but the only include visible in this
// header is <stdint.h>; confirm <stddef.h> is pulled in before this point.
struct cn_r_test_input_data
|
||||
{
|
||||
uint64_t height; // presumably the block height supplied to the hash for this record — TODO confirm
|
||||
size_t size; // count of bytes of `data` that the table entries list explicitly
|
||||
uint8_t data[64]; // input bytes; entries shorter than 64 bytes leave the tail zero-initialized
|
||||
};
|
||||
|
||||
|
||||
const static cn_r_test_input_data cn_r_test_input[] = {
|
||||
{ 1806260, 44, { 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74 } },
|
||||
{ 1806261, 50, { 0x4c, 0x6f, 0x72, 0x65, 0x6d, 0x20, 0x69, 0x70, 0x73, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x73, 0x69, 0x74, 0x20, 0x61, 0x6d, 0x65, 0x74, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x63, 0x74, 0x65, 0x74, 0x75, 0x72, 0x20, 0x61, 0x64, 0x69, 0x70, 0x69, 0x73, 0x63, 0x69, 0x6e, 0x67 } },
|
||||
{ 1806262, 48, { 0x65, 0x6c, 0x69, 0x74, 0x2c, 0x20, 0x73, 0x65, 0x64, 0x20, 0x64, 0x6f, 0x20, 0x65, 0x69, 0x75, 0x73, 0x6d, 0x6f, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x63, 0x69, 0x64, 0x69, 0x64, 0x75, 0x6e, 0x74, 0x20, 0x75, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x65 } },
|
||||
{ 1806263, 48, { 0x65, 0x74, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x6d, 0x61, 0x67, 0x6e, 0x61, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x61, 0x2e, 0x20, 0x55, 0x74, 0x20, 0x65, 0x6e, 0x69, 0x6d, 0x20, 0x61, 0x64, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x20, 0x76, 0x65, 0x6e, 0x69, 0x61, 0x6d, 0x2c } },
|
||||
{ 1806264, 46, { 0x71, 0x75, 0x69, 0x73, 0x20, 0x6e, 0x6f, 0x73, 0x74, 0x72, 0x75, 0x64, 0x20, 0x65, 0x78, 0x65, 0x72, 0x63, 0x69, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x6c, 0x6c, 0x61, 0x6d, 0x63, 0x6f, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x69, 0x73, 0x20, 0x6e, 0x69, 0x73, 0x69 } },
|
||||
{ 1806265, 45, { 0x75, 0x74, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x69, 0x70, 0x20, 0x65, 0x78, 0x20, 0x65, 0x61, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x64, 0x6f, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x71, 0x75, 0x61, 0x74, 0x2e, 0x20, 0x44, 0x75, 0x69, 0x73, 0x20, 0x61, 0x75, 0x74, 0x65 } },
|
||||
{ 1806266, 47, { 0x69, 0x72, 0x75, 0x72, 0x65, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x20, 0x72, 0x65, 0x70, 0x72, 0x65, 0x68, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x76, 0x6f, 0x6c, 0x75, 0x70, 0x74, 0x61, 0x74, 0x65, 0x20, 0x76, 0x65, 0x6c, 0x69, 0x74 } },
|
||||
{ 1806267, 44, { 0x65, 0x73, 0x73, 0x65, 0x20, 0x63, 0x69, 0x6c, 0x6c, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x65, 0x75, 0x20, 0x66, 0x75, 0x67, 0x69, 0x61, 0x74, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x20, 0x70, 0x61, 0x72, 0x69, 0x61, 0x74, 0x75, 0x72, 0x2e } },
|
||||
{ 1806268, 47, { 0x45, 0x78, 0x63, 0x65, 0x70, 0x74, 0x65, 0x75, 0x72, 0x20, 0x73, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x63, 0x63, 0x61, 0x65, 0x63, 0x61, 0x74, 0x20, 0x63, 0x75, 0x70, 0x69, 0x64, 0x61, 0x74, 0x61, 0x74, 0x20, 0x6e, 0x6f, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x69, 0x64, 0x65, 0x6e, 0x74, 0x2c } },
|
||||
{ 1806269, 62, { 0x73, 0x75, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x63, 0x75, 0x6c, 0x70, 0x61, 0x20, 0x71, 0x75, 0x69, 0x20, 0x6f, 0x66, 0x66, 0x69, 0x63, 0x69, 0x61, 0x20, 0x64, 0x65, 0x73, 0x65, 0x72, 0x75, 0x6e, 0x74, 0x20, 0x6d, 0x6f, 0x6c, 0x6c, 0x69, 0x74, 0x20, 0x61, 0x6e, 0x69, 0x6d, 0x20, 0x69, 0x64, 0x20, 0x65, 0x73, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x75, 0x6d, 0x2e } },
|
||||
};
|
||||
|
||||
|
||||
// "cn/wow"
|
||||
const static uint8_t test_output_wow[] = {
|
||||
0x9d, 0x47, 0xbf, 0x4c, 0x41, 0xb7, 0xe8, 0xe7, 0x27, 0xe6, 0x81, 0x71, 0x5a, 0xcb, 0x47, 0xfa, 0x16, 0x77, 0xcd, 0xba, 0x9c, 0xa7, 0xbc, 0xb0, 0x5a, 0xd8, 0xcc, 0x8a, 0xbd, 0x5d, 0xaa, 0x66,
|
||||
0x0d, 0x4a, 0x49, 0x5c, 0xb8, 0x44, 0xa3, 0xca, 0x8b, 0xa4, 0xed, 0xb8, 0xe6, 0xbc, 0xf8, 0x29, 0xef, 0x1c, 0x06, 0xd9, 0xcd, 0xea, 0x2b, 0x62, 0xca, 0x46, 0xc2, 0xa2, 0x1b, 0x8b, 0x0a, 0x79,
|
||||
0xa1, 0xd6, 0xd8, 0x48, 0xb5, 0xc5, 0x91, 0x5f, 0xcc, 0xd2, 0xf6, 0x4c, 0xf2, 0x16, 0xc6, 0xb1, 0xa0, 0x2c, 0xf7, 0xc7, 0x7b, 0xc8, 0x0d, 0x8d, 0x4e, 0x51, 0xb4, 0x19, 0xe8, 0x8f, 0xf0, 0xdd,
|
||||
0xaf, 0x3a, 0x85, 0x44, 0xa0, 0x22, 0x1a, 0x14, 0x8c, 0x2a, 0xc9, 0x04, 0x84, 0xb1, 0x98, 0x61, 0xe3, 0xaf, 0xca, 0x33, 0xfe, 0x17, 0x02, 0x1e, 0xfb, 0x8a, 0xd6, 0x49, 0x6b, 0x56, 0x79, 0x15,
|
||||
0x31, 0x33, 0x99, 0xe0, 0x96, 0x3a, 0xe8, 0xa9, 0x9d, 0xab, 0x8a, 0xf6, 0x6d, 0x34, 0x3e, 0x09, 0x7d, 0xae, 0x0c, 0x0f, 0xeb, 0x08, 0xdb, 0xc4, 0x3c, 0xcd, 0xaf, 0xef, 0x55, 0x15, 0xf4, 0x13,
|
||||
0x60, 0x21, 0xc6, 0xef, 0x90, 0xbf, 0xf9, 0xae, 0x94, 0xa7, 0x50, 0x6d, 0x62, 0x3d, 0x3a, 0x7a, 0x86, 0xc1, 0x75, 0x6d, 0x65, 0x5f, 0x50, 0xdd, 0x55, 0x8f, 0x71, 0x6d, 0x64, 0x62, 0x2a, 0x34,
|
||||
0x2b, 0x13, 0x00, 0x05, 0x35, 0xf3, 0xdb, 0x5f, 0x9b, 0x9b, 0x84, 0xa6, 0x5c, 0x43, 0x51, 0xf3, 0x86, 0xcd, 0x2c, 0xde, 0xde, 0xbb, 0x8c, 0x3a, 0xd2, 0xea, 0xb0, 0x86, 0xe6, 0xa3, 0xfe, 0xe5,
|
||||
0xfc, 0x0e, 0x1d, 0xad, 0x8e, 0x89, 0x57, 0x49, 0xdc, 0x90, 0xeb, 0x69, 0x0b, 0xc1, 0xba, 0x05, 0x9a, 0x1c, 0xd7, 0x72, 0xaf, 0xaa, 0xf6, 0x5a, 0x10, 0x6b, 0xf9, 0xe5, 0xe6, 0xb8, 0x05, 0x03,
|
||||
0xb6, 0x0b, 0x0a, 0xfe, 0x14, 0x4d, 0xef, 0xf7, 0xd9, 0x03, 0xed, 0x2d, 0x55, 0x45, 0xe7, 0x7e, 0xbe, 0x66, 0xa3, 0xc5, 0x1f, 0xee, 0x70, 0x16, 0xee, 0xb8, 0xfe, 0xe9, 0xeb, 0x63, 0x0c, 0x0f,
|
||||
0x64, 0x77, 0x4b, 0x27, 0xe7, 0xd5, 0xfe, 0xc8, 0x62, 0xfc, 0x4c, 0x0c, 0x13, 0xac, 0x6b, 0xf0, 0x91, 0x23, 0xb6, 0xf0, 0x5b, 0xb0, 0xe4, 0xb7, 0x5c, 0x97, 0xf3, 0x79, 0xa2, 0xb3, 0xa6, 0x79,
|
||||
};
|
||||
|
||||
|
||||
// "cn/r"
|
||||
const static uint8_t test_output_r[] = {
|
||||
0xf7, 0x59, 0x58, 0x8a, 0xd5, 0x7e, 0x75, 0x84, 0x67, 0x29, 0x54, 0x43, 0xa9, 0xbd, 0x71, 0x49, 0x0a, 0xbf, 0xf8, 0xe9, 0xda, 0xd1, 0xb9, 0x5b, 0x6b, 0xf2, 0xf5, 0xd0, 0xd7, 0x83, 0x87, 0xbc,
|
||||
0x5b, 0xb8, 0x33, 0xde, 0xca, 0x2b, 0xdd, 0x72, 0x52, 0xa9, 0xcc, 0xd7, 0xb4, 0xce, 0x0b, 0x6a, 0x48, 0x54, 0x51, 0x57, 0x94, 0xb5, 0x6c, 0x20, 0x72, 0x62, 0xf7, 0xa5, 0xb9, 0xbd, 0xb5, 0x66,
|
||||
0x1e, 0xe6, 0x72, 0x8d, 0xa6, 0x0f, 0xbd, 0x8d, 0x7d, 0x55, 0xb2, 0xb1, 0xad, 0xe4, 0x87, 0xa3, 0xcf, 0x52, 0xa2, 0xc3, 0xac, 0x6f, 0x52, 0x0d, 0xb1, 0x2c, 0x27, 0xd8, 0x92, 0x1f, 0x6c, 0xab,
|
||||
0x69, 0x69, 0xfe, 0x2d, 0xdf, 0xb7, 0x58, 0x43, 0x8d, 0x48, 0x04, 0x9f, 0x30, 0x2f, 0xc2, 0x10, 0x8a, 0x4f, 0xcc, 0x93, 0xe3, 0x76, 0x69, 0x17, 0x0e, 0x6d, 0xb4, 0xb0, 0xb9, 0xb4, 0xc4, 0xcb,
|
||||
0x7f, 0x30, 0x48, 0xb4, 0xe9, 0x0d, 0x0c, 0xbe, 0x7a, 0x57, 0xc0, 0x39, 0x4f, 0x37, 0x33, 0x8a, 0x01, 0xfa, 0xe3, 0xad, 0xfd, 0xc0, 0xe5, 0x12, 0x6d, 0x86, 0x3a, 0x89, 0x5e, 0xb0, 0x4e, 0x02,
|
||||
0x1d, 0x29, 0x04, 0x43, 0xa4, 0xb5, 0x42, 0xaf, 0x04, 0xa8, 0x2f, 0x6b, 0x24, 0x94, 0xa6, 0xee, 0x7f, 0x20, 0xf2, 0x75, 0x4c, 0x58, 0xe0, 0x84, 0x90, 0x32, 0x48, 0x3a, 0x56, 0xe8, 0xe2, 0xef,
|
||||
0xc4, 0x3c, 0xc6, 0x56, 0x74, 0x36, 0xa8, 0x6a, 0xfb, 0xd6, 0xaa, 0x9e, 0xaa, 0x7c, 0x27, 0x6e, 0x98, 0x06, 0x83, 0x03, 0x34, 0xb6, 0x14, 0xb2, 0xbe, 0xe2, 0x3c, 0xc7, 0x66, 0x34, 0xf6, 0xfd,
|
||||
0x87, 0xbe, 0x24, 0x79, 0xc0, 0xc4, 0xe8, 0xed, 0xfd, 0xfa, 0xa5, 0x60, 0x3e, 0x93, 0xf4, 0x26, 0x5b, 0x3f, 0x82, 0x24, 0xc1, 0xc5, 0x94, 0x6f, 0xeb, 0x42, 0x48, 0x19, 0xd1, 0x89, 0x90, 0xa4,
|
||||
0xdd, 0x9d, 0x6a, 0x6d, 0x8e, 0x47, 0x46, 0x5c, 0xce, 0xac, 0x08, 0x77, 0xef, 0x88, 0x9b, 0x93, 0xe7, 0xeb, 0xa9, 0x79, 0x55, 0x7e, 0x39, 0x35, 0xd7, 0xf8, 0x6d, 0xce, 0x11, 0xb0, 0x70, 0xf3,
|
||||
0x75, 0xc6, 0xf2, 0xae, 0x49, 0xa2, 0x05, 0x21, 0xde, 0x97, 0x28, 0x5b, 0x43, 0x1e, 0x71, 0x71, 0x25, 0x84, 0x7f, 0xb8, 0x93, 0x5e, 0xd8, 0x4a, 0x61, 0xe7, 0xf8, 0xd3, 0x6a, 0x2c, 0x3d, 0x8e,
|
||||
};
|
||||
|
||||
|
||||
// "cn/0"
|
||||
const static uint8_t test_output_v0[160] = {
|
||||
0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7,
|
||||
0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00,
|
||||
0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66,
|
||||
0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F,
|
||||
0xA1, 0xB4, 0xFA, 0xE3, 0xE5, 0x76, 0xCE, 0xCF, 0xB7, 0x9C, 0xAF, 0x3E, 0x29, 0x92, 0xE4, 0xE0,
|
||||
0x31, 0x24, 0x05, 0x48, 0xBF, 0x8D, 0x5F, 0x7B, 0x11, 0x03, 0x60, 0xAA, 0xD7, 0x50, 0x3F, 0x0C,
|
||||
0x2D, 0x30, 0xF3, 0x87, 0x4F, 0x86, 0xA1, 0x4A, 0xB5, 0xA2, 0x1A, 0x08, 0xD0, 0x44, 0x2C, 0x9D,
|
||||
0x16, 0xE9, 0x28, 0x49, 0xA1, 0xFF, 0x85, 0x6F, 0x12, 0xBB, 0x7D, 0xAB, 0x11, 0x1C, 0xE7, 0xF7,
|
||||
0x2D, 0x9D, 0x19, 0xE4, 0xD2, 0x26, 0x44, 0x1E, 0xCD, 0x22, 0x08, 0x24, 0xA8, 0x97, 0x46, 0x62,
|
||||
0x04, 0x84, 0x90, 0x4A, 0xEE, 0x99, 0x14, 0xED, 0xB8, 0xC6, 0x0D, 0x37, 0xA1, 0x66, 0x17, 0xB0
|
||||
};
|
||||
|
||||
|
||||
// "cn/1" Cryptonight variant 1 (Monero v7)
|
||||
const static uint8_t test_output_v1[160] = {
|
||||
0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9,
|
||||
0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9,
|
||||
0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D,
|
||||
0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22,
|
||||
0xE7, 0x8C, 0x5A, 0x6E, 0x38, 0x30, 0x68, 0x4A, 0x73, 0xFC, 0x1B, 0xC6, 0x6D, 0xFC, 0x8D, 0x98,
|
||||
0xB4, 0xC2, 0x23, 0x39, 0xAD, 0xE0, 0x9D, 0xF6, 0x6D, 0x8C, 0x6A, 0xAA, 0xF9, 0xB2, 0xE3, 0x4C,
|
||||
0xB6, 0x90, 0x6C, 0xE6, 0x15, 0x5E, 0x46, 0x07, 0x9C, 0xB2, 0x6B, 0xAC, 0x3B, 0xAC, 0x1A, 0xDE,
|
||||
0x92, 0x2C, 0xD6, 0x0C, 0x46, 0x9D, 0x9B, 0xC2, 0x84, 0x52, 0x65, 0xF6, 0xBD, 0xFA, 0x0D, 0x74,
|
||||
0x00, 0x66, 0x10, 0x07, 0xF1, 0x19, 0x06, 0x3A, 0x6C, 0xFF, 0xEE, 0xB2, 0x40, 0xE5, 0x88, 0x2B,
|
||||
0x6C, 0xAB, 0x6B, 0x1D, 0x88, 0xB8, 0x44, 0x25, 0xF4, 0xEA, 0xB7, 0xEC, 0xBA, 0x12, 0x8A, 0x24
|
||||
};
|
||||
|
||||
|
||||
// "cn/2" Cryptonight variant 2 (Monero v8)
|
||||
const static uint8_t test_output_v2[160] = {
|
||||
0x97, 0x37, 0x82, 0x82, 0xCF, 0x10, 0xE7, 0xAD, 0x03, 0x3F, 0x7B, 0x80, 0x74, 0xC4, 0x0E, 0x14,
|
||||
0xD0, 0x6E, 0x7F, 0x60, 0x9D, 0xDD, 0xDA, 0x78, 0x76, 0x80, 0xB5, 0x8C, 0x05, 0xF4, 0x3D, 0x21,
|
||||
0x87, 0x1F, 0xCD, 0x68, 0x23, 0xF6, 0xA8, 0x79, 0xBB, 0x3F, 0x33, 0x95, 0x1C, 0x8E, 0x8E, 0x89,
|
||||
0x1D, 0x40, 0x43, 0x88, 0x0B, 0x02, 0xDF, 0xA1, 0xBB, 0x3B, 0xE4, 0x98, 0xB5, 0x0E, 0x75, 0x78,
|
||||
0xE6, 0x0D, 0x24, 0x0F, 0x65, 0x85, 0x60, 0x3A, 0x4A, 0xE5, 0x5F, 0x54, 0x9B, 0xC8, 0x79, 0x93,
|
||||
0xEB, 0x3D, 0x98, 0x2C, 0xFE, 0x9B, 0xFB, 0x15, 0xB6, 0x88, 0x21, 0x94, 0xB0, 0x05, 0x86, 0x5C,
|
||||
0x59, 0x8B, 0x93, 0x7A, 0xDA, 0xD2, 0xA2, 0x14, 0xED, 0xB7, 0xC4, 0x5D, 0xA1, 0xEF, 0x26, 0xF3,
|
||||
0xC7, 0x73, 0x29, 0x4D, 0xF1, 0xC8, 0x2C, 0xE0, 0xD0, 0xE9, 0xED, 0x0C, 0x70, 0x75, 0x05, 0x3E,
|
||||
0x5B, 0xF6, 0xA0, 0x6E, 0xEA, 0xDE, 0x87, 0x0B, 0x06, 0x29, 0x03, 0xBF, 0xB4, 0x85, 0x9D, 0x04,
|
||||
0x75, 0x1A, 0xCD, 0x1E, 0xD6, 0xAA, 0x1B, 0x05, 0x24, 0x6A, 0x2C, 0x80, 0x69, 0x68, 0xDC, 0x97
|
||||
};
|
||||
|
||||
|
||||
// "cn/xtl" Stellite (XTL)
|
||||
const static uint8_t test_output_xtl[160] = {
|
||||
0x8F, 0xE5, 0xF0, 0x5F, 0x02, 0x2A, 0x61, 0x7D, 0xE5, 0x3F, 0x79, 0x36, 0x4B, 0x25, 0xCB, 0xC3,
|
||||
0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1,
|
||||
0x21, 0x26, 0xFF, 0x98, 0xE6, 0x86, 0x08, 0x5B, 0xC9, 0x96, 0x44, 0xA3, 0xB8, 0x4E, 0x28, 0x90,
|
||||
0x76, 0xED, 0xAD, 0xB9, 0xAA, 0xAC, 0x01, 0x94, 0x1D, 0xBE, 0x3E, 0xEA, 0xAD, 0xEE, 0xB2, 0xCF,
|
||||
0xB0, 0x43, 0x4B, 0x88, 0xFC, 0xB2, 0xF3, 0x82, 0x9D, 0xD7, 0xDF, 0x51, 0x97, 0x2C, 0x5A, 0xE3,
|
||||
0xC7, 0x16, 0x0B, 0xC8, 0x7C, 0xB7, 0x2F, 0x1C, 0x55, 0x33, 0xCA, 0xE1, 0xEE, 0x08, 0xA4, 0x86,
|
||||
0x60, 0xED, 0x6E, 0x9D, 0x2D, 0x05, 0x0D, 0x7D, 0x02, 0x49, 0x23, 0x39, 0x7C, 0xC3, 0x6D, 0x3D,
|
||||
0x05, 0x51, 0x28, 0xF1, 0x9B, 0x3C, 0xDF, 0xC4, 0xEA, 0x8A, 0xA6, 0x6A, 0x3C, 0x8B, 0xE2, 0xAF,
|
||||
0x47, 0x00, 0xFC, 0x36, 0xED, 0x50, 0xBB, 0xD2, 0x2E, 0x63, 0x4B, 0x93, 0x11, 0x0C, 0xA7, 0xBA,
|
||||
0x32, 0x6E, 0x47, 0x4D, 0xCE, 0xCC, 0x82, 0x54, 0x1D, 0x06, 0xF8, 0x06, 0x86, 0xBD, 0x22, 0x48
|
||||
};
|
||||
|
||||
|
||||
// "cn/half"
|
||||
const static uint8_t test_output_half[160] = {
|
||||
0x5D, 0x4F, 0xBC, 0x35, 0x60, 0x97, 0xEA, 0x64, 0x40, 0xB0, 0x88, 0x8E, 0xDE, 0xB6, 0x35, 0xDD,
|
||||
0xC8, 0x4A, 0x0E, 0x39, 0x7C, 0x86, 0x84, 0x56, 0x89, 0x5C, 0x3F, 0x29, 0xBE, 0x73, 0x12, 0xA7,
|
||||
0x02, 0xE6, 0x1D, 0x2B, 0xBC, 0x84, 0xB6, 0x71, 0x96, 0x71, 0xD5, 0x0C, 0xAC, 0x76, 0x0E, 0x6B,
|
||||
0xF1, 0xF0, 0x55, 0x34, 0x15, 0x29, 0x93, 0x04, 0x2D, 0xED, 0xD2, 0x33, 0x50, 0x6E, 0xBE, 0x25,
|
||||
0xD0, 0xFD, 0x8E, 0xC6, 0x15, 0xD5, 0x12, 0x53, 0x7B, 0x26, 0xF6, 0x01, 0xA5, 0xA8, 0xBE, 0x7C,
|
||||
0xCF, 0x5E, 0x19, 0xB7, 0x63, 0x0D, 0x0F, 0x02, 0x2B, 0xD7, 0xC4, 0x8C, 0x12, 0x24, 0x80, 0x02,
|
||||
0xE7, 0xB7, 0xA0, 0x4F, 0x94, 0xF9, 0x46, 0xB5, 0x18, 0x64, 0x7E, 0x4E, 0x9C, 0x81, 0x6C, 0x60,
|
||||
0x7D, 0x2E, 0xEA, 0xCF, 0x90, 0xCB, 0x68, 0x09, 0xC9, 0x53, 0xF6, 0xA9, 0xCA, 0x0C, 0xAC, 0xDC,
|
||||
0xFD, 0x07, 0xDA, 0x24, 0x1D, 0xD1, 0x35, 0x32, 0x3C, 0xE8, 0x64, 0x44, 0x5E, 0xCB, 0xB5, 0x00,
|
||||
0x69, 0xF4, 0x6F, 0xBB, 0x62, 0x0D, 0x25, 0xD8, 0xAC, 0x20, 0x90, 0xC5, 0x1B, 0xD3, 0x5F, 0xCA
|
||||
};
|
||||
|
||||
|
||||
// "cn/msr" Masari (MSR)
|
||||
const static uint8_t test_output_msr[160] = {
|
||||
0x3C, 0x7A, 0x61, 0x08, 0x4C, 0x5E, 0xB8, 0x65, 0xB4, 0x98, 0xAB, 0x2F, 0x5A, 0x1A, 0xC5, 0x2C,
|
||||
0x49, 0xC1, 0x77, 0xC2, 0xD0, 0x13, 0x34, 0x42, 0xD6, 0x5E, 0xD5, 0x14, 0x33, 0x5C, 0x82, 0xC5,
|
||||
0x69, 0xDF, 0x38, 0x51, 0x1B, 0xB3, 0xEB, 0x7D, 0xE7, 0x6B, 0x08, 0x8E, 0xB6, 0x7E, 0xB7, 0x1C,
|
||||
0x5F, 0x3C, 0x81, 0xC9, 0xF7, 0xCE, 0xAE, 0x28, 0xC0, 0xFE, 0xEB, 0xBA, 0x0B, 0x40, 0x38, 0x1D,
|
||||
0x44, 0xD0, 0xD5, 0xD3, 0x98, 0x1F, 0xA3, 0x0E, 0xE9, 0x89, 0x1A, 0xD7, 0x88, 0xCC, 0x25, 0x76,
|
||||
0x9C, 0xFF, 0x4D, 0x7F, 0x9C, 0xCF, 0x48, 0x07, 0x91, 0xF9, 0x82, 0xF5, 0x4C, 0xE9, 0xBD, 0x82,
|
||||
0x36, 0x36, 0x64, 0x14, 0xED, 0xB8, 0x54, 0xEE, 0x22, 0xA1, 0x66, 0xA3, 0x87, 0x10, 0x76, 0x1F,
|
||||
0x5A, 0xCD, 0x4C, 0x31, 0x4C, 0xBA, 0x41, 0xD2, 0xDB, 0x6C, 0x31, 0x2E, 0x7A, 0x64, 0x15, 0xFF,
|
||||
0xA6, 0xD9, 0xB9, 0x7D, 0x1C, 0x3C, 0x98, 0xDD, 0x16, 0xE6, 0xD3, 0xAA, 0xEF, 0xB6, 0xB3, 0x53,
|
||||
0x74, 0xD1, 0xAC, 0x5C, 0x04, 0x26, 0x7D, 0x71, 0xDE, 0xAB, 0x66, 0x28, 0x91, 0x3A, 0x6F, 0x4F
|
||||
};
|
||||
|
||||
|
||||
// "cn/xao" Alloy (XAO)
|
||||
const static uint8_t test_output_xao[160] = {
|
||||
0x9A, 0x29, 0xD0, 0xC4, 0xAF, 0xDC, 0x63, 0x9B, 0x65, 0x53, 0xB1, 0xC8, 0x37, 0x35, 0x11, 0x4C,
|
||||
0x5D, 0x77, 0x16, 0x21, 0x42, 0x97, 0x5C, 0xB8, 0x50, 0xC0, 0xA5, 0x1F, 0x64, 0x07, 0xBD, 0x33,
|
||||
0xF1, 0xC9, 0x98, 0x40, 0x42, 0xDE, 0x39, 0xD1, 0xBA, 0x2D, 0xAD, 0xEC, 0xFE, 0xEA, 0xD8, 0x46,
|
||||
0x56, 0x1C, 0x32, 0x90, 0x42, 0x63, 0x10, 0x80, 0xD7, 0x01, 0xE4, 0xE6, 0x20, 0xB3, 0x60, 0x45,
|
||||
0x05, 0xE5, 0xC2, 0x18, 0xCD, 0x07, 0xA4, 0x40, 0x42, 0x91, 0xE2, 0xA4, 0x52, 0x54, 0x79, 0xBA,
|
||||
0xCD, 0x7E, 0x61, 0x2D, 0x7F, 0x7E, 0x69, 0x5E, 0xD7, 0xC0, 0x06, 0x65, 0xD7, 0xA1, 0xB8, 0xB8,
|
||||
0x1E, 0x31, 0x1C, 0xD3, 0xB7, 0xBC, 0x78, 0x3C, 0x01, 0xAF, 0x77, 0xAA, 0xF3, 0x0F, 0x4C, 0xF2,
|
||||
0xD1, 0x8B, 0x58, 0xC7, 0xEB, 0x99, 0x91, 0x53, 0x43, 0x71, 0x47, 0x99, 0x9E, 0x04, 0xA4, 0xEA,
|
||||
0xB8, 0xA3, 0xB0, 0x9E, 0x09, 0xF5, 0x57, 0x5C, 0xCF, 0x8A, 0xC6, 0xCA, 0x88, 0x51, 0x9A, 0x01,
|
||||
0x31, 0xCC, 0x0C, 0xA6, 0x53, 0xB5, 0x5F, 0xFD, 0x7D, 0x29, 0x3A, 0x35, 0xE9, 0x0E, 0x25, 0x6C
|
||||
};
|
||||
|
||||
|
||||
// "cn/rto" Arto (RTO)
|
||||
const static uint8_t test_output_rto[160] = {
|
||||
0x82, 0x66, 0x1E, 0x1C, 0x6E, 0x64, 0x36, 0x66, 0x84, 0x06, 0x32, 0x7A, 0x9B, 0xB1, 0x13, 0x19,
|
||||
0xA5, 0x56, 0x16, 0x15, 0xDF, 0xEC, 0x1C, 0x9E, 0xE3, 0x88, 0x4A, 0x6C, 0x1C, 0xEB, 0x76, 0xA5,
|
||||
0xB3, 0xFB, 0xF4, 0x3F, 0x2B, 0x6A, 0x3A, 0x39, 0xA3, 0x6E, 0x08, 0x33, 0x67, 0x90, 0x31, 0xB9,
|
||||
0x3F, 0x27, 0xE4, 0x79, 0x32, 0x61, 0x6B, 0x5C, 0x8A, 0xF8, 0xAF, 0xC0, 0x60, 0xFD, 0x83, 0xB7,
|
||||
0x11, 0x11, 0x89, 0xB4, 0xDC, 0xAE, 0x40, 0xC8, 0x64, 0xAA, 0x4D, 0x19, 0x23, 0x7B, 0xD3, 0x27,
|
||||
0xB2, 0x0F, 0xA7, 0x50, 0x7D, 0xCA, 0xF5, 0x03, 0x06, 0xB2, 0x26, 0x62, 0xF3, 0x68, 0x2D, 0x30,
|
||||
0x6F, 0x93, 0x1E, 0xFF, 0xCD, 0x85, 0x40, 0x28, 0x5F, 0xC3, 0x8C, 0x76, 0x51, 0x9E, 0xD5, 0x06,
|
||||
0x32, 0xD6, 0x35, 0x83, 0xF6, 0x3B, 0x54, 0x4F, 0xA1, 0x9C, 0x13, 0xD8, 0xC4, 0x0E, 0x01, 0x2F,
|
||||
0x29, 0xDB, 0x8C, 0x1C, 0xB7, 0x06, 0x86, 0x79, 0x6D, 0xFF, 0x9F, 0x89, 0x3B, 0x3A, 0xA5, 0x79,
|
||||
0xE7, 0x81, 0x4E, 0x2A, 0xBD, 0x62, 0xC1, 0x1B, 0x7C, 0xB9, 0x33, 0x7B, 0xEE, 0x95, 0x80, 0xB3
|
||||
};
|
||||
|
||||
// "cn/rwz"
|
||||
const static uint8_t test_output_rwz[160] = {
|
||||
0x5f, 0x56, 0xc6, 0xb0, 0x99, 0x6b, 0xa2, 0x3e, 0x0b, 0xba, 0x07, 0x29, 0xc9, 0x90, 0x74, 0x85,
|
||||
0x5a, 0x10, 0xe3, 0x08, 0x7f, 0xdb, 0xfe, 0x94, 0x75, 0x33, 0x54, 0x73, 0x76, 0xf0, 0x75, 0xb8,
|
||||
0x8b, 0x70, 0x43, 0x9a, 0xfc, 0xf5, 0xeb, 0x15, 0xbb, 0xf9, 0xad, 0x9d, 0x2a, 0xbd, 0x72, 0x52,
|
||||
0x49, 0x54, 0x0b, 0x91, 0xea, 0x61, 0x7f, 0x98, 0x7d, 0x39, 0x17, 0xb7, 0xd7, 0x65, 0xff, 0x75,
|
||||
0x13, 0x21, 0x1d, 0xce, 0x61, 0x5a, 0xdc, 0x5f, 0x8c, 0xcb, 0x1f, 0x6f, 0xbb, 0x92, 0x88, 0xc3,
|
||||
0xe3, 0xe2, 0xfc, 0x4f, 0x62, 0xfb, 0xf0, 0x48, 0x02, 0x01, 0xd3, 0xbe, 0x77, 0x6a, 0x40, 0xca,
|
||||
0x9a, 0xe9, 0xba, 0x0c, 0xc0, 0x2b, 0x11, 0xf6, 0x9b, 0xee, 0x24, 0x3a, 0xd8, 0x86, 0x18, 0xd0,
|
||||
0xe8, 0xeb, 0xcb, 0x38, 0x2c, 0xf5, 0x99, 0x83, 0x14, 0x7b, 0x0c, 0x20, 0xbe, 0x50, 0xf4, 0x87,
|
||||
0x83, 0x41, 0x75, 0xd8, 0xd1, 0xdd, 0x4b, 0x73, 0xb3, 0x92, 0x8f, 0xe6, 0x1c, 0x72, 0x70, 0xf5,
|
||||
0x7c, 0xf6, 0x23, 0x3a, 0xb4, 0x5f, 0xdf, 0xde, 0xa6, 0x5a, 0x58, 0xec, 0x13, 0x5a, 0x23, 0x2f
|
||||
};
|
||||
|
||||
// "cn/zls"
|
||||
const static uint8_t test_output_zls[160] = {
|
||||
0x51, 0x6E, 0x33, 0xC6, 0xE4, 0x46, 0xAB, 0xBC, 0xCD, 0xAD, 0x18, 0xC0, 0x4C, 0xD9, 0xA2, 0x5E,
|
||||
0x64, 0x10, 0x28, 0x53, 0xB2, 0x0A, 0x42, 0xDF, 0xDE, 0xAA, 0x8B, 0x59, 0x9E, 0xCF, 0x40, 0xE2,
|
||||
0x0D, 0x62, 0x5B, 0x42, 0x18, 0xE2, 0x76, 0xAD, 0xD0, 0x74, 0x90, 0x60, 0x8D, 0xC4, 0xC7, 0x80,
|
||||
0x17, 0xB5, 0x1B, 0x25, 0x31, 0x39, 0x87, 0xD2, 0x2D, 0x6A, 0x9D, 0x1C, 0x74, 0xF4, 0x43, 0x22,
|
||||
0x4B, 0x97, 0x1F, 0x6A, 0xD0, 0xBE, 0x00, 0x74, 0xEC, 0xC5, 0xD8, 0x3B, 0xE6, 0xF4, 0x03, 0x8A,
|
||||
0x7B, 0xBA, 0x80, 0xCC, 0x9F, 0x00, 0xCB, 0xC2, 0x14, 0x8F, 0xF3, 0xD8, 0x92, 0x73, 0xBF, 0x17,
|
||||
0x3D, 0x9B, 0x22, 0xA3, 0x61, 0x94, 0x41, 0x9E, 0xF9, 0x68, 0x1D, 0x42, 0x48, 0x3B, 0x39, 0x45,
|
||||
0xE2, 0xE6, 0x16, 0x84, 0xFC, 0x21, 0xE6, 0xDA, 0x38, 0x7F, 0x17, 0xAB, 0xD3, 0xF2, 0xCE, 0x1A,
|
||||
0x2F, 0x35, 0xD5, 0x74, 0xFA, 0x45, 0x3B, 0x06, 0xD1, 0x4E, 0x84, 0x3A, 0x5D, 0xE3, 0x0E, 0xA5,
|
||||
0x00, 0x08, 0x64, 0xF0, 0xA6, 0xC8, 0x94, 0x45, 0x08, 0xED, 0x03, 0x95, 0x52, 0xE9, 0xBC, 0x5F
|
||||
};
|
||||
|
||||
// "cn/double"
|
||||
const static uint8_t test_output_double[160] = {
|
||||
0xAE, 0xFB, 0xB3, 0xF0, 0xCC, 0x88, 0x04, 0x6D, 0x11, 0x9F, 0x6C, 0x54, 0xB9, 0x6D, 0x90, 0xC9,
|
||||
0xE8, 0x84, 0xEA, 0x3B, 0x59, 0x83, 0xA6, 0x0D, 0x50, 0xA4, 0x2D, 0x7D, 0x3E, 0xBE, 0x48, 0x21,
|
||||
0x49, 0xCE, 0x8E, 0xF3, 0xBC, 0x8A, 0x36, 0xBF, 0x86, 0x37, 0x89, 0x55, 0x09, 0xBA, 0x22, 0xF8,
|
||||
0xEB, 0x3A, 0xE1, 0xDC, 0x91, 0xF7, 0x62, 0x4B, 0x9F, 0x48, 0xE6, 0x92, 0xBD, 0xE4, 0x5D, 0xC1,
|
||||
0xF1, 0x3C, 0x63, 0x1D, 0xEB, 0x0B, 0x04, 0xA3, 0x30, 0xD5, 0x11, 0x15, 0x4C, 0xCE, 0xEF, 0x4F,
|
||||
0xDF, 0x69, 0xE3, 0x9E, 0xD2, 0x68, 0xFC, 0x1B, 0x6F, 0xE8, 0x08, 0x9C, 0xBB, 0xA5, 0x2B, 0x60,
|
||||
0x52, 0x0F, 0xE5, 0xD2, 0xF3, 0x8A, 0xB3, 0xE1, 0x76, 0x7F, 0x44, 0x25, 0x76, 0xEC, 0xFF, 0xA2,
|
||||
0x0C, 0x64, 0xD0, 0x0E, 0x32, 0x33, 0x28, 0x20, 0x73, 0xE0, 0x31, 0x66, 0x4E, 0x54, 0x83, 0x49,
|
||||
0x51, 0x55, 0x4D, 0x2E, 0x22, 0xB7, 0x51, 0x09, 0x73, 0x61, 0x7E, 0x6A, 0x57, 0x0B, 0x28, 0x3C,
|
||||
0x5E, 0x2E, 0xC1, 0x80, 0x89, 0x39, 0xB3, 0x54, 0x39, 0x52, 0x0E, 0x69, 0x3D, 0xF6, 0xC5, 0x4A
|
||||
};
|
||||
|
||||
#ifndef XMRIG_NO_AEON
|
||||
// "cn-lite/0"
|
||||
const static uint8_t test_output_v0_lite[160] = {
|
||||
0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
|
||||
0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88,
|
||||
0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE,
|
||||
0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD,
|
||||
0x38, 0x08, 0xE1, 0x17, 0x0B, 0x99, 0x8D, 0x1A, 0x3C, 0xCE, 0x35, 0xC5, 0xC7, 0x3A, 0x00, 0x2E,
|
||||
0xCB, 0x54, 0xF0, 0x78, 0x2E, 0x9E, 0xDB, 0xC7, 0xDF, 0x2E, 0x71, 0x9A, 0x16, 0x97, 0xC4, 0x18,
|
||||
0x4B, 0x97, 0x07, 0xFE, 0x5D, 0x98, 0x9A, 0xD6, 0xD8, 0xE5, 0x92, 0x66, 0x87, 0x7F, 0x19, 0x37,
|
||||
0xA2, 0x5E, 0xE6, 0x96, 0xB5, 0x97, 0x33, 0x89, 0xE0, 0xA7, 0xC9, 0xDD, 0x4A, 0x7E, 0x9E, 0x53,
|
||||
0xBE, 0x91, 0x2B, 0xF5, 0xF5, 0xAF, 0xDD, 0x09, 0xA2, 0xF4, 0xA4, 0x56, 0xEB, 0x96, 0x22, 0xC9,
|
||||
0x94, 0xFB, 0x7B, 0x28, 0xC9, 0x97, 0x65, 0x04, 0xAC, 0x4F, 0x84, 0x71, 0xDA, 0x6E, 0xD8, 0xC5
|
||||
};
|
||||
|
||||
|
||||
// "cn-lite/1" AEON v7
|
||||
const static uint8_t test_output_v1_lite[160] = {
|
||||
0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22,
|
||||
0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41,
|
||||
0x87, 0xC4, 0xE5, 0x70, 0x65, 0x3E, 0xB4, 0xC2, 0xB4, 0x2B, 0x7A, 0x0D, 0x54, 0x65, 0x59, 0x45,
|
||||
0x2D, 0xFA, 0xB5, 0x73, 0xB8, 0x2E, 0xC5, 0x2F, 0x15, 0x2B, 0x7F, 0xF9, 0x8E, 0x79, 0x44, 0x6F,
|
||||
0x16, 0x08, 0x74, 0xC7, 0xA2, 0xD2, 0xA3, 0x97, 0x95, 0x76, 0xCA, 0x4D, 0x06, 0x39, 0x7A, 0xAB,
|
||||
0x6C, 0x87, 0x58, 0x33, 0x4D, 0xC8, 0x5A, 0xAB, 0x04, 0x27, 0xFE, 0x8B, 0x1C, 0x23, 0x2F, 0x32,
|
||||
0xC0, 0x44, 0xFF, 0x0D, 0xB5, 0x3B, 0x27, 0x96, 0x06, 0x89, 0x7B, 0xA3, 0x0B, 0xD0, 0xCE, 0x9E,
|
||||
0x90, 0x22, 0x77, 0x5A, 0xAD, 0xA1, 0xE5, 0xB6, 0xFC, 0xCB, 0x39, 0x7E, 0x2B, 0x10, 0xEE, 0xB4,
|
||||
0x8C, 0x2B, 0xA4, 0x1F, 0x60, 0x76, 0x39, 0xD7, 0xF6, 0x46, 0x77, 0x18, 0x20, 0xAD, 0xD4, 0xC9,
|
||||
0x87, 0xF7, 0x37, 0xDA, 0xFD, 0xBA, 0xBA, 0xD2, 0xF2, 0x68, 0xDC, 0x26, 0x8D, 0x1B, 0x08, 0xC6
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef XMRIG_NO_SUMO
|
||||
// "cn-heavy/0"
|
||||
const static uint8_t test_output_v0_heavy[160] = {
|
||||
0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64,
|
||||
0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2,
|
||||
0x4D, 0x94, 0x7D, 0xD6, 0xDB, 0x6E, 0x07, 0x48, 0x26, 0x4A, 0x51, 0x2E, 0xAC, 0xF3, 0x25, 0x4A,
|
||||
0x1F, 0x1A, 0xA2, 0x5B, 0xFC, 0x0A, 0xAD, 0x82, 0xDE, 0xA8, 0x99, 0x96, 0x88, 0x52, 0xD2, 0x7D,
|
||||
0x3E, 0xE1, 0x23, 0x03, 0x5A, 0x63, 0x7B, 0x66, 0xF6, 0xD7, 0xC2, 0x2A, 0x34, 0x5E, 0x88, 0xE7,
|
||||
0xFA, 0xC4, 0x25, 0x36, 0x54, 0xCB, 0xD2, 0x5C, 0x2F, 0x80, 0x2A, 0xF9, 0xCC, 0x43, 0xF7, 0xCD,
|
||||
0xE5, 0x18, 0xA8, 0x05, 0x60, 0x18, 0xA5, 0x73, 0x72, 0x9B, 0x32, 0xDC, 0x69, 0x83, 0xC1, 0xE1,
|
||||
0x1F, 0xDB, 0xDA, 0x6B, 0xAC, 0xEC, 0x9F, 0x67, 0xF8, 0x27, 0x1D, 0xC7, 0xE6, 0x46, 0x42, 0xF9,
|
||||
0x53, 0x62, 0x0A, 0x54, 0x7D, 0x43, 0xEA, 0x18, 0x94, 0xED, 0xD8, 0x92, 0x06, 0x6A, 0xA1, 0x51,
|
||||
0xAD, 0xB1, 0xFD, 0x89, 0xFB, 0x5C, 0xB4, 0x25, 0x6A, 0xDD, 0xB0, 0x09, 0xC5, 0x72, 0x87, 0xEB
|
||||
};
|
||||
|
||||
|
||||
// "cn-heavy/xhv"
|
||||
const static uint8_t test_output_xhv_heavy[160] = {
|
||||
0x5A, 0xC3, 0xF7, 0x85, 0xC4, 0x90, 0xC5, 0x85, 0x50, 0xEC, 0x95, 0xD2, 0x72, 0x65, 0x63, 0x57,
|
||||
0x7E, 0x7C, 0x1C, 0x21, 0x2D, 0x0C, 0xDE, 0x59, 0x12, 0x73, 0x20, 0x1E, 0x44, 0xFD, 0xD5, 0xB6,
|
||||
0x1F, 0x4E, 0xB2, 0x0A, 0x36, 0x51, 0x4B, 0xF5, 0x4D, 0xC9, 0xE0, 0x90, 0x2C, 0x16, 0x47, 0x3F,
|
||||
0xDE, 0x18, 0x29, 0x8E, 0xBB, 0x34, 0x2B, 0xEF, 0x7A, 0x04, 0x22, 0xD1, 0xB1, 0xF2, 0x48, 0xDA,
|
||||
0xE3, 0x7F, 0x4B, 0x4C, 0xB4, 0xDF, 0xE8, 0xD3, 0x70, 0xE2, 0xE7, 0x44, 0x25, 0x87, 0x12, 0xF9,
|
||||
0x8F, 0x28, 0x0B, 0xCE, 0x2C, 0xEE, 0xDD, 0x88, 0x94, 0x35, 0x48, 0x51, 0xAE, 0xC8, 0x9C, 0x0B,
|
||||
0xED, 0x2F, 0xE6, 0x0F, 0x39, 0x05, 0xB4, 0x4A, 0x8F, 0x38, 0x44, 0x2D, 0x4B, 0xE9, 0x7B, 0x81,
|
||||
0xC6, 0xB0, 0xE0, 0x0A, 0x39, 0x8C, 0x38, 0xFE, 0x63, 0x31, 0x47, 0x65, 0x0D, 0x2B, 0xF4, 0x96,
|
||||
0x13, 0x91, 0x89, 0xB4, 0x5B, 0xA9, 0x2A, 0x7A, 0x09, 0x65, 0x14, 0x20, 0x76, 0x24, 0x6C, 0x80,
|
||||
0x1D, 0x3F, 0x9F, 0xCD, 0x68, 0x39, 0xA9, 0x42, 0x27, 0xC1, 0x0C, 0x53, 0x98, 0x35, 0x60, 0x7A
|
||||
};
|
||||
|
||||
|
||||
// "cn-heavy/tube"
|
||||
const static uint8_t test_output_tube_heavy[160] = {
|
||||
0xFE, 0x53, 0x35, 0x20, 0x76, 0xEA, 0xE6, 0x89, 0xFA, 0x3B, 0x4F, 0xDA, 0x61, 0x46, 0x34, 0xCF,
|
||||
0xC3, 0x12, 0xEE, 0x0C, 0x38, 0x7D, 0xF2, 0xB8, 0xB7, 0x4D, 0xA2, 0xA1, 0x59, 0x74, 0x12, 0x35,
|
||||
0xCD, 0x3F, 0x29, 0xDF, 0x07, 0x4A, 0x14, 0xAD, 0x0B, 0x98, 0x99, 0x37, 0xCA, 0x14, 0x68, 0xA3,
|
||||
0x8D, 0xAE, 0x86, 0xC1, 0xA3, 0x54, 0x05, 0xBE, 0xEA, 0x6D, 0x29, 0x24, 0x0C, 0x82, 0x97, 0x74,
|
||||
0xA0, 0x64, 0x77, 0xCD, 0x8D, 0x8A, 0xC3, 0x10, 0xB4, 0x89, 0x0E, 0xBB, 0x7D, 0xE6, 0x32, 0x8F,
|
||||
0xF4, 0x2D, 0xB6, 0x9E, 0x8A, 0xF9, 0xF8, 0xEE, 0x2C, 0xD0, 0x74, 0xED, 0xA9, 0xAA, 0xA1, 0xFB,
|
||||
0xE2, 0xC9, 0x89, 0x66, 0xD6, 0x66, 0x52, 0xA2, 0x16, 0xDA, 0x36, 0xA0, 0x10, 0x62, 0xD2, 0xB1,
|
||||
0x76, 0xD1, 0x31, 0xE9, 0x1C, 0x08, 0xB6, 0xCA, 0xAF, 0x89, 0xB9, 0x3D, 0x2C, 0xFA, 0x9A, 0x30,
|
||||
0x74, 0x6A, 0x96, 0xA1, 0x95, 0x6C, 0xBB, 0x46, 0x4D, 0xE0, 0xEB, 0x28, 0xBE, 0x2A, 0x8C, 0x34,
|
||||
0x57, 0x79, 0xBE, 0x52, 0xFB, 0xBC, 0x68, 0x43, 0x45, 0xF4, 0xDF, 0xA5, 0xA8, 0xFD, 0x55, 0xA6
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef XMRIG_NO_CN_PICO
|
||||
// "cn-pico/trtl"
|
||||
const static uint8_t test_output_pico_trtl[160] = {
|
||||
0x08, 0xF4, 0x21, 0xD7, 0x83, 0x31, 0x17, 0x30, 0x0E, 0xDA, 0x66, 0xE9, 0x8F, 0x4A, 0x25, 0x69,
|
||||
0x09, 0x3D, 0xF3, 0x00, 0x50, 0x01, 0x73, 0x94, 0x4E, 0xFC, 0x40, 0x1E, 0x9A, 0x4A, 0x17, 0xAF,
|
||||
0xB2, 0x17, 0x2E, 0xC9, 0x46, 0x6E, 0x1A, 0xEE, 0x70, 0xEC, 0x85, 0x72, 0xA1, 0x4C, 0x23, 0x3E,
|
||||
0xE3, 0x54, 0x58, 0x2B, 0xCB, 0x93, 0xF8, 0x69, 0xD4, 0x29, 0x74, 0x4D, 0xE5, 0x72, 0x6A, 0x26,
|
||||
0x4E, 0xFD, 0x28, 0xFC, 0xD3, 0x74, 0x8A, 0x83, 0xF3, 0xCA, 0x92, 0x84, 0xE7, 0x4E, 0x10, 0xC2,
|
||||
0x05, 0x62, 0xC7, 0xBE, 0x99, 0x73, 0xED, 0x90, 0xB5, 0x6F, 0xDA, 0x64, 0x71, 0x2D, 0x99, 0x39,
|
||||
0x29, 0xDB, 0x22, 0x2B, 0x97, 0xB6, 0x37, 0x0E, 0x9A, 0x03, 0x65, 0xCC, 0xF7, 0xD0, 0x9A, 0xB7,
|
||||
0x68, 0xCE, 0x07, 0x3E, 0x15, 0x40, 0x3C, 0xCE, 0x8C, 0x63, 0x16, 0x72, 0xB5, 0x74, 0x84, 0xF4,
|
||||
0xA1, 0xE7, 0x53, 0x85, 0xFB, 0x72, 0xDD, 0x75, 0x90, 0x39, 0xB2, 0x3D, 0xC3, 0x08, 0x2C, 0xD5,
|
||||
0x01, 0x08, 0x27, 0x75, 0x86, 0xB9, 0xBB, 0x9B, 0xDF, 0xEA, 0x49, 0xDE, 0x46, 0xCB, 0x83, 0x45
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef XMRIG_NO_CN_GPU
|
||||
// "cn/gpu"
|
||||
const static uint8_t test_output_gpu[160] = {
|
||||
0xE5, 0x5C, 0xB2, 0x3E, 0x51, 0x64, 0x9A, 0x59, 0xB1, 0x27, 0xB9, 0x6B, 0x51, 0x5F, 0x2B, 0xF7,
|
||||
0xBF, 0xEA, 0x19, 0x97, 0x41, 0xA0, 0x21, 0x6C, 0xF8, 0x38, 0xDE, 0xD0, 0x6E, 0xFF, 0x82, 0xDF,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
#endif /* XMRIG_CRYPTONIGHT_TEST_H */
|
||||
1481
src/crypto/cn/CryptoNight_x86.h
Normal file
1481
src/crypto/cn/CryptoNight_x86.h
Normal file
File diff suppressed because it is too large
Load Diff
1497
src/crypto/cn/SSE2NEON.h
Normal file
1497
src/crypto/cn/SSE2NEON.h
Normal file
File diff suppressed because it is too large
Load Diff
281
src/crypto/cn/asm/CryptonightR_soft_aes_template.inc
Normal file
281
src/crypto/cn/asm/CryptonightR_soft_aes_template.inc
Normal file
@@ -0,0 +1,281 @@
|
||||
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1)
|
||||
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
|
||||
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2)
|
||||
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3)
|
||||
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end)
|
||||
|
||||
/* CryptonightR_soft_aes_template_part1: entry point and prologue of the soft-AES template. */
/* On entry rcx points at a slot holding the context pointer ("ctx" below); ctx is read at fixed offsets. */
/* Saves rbx, rbp, rsi, rdi, r12-r15 and xmm6-xmm13 (restored in part3), seeds the working registers, */
/* sets the iteration counter to 524288 and falls through into the mainloop label. */
/* NOTE(review): the part1/mainloop/part2/part3/end labels look like cut points for a code generator */
/* that stitches these sections together - confirm against the code that consumes these symbols. */
ALIGN(64)
|
||||
FN_PREFIX(CryptonightR_soft_aes_template_part1):
|
||||
/* rcx = ctx (the pointer stored at the address passed in rcx) */
mov rcx, [rcx]
|
||||
|
||||
/* keep ctx in the slot just above the return address; part2 reloads it as [rsp+304] */
mov QWORD PTR [rsp+8], rcx
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
/* 8 pushes (64 bytes) + 232 bytes of frame: the return address is now at rsp+296 */
sub rsp, 232
|
||||
|
||||
/* copy the four dwords at ctx+96..ctx+108 to [rsp+144..156]; they are reloaded into */
/* ebx/esi/edi/ebp at the end of the mainloop section and written back at the start of part2 */
mov eax, [rcx+96]
|
||||
mov ebx, [rcx+100]
|
||||
mov esi, [rcx+104]
|
||||
mov edx, [rcx+108]
|
||||
mov [rsp+144], eax
|
||||
mov [rsp+148], ebx
|
||||
mov [rsp+152], esi
|
||||
mov [rsp+156], edx
|
||||
|
||||
/* seed the state by XORing ctx qwords (offsets in bytes): */
/*   r8 = [0]^[32], r9 = [8]^[40], xmm4 = {[16]^[48], [24]^[56]}, xmm5 = {[64]^[80], [72]^[88]} */
/* r10 = ctx; r11 = qword at ctx+224, the base of every memory access in the loop */
/* (presumably the scratchpad pointer - confirm) */
mov rax, QWORD PTR [rcx+48]
|
||||
mov r10, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+40]
|
||||
xor r9, QWORD PTR [rcx+8]
|
||||
movq xmm4, rax
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r11, QWORD PTR [rcx+224]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r10+72]
|
||||
mov rax, QWORD PTR [r10+80]
|
||||
movq xmm0, rdx
|
||||
xor rax, QWORD PTR [r10+64]
|
||||
|
||||
/* spill xmm6-xmm13 into the frame at [rsp+16..143] */
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+48], xmm8
|
||||
movaps XMMWORD PTR [rsp+64], xmm9
|
||||
movaps XMMWORD PTR [rsp+80], xmm10
|
||||
movaps XMMWORD PTR [rsp+96], xmm11
|
||||
movaps XMMWORD PTR [rsp+112], xmm12
|
||||
movaps XMMWORD PTR [rsp+128], xmm13
|
||||
|
||||
movq xmm5, rax
|
||||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
/* 2097136 = 0x1FFFF0: keep a 16-byte-aligned offset below 2 MiB */
and eax, 2097136
|
||||
movq xmm10, QWORD PTR [r10+96]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
xorps xmm9, xmm9
|
||||
/* [rsp+320] / [rsp+328] lie 24 / 32 bytes above the return address; they carry r9 and the */
/* current offset across iterations */
mov QWORD PTR [rsp+328], rax
|
||||
/* xmm12 keeps the base pointer because r11 is reused as scratch inside the mainloop */
movq xmm12, r11
|
||||
mov QWORD PTR [rsp+320], r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm13, rcx
|
||||
/* loop counter: 524288 (0x80000) iterations */
mov r12d, 524288
|
||||
|
||||
/* CryptonightR_soft_aes_template_mainloop: first half of one iteration. */
/* Entry state: rax = masked offset (also at [rsp+328]), r8/r9 = 64-bit state halves, r10 = ctx, */
/* r11 = base pointer, r12d = remaining iterations, xmm4/xmm5 = carried 128-bit values. */
/* Runs a table-driven round on the 16 bytes at base+offset, shuffles the three neighbouring */
/* 16-byte chunks, derives the second offset and finally loads a register image for whatever */
/* code follows this section. */
/* NOTE(review): this section ends with esp holding data (the real stack pointer is parked in r10), */
/* so code placed between here and part2 must not touch the stack or r10 - confirm with the generator. */
ALIGN(64)
|
||||
FN_PREFIX(CryptonightR_soft_aes_template_mainloop):
|
||||
/* park the loop counter in xmm11; r12 becomes the lookup-table pointer (qword at ctx+272) */
movd xmm11, r12d
|
||||
mov r12, QWORD PTR [r10+272]
|
||||
/* r13 = address of the current 16-byte line; its four dwords go to esi, r10d, r14d, ebp */
lea r13, QWORD PTR [rax+r11]
|
||||
mov esi, DWORD PTR [r13]
|
||||
movq xmm0, r9
|
||||
mov r10d, DWORD PTR [r13+4]
|
||||
movq xmm7, r8
|
||||
mov ebp, DWORD PTR [r13+12]
|
||||
mov r14d, DWORD PTR [r13+8]
|
||||
mov rdx, QWORD PTR [rsp+328]
|
||||
/* table lookups: byte 0 of each dword indexes the table at r12+0 and byte 1 the table at r12+1024; */
/* after "add r12, 2048" byte 2 indexes r12+0 and byte 3 (index+256) r12+1024, i.e. four */
/* consecutive 256-entry dword tables are used in total */
movzx ecx, sil
|
||||
shr esi, 8
|
||||
/* xmm7 = {r8, r9} */
punpcklqdq xmm7, xmm0
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
movd xmm1, r11d
|
||||
add ebp, 256
|
||||
/* r11 was used as scratch above; restore the base pointer from xmm12 */
movq r11, xmm12
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
/* rcx / rax / rdx = offset^16 / offset^32 / offset^48; xmm6 collects the four combined table words */
mov rcx, rdx
|
||||
xor eax, r15d
|
||||
punpckldq xmm2, xmm1
|
||||
xor rcx, 16
|
||||
movd xmm6, eax
|
||||
mov rax, rdx
|
||||
punpckldq xmm6, xmm0
|
||||
xor rax, 32
|
||||
punpckldq xmm6, xmm2
|
||||
xor rdx, 48
|
||||
/* xmm6 ^= xmm7 and all three neighbour chunks; the chunks are written back rotated: */
/*   [^16] = old[^48] + xmm5, [^32] = old[^16] + xmm4, [^48] = old[^32] + xmm7 (64-bit lane adds) */
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm2
|
||||
pxor xmm6, xmm7
|
||||
paddq xmm2, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||
pxor xmm6, xmm1
|
||||
pxor xmm6, xmm0
|
||||
paddq xmm0, xmm5
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm0
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movq rcx, xmm13
|
||||
paddq xmm1, xmm7
|
||||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
/* rdi = low qword of xmm6; r10 = second offset (rdi & 0x1FFFF0); current line = xmm6 ^ xmm4 */
movq rdi, xmm6
|
||||
mov r10, rdi
|
||||
and r10d, 2097136
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [r13], xmm0
|
||||
|
||||
/* rbx = (d0 + d1) | ((d2 + d3) << 32), d0..d3 being the dwords at [rsp+144..156] (32-bit sums) */
mov ebx, [rsp+144]
|
||||
mov ebp, [rsp+152]
|
||||
add ebx, [rsp+148]
|
||||
add ebp, [rsp+156]
|
||||
shl rbp, 32
|
||||
or rbx, rbp
|
||||
|
||||
/* rbx ^= low qword of the second line; r14 = address of that line; rbp = its high qword */
xor rbx, QWORD PTR [r10+r11]
|
||||
lea r14, QWORD PTR [r10+r11]
|
||||
mov rbp, QWORD PTR [r14+8]
|
||||
|
||||
/* spill rbx, rdi, rbp, r10: part2 reloads them from the same slots */
mov [rsp+160], rbx
|
||||
mov [rsp+168], rdi
|
||||
mov [rsp+176], rbp
|
||||
mov [rsp+184], r10
|
||||
/* park the stack pointer in r10 because esp is loaded with data below */
mov r10, rsp
|
||||
|
||||
/* register image handed to the following code: ebx/esi/edi/ebp = d0..d3, */
/* esp/r15d = dwords 0 and 2 of xmm7, eax = dword 0 of xmm4, edx/r9d = dwords 0 and 2 of xmm5 */
mov ebx, [rsp+144]
|
||||
mov esi, [rsp+148]
|
||||
mov edi, [rsp+152]
|
||||
mov ebp, [rsp+156]
|
||||
|
||||
movd esp, xmm7
|
||||
movaps xmm0, xmm7
|
||||
psrldq xmm0, 8
|
||||
movd r15d, xmm0
|
||||
movd eax, xmm4
|
||||
movd edx, xmm5
|
||||
movaps xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movd r9d, xmm0
|
||||
|
||||
/* CryptonightR_soft_aes_template_part2: second half of one iteration. */
/* Restores the stack pointer from r10, writes ebx/esi/edi/ebp back to [rsp+144..156], folds them */
/* into the 64-bit state, performs the 64x64->128-bit multiply and the second chunk shuffle, stores */
/* the result line and jumps back to the mainloop label while the counter is non-zero. */
FN_PREFIX(CryptonightR_soft_aes_template_part2):
|
||||
mov rsp, r10
|
||||
mov [rsp+144], ebx
|
||||
mov [rsp+148], esi
|
||||
mov [rsp+152], edi
|
||||
mov [rsp+156], ebp
|
||||
|
||||
/* r8 ^= edi | (ebp << 32); "mov edi, edi" zero-extends edi into rdi */
mov edi, edi
|
||||
shl rbp, 32
|
||||
or rbp, rdi
|
||||
xor r8, rbp
|
||||
|
||||
/* the r9 value kept at [rsp+320] ^= ebx | (esi << 32) */
mov ebx, ebx
|
||||
shl rsi, 32
|
||||
or rsi, rbx
|
||||
xor QWORD PTR [rsp+320], rsi
|
||||
|
||||
/* reload the values spilled at the end of the mainloop section */
mov rbx, [rsp+160]
|
||||
mov rdi, [rsp+168]
|
||||
mov rbp, [rsp+176]
|
||||
mov r10, [rsp+184]
|
||||
|
||||
/* r9 / rcx / r10 = second offset ^16 / ^32 / ^48; rdx:rax = rbx * rdi (unsigned 128-bit product) */
mov r9, r10
|
||||
xor r9, 16
|
||||
mov rcx, r10
|
||||
xor rcx, 32
|
||||
xor r10, 48
|
||||
mov rax, rbx
|
||||
mul rdi
|
||||
/* chunk shuffle around the second offset: xmm6 absorbs all three chunks, which are written back as */
/*   [^16] = old[^48] + xmm5, [^32] = old[^16] + xmm4, [^48] = old[^32] + xmm7 */
/* interleaved with: r8 += high half of the product, r9 (from [rsp+320]) += low half */
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm2
|
||||
pxor xmm6, xmm1
|
||||
paddq xmm1, xmm7
|
||||
add r8, rdx
|
||||
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm6, xmm0
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
/* rotate the carried vectors for the next iteration: xmm5 = old xmm4, xmm4 = xmm6 */
movdqa xmm5, xmm4
|
||||
mov r9, QWORD PTR [rsp+320]
|
||||
movdqa xmm4, xmm6
|
||||
add r9, rax
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
/* r10 = ctx again (slot written by part1 as [rsp+8]); r12d = loop counter from xmm11 */
mov r10, QWORD PTR [rsp+304]
|
||||
movd r12d, xmm11
|
||||
/* store {r8, r9} to the second line (r14), then r8 ^= rbx, r9 ^= rbp, next offset = r8 & 0x1FFFF0 */
mov QWORD PTR [r14], r8
|
||||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
and eax, 2097136
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+320], r9
|
||||
mov QWORD PTR [rsp+328], rax
|
||||
sub r12d, 1
|
||||
jne FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
|
||||
|
||||
/* CryptonightR_soft_aes_template_part3: epilogue. Reloads xmm6-xmm13 from [rsp+16..143], releases */
/* the 232-byte frame, pops the registers pushed in part1 in reverse order and returns. */
/* The _end label that follows only marks the end of the template. */
FN_PREFIX(CryptonightR_soft_aes_template_part3):
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||
|
||||
add rsp, 232
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
ret
|
||||
FN_PREFIX(CryptonightR_soft_aes_template_end):
|
||||
281
src/crypto/cn/asm/CryptonightR_soft_aes_template_win.inc
Normal file
281
src/crypto/cn/asm/CryptonightR_soft_aes_template_win.inc
Normal file
@@ -0,0 +1,281 @@
|
||||
PUBLIC CryptonightR_soft_aes_template_part1
|
||||
PUBLIC CryptonightR_soft_aes_template_mainloop
|
||||
PUBLIC CryptonightR_soft_aes_template_part2
|
||||
PUBLIC CryptonightR_soft_aes_template_part3
|
||||
PUBLIC CryptonightR_soft_aes_template_end
|
||||
|
||||
; CryptonightR_soft_aes_template_part1: entry point and prologue of the soft-AES template.
; On entry rcx points at a slot holding the context pointer ("ctx" below); ctx is read at fixed offsets.
; Saves rbx, rbp, rsi, rdi, r12-r15 and xmm6-xmm13 (restored in part3), seeds the working registers,
; sets the iteration counter to 524288 and falls through into the mainloop label.
; NOTE(review): the part1/mainloop/part2/part3/end labels look like cut points for a code generator
; that stitches these sections together - confirm against the code that consumes these symbols.
ALIGN(64)
|
||||
CryptonightR_soft_aes_template_part1:
|
||||
; rcx = ctx (the pointer stored at the address passed in rcx)
mov rcx, [rcx]
|
||||
|
||||
; keep ctx in the slot just above the return address; part2 reloads it as [rsp+304]
mov QWORD PTR [rsp+8], rcx
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
; 8 pushes (64 bytes) + 232 bytes of frame: the return address is now at rsp+296
sub rsp, 232
|
||||
|
||||
; copy the four dwords at ctx+96..ctx+108 to [rsp+144..156]; they are reloaded into
; ebx/esi/edi/ebp at the end of the mainloop section and written back at the start of part2
mov eax, [rcx+96]
|
||||
mov ebx, [rcx+100]
|
||||
mov esi, [rcx+104]
|
||||
mov edx, [rcx+108]
|
||||
mov [rsp+144], eax
|
||||
mov [rsp+148], ebx
|
||||
mov [rsp+152], esi
|
||||
mov [rsp+156], edx
|
||||
|
||||
; seed the state by XORing ctx qwords (offsets in bytes):
;   r8 = [0]^[32], r9 = [8]^[40], xmm4 = {[16]^[48], [24]^[56]}, xmm5 = {[64]^[80], [72]^[88]}
; r10 = ctx; r11 = qword at ctx+224, the base of every memory access in the loop
; (presumably the scratchpad pointer - confirm)
mov rax, QWORD PTR [rcx+48]
|
||||
mov r10, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+40]
|
||||
xor r9, QWORD PTR [rcx+8]
|
||||
movq xmm4, rax
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r11, QWORD PTR [rcx+224]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r10+72]
|
||||
mov rax, QWORD PTR [r10+80]
|
||||
movq xmm0, rdx
|
||||
xor rax, QWORD PTR [r10+64]
|
||||
|
||||
; spill xmm6-xmm13 into the frame at [rsp+16..143]
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+48], xmm8
|
||||
movaps XMMWORD PTR [rsp+64], xmm9
|
||||
movaps XMMWORD PTR [rsp+80], xmm10
|
||||
movaps XMMWORD PTR [rsp+96], xmm11
|
||||
movaps XMMWORD PTR [rsp+112], xmm12
|
||||
movaps XMMWORD PTR [rsp+128], xmm13
|
||||
|
||||
movq xmm5, rax
|
||||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
; 2097136 = 0x1FFFF0: keep a 16-byte-aligned offset below 2 MiB
and eax, 2097136
|
||||
movq xmm10, QWORD PTR [r10+96]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
xorps xmm9, xmm9
|
||||
; [rsp+320] / [rsp+328] lie 24 / 32 bytes above the return address; they carry r9 and the
; current offset across iterations
mov QWORD PTR [rsp+328], rax
|
||||
; xmm12 keeps the base pointer because r11 is reused as scratch inside the mainloop
movq xmm12, r11
|
||||
mov QWORD PTR [rsp+320], r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm13, rcx
|
||||
; loop counter: 524288 (0x80000) iterations
mov r12d, 524288
|
||||
|
||||
; CryptonightR_soft_aes_template_mainloop: first half of one iteration.
; Entry state: rax = masked offset (also at [rsp+328]), r8/r9 = 64-bit state halves, r10 = ctx,
; r11 = base pointer, r12d = remaining iterations, xmm4/xmm5 = carried 128-bit values.
; Runs a table-driven round on the 16 bytes at base+offset, shuffles the three neighbouring
; 16-byte chunks, derives the second offset and finally loads a register image for whatever
; code follows this section.
; NOTE(review): this section ends with esp holding data (the real stack pointer is parked in r10),
; so code placed between here and part2 must not touch the stack or r10 - confirm with the generator.
ALIGN(64)
|
||||
CryptonightR_soft_aes_template_mainloop:
|
||||
; park the loop counter in xmm11; r12 becomes the lookup-table pointer (qword at ctx+272)
movd xmm11, r12d
|
||||
mov r12, QWORD PTR [r10+272]
|
||||
; r13 = address of the current 16-byte line; its four dwords go to esi, r10d, r14d, ebp
lea r13, QWORD PTR [rax+r11]
|
||||
mov esi, DWORD PTR [r13]
|
||||
movq xmm0, r9
|
||||
mov r10d, DWORD PTR [r13+4]
|
||||
movq xmm7, r8
|
||||
mov ebp, DWORD PTR [r13+12]
|
||||
mov r14d, DWORD PTR [r13+8]
|
||||
mov rdx, QWORD PTR [rsp+328]
|
||||
; table lookups: byte 0 of each dword indexes the table at r12+0 and byte 1 the table at r12+1024;
; after "add r12, 2048" byte 2 indexes r12+0 and byte 3 (index+256) r12+1024, i.e. four
; consecutive 256-entry dword tables are used in total
movzx ecx, sil
|
||||
shr esi, 8
|
||||
; xmm7 = {r8, r9}
punpcklqdq xmm7, xmm0
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
movd xmm1, r11d
|
||||
add ebp, 256
|
||||
; r11 was used as scratch above; restore the base pointer from xmm12
movq r11, xmm12
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
; rcx / rax / rdx = offset^16 / offset^32 / offset^48; xmm6 collects the four combined table words
mov rcx, rdx
|
||||
xor eax, r15d
|
||||
punpckldq xmm2, xmm1
|
||||
xor rcx, 16
|
||||
movd xmm6, eax
|
||||
mov rax, rdx
|
||||
punpckldq xmm6, xmm0
|
||||
xor rax, 32
|
||||
punpckldq xmm6, xmm2
|
||||
xor rdx, 48
|
||||
; xmm6 ^= xmm7 and all three neighbour chunks; the chunks are written back rotated:
;   [^16] = old[^48] + xmm5, [^32] = old[^16] + xmm4, [^48] = old[^32] + xmm7 (64-bit lane adds)
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm2
|
||||
pxor xmm6, xmm7
|
||||
paddq xmm2, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||
pxor xmm6, xmm1
|
||||
pxor xmm6, xmm0
|
||||
paddq xmm0, xmm5
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm0
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movq rcx, xmm13
|
||||
paddq xmm1, xmm7
|
||||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
; rdi = low qword of xmm6; r10 = second offset (rdi & 0x1FFFF0); current line = xmm6 ^ xmm4
movq rdi, xmm6
|
||||
mov r10, rdi
|
||||
and r10d, 2097136
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [r13], xmm0
|
||||
|
||||
; rbx = (d0 + d1) | ((d2 + d3) << 32), d0..d3 being the dwords at [rsp+144..156] (32-bit sums)
mov ebx, [rsp+144]
|
||||
mov ebp, [rsp+152]
|
||||
add ebx, [rsp+148]
|
||||
add ebp, [rsp+156]
|
||||
shl rbp, 32
|
||||
or rbx, rbp
|
||||
|
||||
; rbx ^= low qword of the second line; r14 = address of that line; rbp = its high qword
xor rbx, QWORD PTR [r10+r11]
|
||||
lea r14, QWORD PTR [r10+r11]
|
||||
mov rbp, QWORD PTR [r14+8]
|
||||
|
||||
; spill rbx, rdi, rbp, r10: part2 reloads them from the same slots
mov [rsp+160], rbx
|
||||
mov [rsp+168], rdi
|
||||
mov [rsp+176], rbp
|
||||
mov [rsp+184], r10
|
||||
; park the stack pointer in r10 because esp is loaded with data below
mov r10, rsp
|
||||
|
||||
; register image handed to the following code: ebx/esi/edi/ebp = d0..d3,
; esp/r15d = dwords 0 and 2 of xmm7, eax = dword 0 of xmm4, edx/r9d = dwords 0 and 2 of xmm5
mov ebx, [rsp+144]
|
||||
mov esi, [rsp+148]
|
||||
mov edi, [rsp+152]
|
||||
mov ebp, [rsp+156]
|
||||
|
||||
movd esp, xmm7
|
||||
movaps xmm0, xmm7
|
||||
psrldq xmm0, 8
|
||||
movd r15d, xmm0
|
||||
movd eax, xmm4
|
||||
movd edx, xmm5
|
||||
movaps xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movd r9d, xmm0
|
||||
|
||||
; CryptonightR_soft_aes_template_part2: second half of one iteration.
; Restores the stack pointer from r10, writes ebx/esi/edi/ebp back to [rsp+144..156], folds them
; into the 64-bit state, performs the 64x64->128-bit multiply and the second chunk shuffle, stores
; the result line and jumps back to the mainloop label while the counter is non-zero.
CryptonightR_soft_aes_template_part2:
|
||||
mov rsp, r10
|
||||
mov [rsp+144], ebx
|
||||
mov [rsp+148], esi
|
||||
mov [rsp+152], edi
|
||||
mov [rsp+156], ebp
|
||||
|
||||
; r8 ^= edi | (ebp << 32); "mov edi, edi" zero-extends edi into rdi
mov edi, edi
|
||||
shl rbp, 32
|
||||
or rbp, rdi
|
||||
xor r8, rbp
|
||||
|
||||
; the r9 value kept at [rsp+320] ^= ebx | (esi << 32)
mov ebx, ebx
|
||||
shl rsi, 32
|
||||
or rsi, rbx
|
||||
xor QWORD PTR [rsp+320], rsi
|
||||
|
||||
; reload the values spilled at the end of the mainloop section
mov rbx, [rsp+160]
|
||||
mov rdi, [rsp+168]
|
||||
mov rbp, [rsp+176]
|
||||
mov r10, [rsp+184]
|
||||
|
||||
; r9 / rcx / r10 = second offset ^16 / ^32 / ^48; rdx:rax = rbx * rdi (unsigned 128-bit product)
mov r9, r10
|
||||
xor r9, 16
|
||||
mov rcx, r10
|
||||
xor rcx, 32
|
||||
xor r10, 48
|
||||
mov rax, rbx
|
||||
mul rdi
|
||||
; chunk shuffle around the second offset: xmm6 absorbs all three chunks, which are written back as
;   [^16] = old[^48] + xmm5, [^32] = old[^16] + xmm4, [^48] = old[^32] + xmm7
; interleaved with: r8 += high half of the product, r9 (from [rsp+320]) += low half
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm2
|
||||
pxor xmm6, xmm1
|
||||
paddq xmm1, xmm7
|
||||
add r8, rdx
|
||||
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm6, xmm0
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
; rotate the carried vectors for the next iteration: xmm5 = old xmm4, xmm4 = xmm6
movdqa xmm5, xmm4
|
||||
mov r9, QWORD PTR [rsp+320]
|
||||
movdqa xmm4, xmm6
|
||||
add r9, rax
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
; r10 = ctx again (slot written by part1 as [rsp+8]); r12d = loop counter from xmm11
mov r10, QWORD PTR [rsp+304]
|
||||
movd r12d, xmm11
|
||||
; store {r8, r9} to the second line (r14), then r8 ^= rbx, r9 ^= rbp, next offset = r8 & 0x1FFFF0
mov QWORD PTR [r14], r8
|
||||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
and eax, 2097136
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+320], r9
|
||||
mov QWORD PTR [rsp+328], rax
|
||||
sub r12d, 1
|
||||
jne CryptonightR_soft_aes_template_mainloop
|
||||
|
||||
; CryptonightR_soft_aes_template_part3: epilogue. Reloads xmm6-xmm13 from [rsp+16..143], releases
; the 232-byte frame, pops the registers pushed in part1 in reverse order and returns.
; The _end label that follows only marks the end of the template.
CryptonightR_soft_aes_template_part3:
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||
|
||||
add rsp, 232
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
ret
|
||||
CryptonightR_soft_aes_template_end:
|
||||
1595
src/crypto/cn/asm/CryptonightR_template.S
Normal file
1595
src/crypto/cn/asm/CryptonightR_template.S
Normal file
File diff suppressed because it is too large
Load Diff
1585
src/crypto/cn/asm/CryptonightR_template.asm
Normal file
1585
src/crypto/cn/asm/CryptonightR_template.asm
Normal file
File diff suppressed because it is too large
Load Diff
1087
src/crypto/cn/asm/CryptonightR_template.h
Normal file
1087
src/crypto/cn/asm/CryptonightR_template.h
Normal file
File diff suppressed because it is too large
Load Diff
536
src/crypto/cn/asm/CryptonightR_template.inc
Normal file
536
src/crypto/cn/asm/CryptonightR_template.inc
Normal file
@@ -0,0 +1,536 @@
|
||||
PUBLIC FN_PREFIX(CryptonightR_template_part1)
|
||||
PUBLIC FN_PREFIX(CryptonightR_template_mainloop)
|
||||
PUBLIC FN_PREFIX(CryptonightR_template_part2)
|
||||
PUBLIC FN_PREFIX(CryptonightR_template_part3)
|
||||
PUBLIC FN_PREFIX(CryptonightR_template_end)
|
||||
PUBLIC FN_PREFIX(CryptonightR_template_double_part1)
|
||||
PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop)
|
||||
PUBLIC FN_PREFIX(CryptonightR_template_double_part2)
|
||||
PUBLIC FN_PREFIX(CryptonightR_template_double_part3)
|
||||
PUBLIC FN_PREFIX(CryptonightR_template_double_part4)
|
||||
PUBLIC FN_PREFIX(CryptonightR_template_double_end)
|
||||
|
||||
/* CryptonightR_template_part1: entry point and prologue of the single-context template (uses aesenc). */
/* On entry rcx points at a slot holding the context pointer ("ctx" below). */
/* rbx, rbp and rsi are stored into the slots above the return address instead of being pushed; */
/* r10-r15 and rdi are pushed and xmm6-xmm9 are spilled into a 64-byte frame. */
/* From "mov rsp, r8" until part3 the stack pointer lives in xmm9 and rsp is an ordinary data register, */
/* so nothing in between can use the stack. */
ALIGN(64)
|
||||
FN_PREFIX(CryptonightR_template_part1):
|
||||
/* rcx = ctx (the pointer stored at the address passed in rcx) */
mov rcx, [rcx]
|
||||
|
||||
/* part3 reloads these three as [rsp+136], [rsp+144], [rsp+152] */
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rdi
|
||||
/* 7 pushes (56 bytes) + 64 = 120 bytes below the entry rsp */
sub rsp, 64
|
||||
/* seed the state by XORing ctx qwords (offsets in bytes): */
/*   r8 = [0]^[32], r15 = [8]^[40], xmm6 = {[16]^[48], [24]^[56]}, xmm7 = {[64]^[80], [72]^[88]} */
/* rdx = ctx; r11 = qword at ctx+224, the base of every memory access in the loop */
/* (presumably the scratchpad pointer - confirm) */
mov r12, rcx
|
||||
mov r8, QWORD PTR [r12+32]
|
||||
mov rdx, r12
|
||||
xor r8, QWORD PTR [r12]
|
||||
mov r15, QWORD PTR [r12+40]
|
||||
mov r9, r8
|
||||
xor r15, QWORD PTR [r12+8]
|
||||
mov r11, QWORD PTR [r12+224]
|
||||
mov r12, QWORD PTR [r12+56]
|
||||
xor r12, QWORD PTR [rdx+24]
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm0, r12
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
movaps XMMWORD PTR [rsp], xmm9
|
||||
mov r12, QWORD PTR [rdx+88]
|
||||
xor r12, QWORD PTR [rdx+72]
|
||||
movq xmm6, rax
|
||||
mov rax, QWORD PTR [rdx+80]
|
||||
xor rax, QWORD PTR [rdx+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
/* 2097136 = 0x1FFFF0: keep a 16-byte-aligned offset below 2 MiB */
and r9d, 2097136
|
||||
movq xmm0, r12
|
||||
movq xmm7, rax
|
||||
punpcklqdq xmm7, xmm0
|
||||
mov r10d, r9d
|
||||
/* park the stack pointer in xmm9; rsp now carries the r8 state value */
movq xmm9, rsp
|
||||
mov rsp, r8
|
||||
/* loop counter: 524288 (0x80000) iterations */
mov r8d, 524288
|
||||
|
||||
/* ebx/esi/edi/ebp = the four dwords at ctx+96..ctx+108; they stay in these registers across the loop */
mov ebx, [rdx+96]
|
||||
mov esi, [rdx+100]
|
||||
mov edi, [rdx+104]
|
||||
mov ebp, [rdx+108]
|
||||
|
||||
/* CryptonightR_template_mainloop: first half of one iteration. */
/* Entry state: r9d = masked offset, rsp/r15 = 64-bit state halves (rsp is data here), r11 = base pointer, */
/* r8d = remaining iterations, xmm6/xmm7 = carried 128-bit values, ebx/esi/edi/ebp = four dwords. */
/* Does one aesenc round on the line at base+offset, shuffles the three neighbouring 16-byte chunks, */
/* derives the second offset (r10d) and prepares r13/r14 plus eax/edx/r9d for the code that follows. */
ALIGN(64)
|
||||
FN_PREFIX(CryptonightR_template_mainloop):
|
||||
/* xmm5 = current line (aligned load); xmm4 = {rsp, r15} is used as the round key; rdx = line address */
movdqa xmm5, XMMWORD PTR [r9+r11]
|
||||
movq xmm0, r15
|
||||
movq xmm4, rsp
|
||||
punpcklqdq xmm4, xmm0
|
||||
lea rdx, QWORD PTR [r9+r11]
|
||||
|
||||
aesenc xmm5, xmm4
|
||||
|
||||
/* r13d / eax / r9d = offset^16 / offset^32 / offset^48 */
mov r13d, r9d
|
||||
mov eax, r9d
|
||||
xor r9d, 48
|
||||
xor r13d, 16
|
||||
xor eax, 32
|
||||
/* xmm5 ^= all three neighbour chunks (xmm3 keeps an unmodified copy of the ^48 chunk) */
movdqu xmm0, XMMWORD PTR [r9+r11]
|
||||
movaps xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [r13+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
pxor xmm0, xmm2
|
||||
pxor xmm5, xmm1
|
||||
pxor xmm5, xmm0
|
||||
|
||||
/* r12 = low qword of xmm5; r10d = second offset (low dword & 0x1FFFF0) */
movq r12, xmm5
|
||||
movd r10d, xmm5
|
||||
and r10d, 2097136
|
||||
|
||||
/* write the chunks back rotated: [^16] = old[^48] + xmm7, [^32] = old[^16] + xmm6, [^48] = old[^32] + xmm4 */
paddq xmm3, xmm7
|
||||
paddq xmm2, xmm6
|
||||
paddq xmm1, xmm4
|
||||
movdqu XMMWORD PTR [r13+r11], xmm3
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movdqu XMMWORD PTR [r9+r11], xmm1
|
||||
|
||||
/* current line = xmm5 ^ xmm6 */
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
|
||||
/* r13 = (ebx + esi) | ((edi + ebp) << 32), 32-bit sums */
lea r13d, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or r13, rdx
|
||||
|
||||
/* extra inputs for the following code: eax = dword 0 of xmm6, edx / r9d = dwords 0 and 2 of xmm7 */
movd eax, xmm6
|
||||
movd edx, xmm7
|
||||
pextrd r9d, xmm7, 2
|
||||
|
||||
/* r13 ^= low qword of the second line; r14 = its high qword */
xor r13, QWORD PTR [r10+r11]
|
||||
mov r14, QWORD PTR [r10+r11+8]
|
||||
|
||||
/* CryptonightR_template_part2: second half of one iteration. */
/* Folds ebx/esi/edi/ebp into the state, does the 64x64->128-bit multiply, shuffles the chunks around */
/* the second offset, stores the result line and loops while r8d is non-zero. */
/* rsp is still a data register throughout this section. */
FN_PREFIX(CryptonightR_template_part2):
|
||||
/* rcx = address of the second line */
lea rcx, [r10+r11]
|
||||
|
||||
/* rsp ^= edi | (ebp << 32) */
mov eax, edi
|
||||
mov edx, ebp
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor rsp, rax
|
||||
|
||||
/* r15 ^= ebx | (esi << 32) */
mov eax, ebx
|
||||
mov edx, esi
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r15, rax
|
||||
|
||||
/* rdx:rax = r13 * r12 (unsigned); r15 += low half, rsp += high half */
mov rax, r13
|
||||
mul r12
|
||||
add r15, rax
|
||||
add rsp, rdx
|
||||
|
||||
/* r9d / r12d / r10d = second offset ^16 / ^32 / ^48 */
mov r9d, r10d
|
||||
mov r12d, r10d
|
||||
xor r9d, 16
|
||||
xor r12d, 32
|
||||
xor r10d, 48
|
||||
/* xmm5 ^= all three chunks; write-back: [^16] = old[^48] + xmm7, [^32] = old[^16] + xmm6, */
/* [^48] = old[^32] + xmm4 */
movdqa xmm1, XMMWORD PTR [r12+r11]
|
||||
movaps xmm3, xmm1
|
||||
movdqa xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqa xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm5, xmm0
|
||||
pxor xmm5, xmm1
|
||||
paddq xmm3, xmm4
|
||||
paddq xmm2, xmm6
|
||||
paddq xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
movdqu XMMWORD PTR [r12+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm3
|
||||
|
||||
/* rotate the carried vectors (xmm7 = old xmm6, xmm6 = xmm5); store {rsp, r15} to the second line; */
/* then rsp ^= r13, r15 ^= r14 and the next offset = esp & 0x1FFFF0 */
movdqa xmm7, xmm6
|
||||
mov QWORD PTR [rcx], rsp
|
||||
xor rsp, r13
|
||||
mov r9d, esp
|
||||
mov QWORD PTR [rcx+8], r15
|
||||
and r9d, 2097136
|
||||
xor r15, r14
|
||||
movdqa xmm6, xmm5
|
||||
dec r8d
|
||||
jnz FN_PREFIX(CryptonightR_template_mainloop)
|
||||
|
||||
/* CryptonightR_template_part3: epilogue. Recovers the real stack pointer from xmm9, reloads */
/* rbx/rbp/rsi from the slots written by part1 (entry rsp+16..32 = rsp+136..152 here), restores */
/* xmm6-xmm9, releases the 64-byte frame, pops the registers pushed in part1 in reverse order and returns. */
/* The _end label that follows only marks the end of the template. */
FN_PREFIX(CryptonightR_template_part3):
|
||||
movq rsp, xmm9
|
||||
|
||||
mov rbx, QWORD PTR [rsp+136]
|
||||
mov rbp, QWORD PTR [rsp+144]
|
||||
mov rsi, QWORD PTR [rsp+152]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+16]
|
||||
movaps xmm9, XMMWORD PTR [rsp]
|
||||
add rsp, 64
|
||||
pop rdi
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
ret 0
|
||||
FN_PREFIX(CryptonightR_template_end):
|
||||
|
||||
/* CryptonightR_template_double_part1: prologue of the two-context ("double") template. */
/* On entry rcx points at two consecutive context pointers: ctx0 = [rcx], ctx1 = [rcx+8]. */
/* Stores rbx above the return address (reloaded as [rsp+400] in part4), pushes rbp, rsi, rdi, r12-r15, */
/* spills xmm6-xmm15 to [rsp+160..319], seeds the state of both contexts and sets r11d = 524288. */
/* State after this section (ctx offsets in bytes): */
/*   ctx0: r14 = [0]^[32], r12 = [8]^[40], ebx = masked offset, rsi = [224] (base, copy in xmm14), */
/*         xmm7 = {[16]^[48], [24]^[56]}, xmm9 = {[64]^[80], [72]^[88]}, dwords [96..111] at [rsp+16..31] */
/*   ctx1: r15, r13, r8d, rdi (copy in xmm15), xmm8, xmm10 likewise, dwords [96..111] at [rsp..15] */
ALIGN(64)
|
||||
FN_PREFIX(CryptonightR_template_double_part1):
|
||||
mov rdx, [rcx+8]
|
||||
mov rcx, [rcx]
|
||||
|
||||
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
/* 7 pushes (56 bytes) + 320 = 376 bytes below the entry rsp, so entry [rsp+24] is now [rsp+400] */
sub rsp, 320
|
||||
mov r14, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r14, QWORD PTR [rcx]
|
||||
mov r12, QWORD PTR [rcx+40]
|
||||
mov ebx, r14d
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
/* 2097136 = 0x1FFFF0: keep a 16-byte-aligned offset below 2 MiB */
and ebx, 2097136
|
||||
xor r12, QWORD PTR [rcx+8]
|
||||
mov rcx, QWORD PTR [rcx+56]
|
||||
xor rcx, QWORD PTR [r8+24]
|
||||
mov rax, QWORD PTR [r8+48]
|
||||
xor rax, QWORD PTR [r8+16]
|
||||
mov r15, QWORD PTR [rdx+32]
|
||||
xor r15, QWORD PTR [rdx]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r8+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
mov r13, QWORD PTR [rdx+40]
|
||||
mov rdi, QWORD PTR [rdx+224]
|
||||
xor r13, QWORD PTR [rdx+8]
|
||||
movaps XMMWORD PTR [rsp+160], xmm6
|
||||
movaps XMMWORD PTR [rsp+176], xmm7
|
||||
movaps XMMWORD PTR [rsp+192], xmm8
|
||||
movaps XMMWORD PTR [rsp+208], xmm9
|
||||
movaps XMMWORD PTR [rsp+224], xmm10
|
||||
movaps XMMWORD PTR [rsp+240], xmm11
|
||||
movaps XMMWORD PTR [rsp+256], xmm12
|
||||
movaps XMMWORD PTR [rsp+272], xmm13
|
||||
movaps XMMWORD PTR [rsp+288], xmm14
|
||||
movaps XMMWORD PTR [rsp+304], xmm15
|
||||
movq xmm7, rax
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
|
||||
/* copy the 16 bytes at ctx1+96 to [rsp] and the 16 bytes at ctx0+96 to [rsp+16] */
movaps xmm1, XMMWORD PTR [rdx+96]
|
||||
movaps xmm2, XMMWORD PTR [r8+96]
|
||||
movaps XMMWORD PTR [rsp], xmm1
|
||||
movaps XMMWORD PTR [rsp+16], xmm2
|
||||
|
||||
mov r8d, r15d
|
||||
punpcklqdq xmm7, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [rdx+56]
|
||||
xor rcx, QWORD PTR [rdx+24]
|
||||
movq xmm9, rax
|
||||
mov QWORD PTR [rsp+128], rsi
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
punpcklqdq xmm9, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [rdx+88]
|
||||
xor rcx, QWORD PTR [rdx+72]
|
||||
movq xmm8, rax
|
||||
mov QWORD PTR [rsp+136], rdi
|
||||
mov rax, QWORD PTR [rdx+80]
|
||||
xor rax, QWORD PTR [rdx+64]
|
||||
punpcklqdq xmm8, xmm0
|
||||
and r8d, 2097136
|
||||
movq xmm0, rcx
|
||||
/* loop counter: 524288 (0x80000) iterations */
mov r11d, 524288
|
||||
movq xmm10, rax
|
||||
punpcklqdq xmm10, xmm0
|
||||
|
||||
/* xmm14 / xmm15 = base pointers of ctx0 / ctx1 (rsi and rdi are reused as scratch in the loop) */
movq xmm14, QWORD PTR [rsp+128]
|
||||
movq xmm15, QWORD PTR [rsp+136]
|
||||
|
||||
/* CryptonightR_template_double_mainloop: first half of one iteration for both contexts. */
/* ctx0: aesenc on the line at rsi+rbx with key xmm3 = {r14, r12}, result in xmm6, carried xmm7/xmm9. */
/* ctx1: aesenc on the line at rdi+r8 with key xmm4 = {r15, r13}, result in xmm5, carried xmm8/xmm10. */
/* Afterwards it prepares the ctx0 multiply operand in r8, saves live registers and hands a register */
/* image built from the ctx0 values to the code that follows this section. */
/* NOTE(review): this section ends with esp holding data (the real stack pointer is parked in xmm0), */
/* so code placed between here and part2 must not touch the stack or xmm0 - confirm with the generator. */
ALIGN(64)
|
||||
FN_PREFIX(CryptonightR_template_double_mainloop):
|
||||
/* --- ctx0: ebx walks offset^16, ^32, ^48 through the XORs with 16, 48, 16; rax ends as offset^16 --- */
/* write-back: [^32] = old[^16] + xmm7, [^48] = old[^32] + xmm3, [^16] = old[^48] + xmm9 */
movdqu xmm6, XMMWORD PTR [rbx+rsi]
|
||||
movq xmm0, r12
|
||||
mov ecx, ebx
|
||||
movq xmm3, r14
|
||||
punpcklqdq xmm3, xmm0
|
||||
xor ebx, 16
|
||||
aesenc xmm6, xmm3
|
||||
movq xmm4, r15
|
||||
movdqu xmm0, XMMWORD PTR [rbx+rsi]
|
||||
pxor xmm6, xmm0
|
||||
xor ebx, 48
|
||||
paddq xmm0, xmm7
|
||||
movdqu xmm1, XMMWORD PTR [rbx+rsi]
|
||||
pxor xmm6, xmm1
|
||||
movdqu XMMWORD PTR [rbx+rsi], xmm0
|
||||
paddq xmm1, xmm3
|
||||
xor ebx, 16
|
||||
mov eax, ebx
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [rbx+rsi]
|
||||
pxor xmm6, xmm0
|
||||
/* rdx = low qword of xmm6 */
movq rdx, xmm6
|
||||
movdqu XMMWORD PTR [rbx+rsi], xmm1
|
||||
paddq xmm0, xmm9
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
/* ctx0 current line = xmm6 ^ xmm7 */
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm0
|
||||
/* rsi stops being the ctx0 base here: esi = ctx0 second offset (edx & 0x1FFFF0); base stays in xmm14 */
mov esi, edx
|
||||
/* --- ctx1: same sequence with r8d as the offset and rdi as the base --- */
movdqu xmm5, XMMWORD PTR [r8+rdi]
|
||||
and esi, 2097136
|
||||
mov ecx, r8d
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm4, xmm0
|
||||
xor r8d, 16
|
||||
aesenc xmm5, xmm4
|
||||
movdqu xmm0, XMMWORD PTR [r8+rdi]
|
||||
pxor xmm5, xmm0
|
||||
xor r8d, 48
|
||||
paddq xmm0, xmm8
|
||||
movdqu xmm1, XMMWORD PTR [r8+rdi]
|
||||
pxor xmm5, xmm1
|
||||
movdqu XMMWORD PTR [r8+rdi], xmm0
|
||||
paddq xmm1, xmm4
|
||||
xor r8d, 16
|
||||
mov eax, r8d
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [r8+rdi]
|
||||
pxor xmm5, xmm0
|
||||
movdqu XMMWORD PTR [r8+rdi], xmm1
|
||||
paddq xmm0, xmm10
|
||||
movdqu XMMWORD PTR [rax+rdi], xmm0
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rcx+rdi], xmm0
|
||||
/* rdi = low qword of xmm5 (ctx1), ebp = its low dword; rcx = ctx0 base */
movq rdi, xmm5
|
||||
movq rcx, xmm14
|
||||
mov ebp, edi
|
||||
/* r8 / r10 = low / high qword of the ctx0 second line, r9 = its address; esi becomes offset^16 */
mov r8, QWORD PTR [rcx+rsi]
|
||||
mov r10, QWORD PTR [rcx+rsi+8]
|
||||
lea r9, QWORD PTR [rcx+rsi]
|
||||
xor esi, 16
|
||||
|
||||
/* save everything the register image below overwrites: rsp, rsi, rdi, rbp, r15, rdx in xmm */
/* registers, rcx and r9 in the frame; part2 restores them */
movq xmm0, rsp
|
||||
movq xmm1, rsi
|
||||
movq xmm2, rdi
|
||||
movq xmm11, rbp
|
||||
movq xmm12, r15
|
||||
movq xmm13, rdx
|
||||
mov [rsp+104], rcx
|
||||
mov [rsp+112], r9
|
||||
|
||||
/* ebx/esi/edi/ebp = the four ctx0 dwords kept at [rsp+16..28] */
mov ebx, DWORD PTR [rsp+16]
|
||||
mov esi, DWORD PTR [rsp+20]
|
||||
mov edi, DWORD PTR [rsp+24]
|
||||
mov ebp, DWORD PTR [rsp+28]
|
||||
|
||||
/* r8 ^= (ebx + esi) | ((edi + ebp) << 32), 32-bit sums */
lea eax, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r8, rax
|
||||
|
||||
/* esp / r15d = dwords 0 and 2 of xmm3, eax = dword 0 of xmm7, edx / r9d = dwords 0 and 2 of xmm9 */
movd esp, xmm3
|
||||
pextrd r15d, xmm3, 2
|
||||
movd eax, xmm7
|
||||
movd edx, xmm9
|
||||
pextrd r9d, xmm9, 2
|
||||
|
||||
/* CryptonightR_template_double_part2: second half of the iteration for ctx0, followed by the */
/* set-up of the register image for ctx1. */
/* Folds ebx/esi/edi/ebp into r14/r12, restores the stack pointer and saved registers, multiplies, */
/* shuffles the ctx0 chunks around its second offset and stores the ctx0 result line; then repeats */
/* the save / load / image sequence using the ctx1 values. */
FN_PREFIX(CryptonightR_template_double_part2):
|
||||
|
||||
/* r14 ^= edi | (ebp << 32) */
mov eax, edi
|
||||
mov edx, ebp
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r14, rax
|
||||
|
||||
/* r12 ^= ebx | (esi << 32) */
mov eax, ebx
|
||||
mov edx, esi
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r12, rax
|
||||
|
||||
/* stack pointer back from xmm0; write the four ctx0 dwords back to [rsp+16..28] */
movq rsp, xmm0
|
||||
mov DWORD PTR [rsp+16], ebx
|
||||
mov DWORD PTR [rsp+20], esi
|
||||
mov DWORD PTR [rsp+24], edi
|
||||
mov DWORD PTR [rsp+28], ebp
|
||||
|
||||
/* restore the registers saved at the end of the mainloop section */
movq rsi, xmm1
|
||||
movq rdi, xmm2
|
||||
movq rbp, xmm11
|
||||
movq r15, xmm12
|
||||
movq rdx, xmm13
|
||||
mov rcx, [rsp+104]
|
||||
mov r9, [rsp+112]
|
||||
|
||||
/* rbx keeps r8; rdx:rax = r8 * rdx (unsigned); r8 = low half; ebp = ctx1 second offset */
mov rbx, r8
|
||||
mov rax, r8
|
||||
mul rdx
|
||||
and ebp, 2097136
|
||||
mov r8, rax
|
||||
/* ctx0 chunk shuffle around the second offset (rsi enters as offset^16, rcx = ctx0 base): */
/* xmm6 absorbs all three chunks; [^32] = old[^16] + xmm7, [^48] = old[^32] + xmm3, [^16] = old[^48] + xmm9 */
movdqu xmm1, XMMWORD PTR [rcx+rsi]
|
||||
pxor xmm6, xmm1
|
||||
xor esi, 48
|
||||
paddq xmm1, xmm7
|
||||
movdqu xmm2, XMMWORD PTR [rsi+rcx]
|
||||
pxor xmm6, xmm2
|
||||
paddq xmm2, xmm3
|
||||
movdqu XMMWORD PTR [rsi+rcx], xmm1
|
||||
xor esi, 16
|
||||
mov eax, esi
|
||||
/* rsi becomes the ctx0 base again for the next pass through the mainloop */
mov rsi, rcx
|
||||
movdqu xmm0, XMMWORD PTR [rax+rcx]
|
||||
pxor xmm6, xmm0
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm2
|
||||
paddq xmm0, xmm9
|
||||
/* r12 += low half of the product, r14 += high half */
add r12, r8
|
||||
xor rax, 32
|
||||
add r14, rdx
|
||||
/* rotate the ctx0 carried vectors: xmm9 = old xmm7, xmm7 = xmm6 */
movdqa xmm9, xmm7
|
||||
movdqa xmm7, xmm6
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm0
|
||||
/* store {r14, r12} to the ctx0 second line (r9); then r12 ^= r10, r14 ^= rbx, ebx = next ctx0 offset */
mov QWORD PTR [r9+8], r12
|
||||
xor r12, r10
|
||||
mov QWORD PTR [r9], r14
|
||||
/* rcx = ctx1 base from here on */
movq rcx, xmm15
|
||||
xor r14, rbx
|
||||
mov r10d, ebp
|
||||
mov ebx, r14d
|
||||
xor ebp, 16
|
||||
and ebx, 2097136
|
||||
/* r8 / r9 = low / high qword of the ctx1 second line (offset r10) */
mov r8, QWORD PTR [r10+rcx]
|
||||
mov r9, QWORD PTR [r10+rcx+8]
|
||||
|
||||
/* save the registers the ctx1 register image overwrites; part3 restores them */
movq xmm0, rsp
|
||||
movq xmm1, rbx
|
||||
movq xmm2, rsi
|
||||
movq xmm11, rdi
|
||||
movq xmm12, rbp
|
||||
movq xmm13, r15
|
||||
mov [rsp+104], rcx
|
||||
mov [rsp+112], r9
|
||||
|
||||
/* ebx/esi/edi/ebp = the four ctx1 dwords kept at [rsp..12] */
mov ebx, DWORD PTR [rsp]
|
||||
mov esi, DWORD PTR [rsp+4]
|
||||
mov edi, DWORD PTR [rsp+8]
|
||||
mov ebp, DWORD PTR [rsp+12]
|
||||
|
||||
lea eax, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
|
||||
/* r8 ^= packed sums; a copy is kept in xmm3 for part3 */
xor r8, rax
|
||||
movq xmm3, r8
|
||||
|
||||
/* esp / r15d = dwords 0 and 2 of xmm4, eax = dword 0 of xmm8, edx / r9d = dwords 0 and 2 of xmm10 */
movd esp, xmm4
|
||||
pextrd r15d, xmm4, 2
|
||||
movd eax, xmm8
|
||||
movd edx, xmm10
|
||||
pextrd r9d, xmm10, 2
|
||||
|
||||
/* CryptonightR_template_double_part3: second half of the iteration for ctx1 and the loop branch. */
/* Folds ebx/esi/edi/ebp into r15/r13, restores the stack pointer and saved registers, multiplies, */
/* shuffles the ctx1 chunks, stores the ctx1 result line and loops while r11d is non-zero. */
FN_PREFIX(CryptonightR_template_double_part3):
|
||||
|
||||
/* r15 was overwritten by the register image; take the saved value back first */
movq r15, xmm13
|
||||
|
||||
/* r15 ^= edi | (ebp << 32) */
mov eax, edi
|
||||
mov edx, ebp
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r15, rax
|
||||
|
||||
/* r13 ^= ebx | (esi << 32) */
mov eax, ebx
|
||||
mov edx, esi
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r13, rax
|
||||
|
||||
/* stack pointer back from xmm0; write the four ctx1 dwords back to [rsp..12] */
movq rsp, xmm0
|
||||
mov DWORD PTR [rsp], ebx
|
||||
mov DWORD PTR [rsp+4], esi
|
||||
mov DWORD PTR [rsp+8], edi
|
||||
mov DWORD PTR [rsp+12], ebp
|
||||
|
||||
/* restore the registers saved at the end of part2 */
movq rbx, xmm1
|
||||
movq rsi, xmm2
|
||||
movq rdi, xmm11
|
||||
movq rbp, xmm12
|
||||
mov rcx, [rsp+104]
|
||||
mov r9, [rsp+112]
|
||||
|
||||
/* rdx:rax = r8 * rdi (unsigned); r8 = low half; rdi becomes the ctx1 base again */
mov rax, r8
|
||||
mul rdi
|
||||
mov rdi, rcx
|
||||
mov r8, rax
|
||||
/* ctx1 chunk shuffle around the second offset (rbp enters as offset^16): */
/* xmm5 absorbs all three chunks; [^32] = old[^16] + xmm8, [^48] = old[^32] + xmm4, [^16] = old[^48] + xmm10 */
/* interleaved with: r13 += low half of the product, r15 += high half */
movdqu xmm1, XMMWORD PTR [rbp+rcx]
|
||||
pxor xmm5, xmm1
|
||||
xor ebp, 48
|
||||
paddq xmm1, xmm8
|
||||
add r13, r8
|
||||
movdqu xmm2, XMMWORD PTR [rbp+rcx]
|
||||
pxor xmm5, xmm2
|
||||
add r15, rdx
|
||||
movdqu XMMWORD PTR [rbp+rcx], xmm1
|
||||
paddq xmm2, xmm4
|
||||
xor ebp, 16
|
||||
mov eax, ebp
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [rbp+rcx]
|
||||
pxor xmm5, xmm0
|
||||
movdqu XMMWORD PTR [rbp+rcx], xmm2
|
||||
paddq xmm0, xmm10
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm0
|
||||
/* rax = the multiplicand saved in xmm3 by part2; rotate carried vectors: xmm10 = old xmm8, xmm8 = xmm5 */
movq rax, xmm3
|
||||
movdqa xmm10, xmm8
|
||||
/* store {r15, r13} to the ctx1 second line; then r15 ^= rax, r13 ^= r9, r8d = next ctx1 offset */
mov QWORD PTR [r10+rcx], r15
|
||||
movdqa xmm8, xmm5
|
||||
xor r15, rax
|
||||
mov QWORD PTR [r10+rcx+8], r13
|
||||
mov r8d, r15d
|
||||
xor r13, r9
|
||||
and r8d, 2097136
|
||||
dec r11d
|
||||
jnz FN_PREFIX(CryptonightR_template_double_mainloop)
|
||||
|
||||
/* CryptonightR_template_double_part4: epilogue. Reloads rbx from the slot written by part1 */
/* (entry [rsp+24] = [rsp+400] here), restores xmm6-xmm15 from [rsp+160..319], releases the */
/* 320-byte frame, pops the registers pushed in part1 in reverse order and returns. */
/* The _end label that follows only marks the end of the template. */
FN_PREFIX(CryptonightR_template_double_part4):
|
||||
|
||||
mov rbx, QWORD PTR [rsp+400]
|
||||
movaps xmm6, XMMWORD PTR [rsp+160]
|
||||
movaps xmm7, XMMWORD PTR [rsp+176]
|
||||
movaps xmm8, XMMWORD PTR [rsp+192]
|
||||
movaps xmm9, XMMWORD PTR [rsp+208]
|
||||
movaps xmm10, XMMWORD PTR [rsp+224]
|
||||
movaps xmm11, XMMWORD PTR [rsp+240]
|
||||
movaps xmm12, XMMWORD PTR [rsp+256]
|
||||
movaps xmm13, XMMWORD PTR [rsp+272]
|
||||
movaps xmm14, XMMWORD PTR [rsp+288]
|
||||
movaps xmm15, XMMWORD PTR [rsp+304]
|
||||
add rsp, 320
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
ret 0
|
||||
FN_PREFIX(CryptonightR_template_double_end):
|
||||
536
src/crypto/cn/asm/CryptonightR_template_win.inc
Normal file
536
src/crypto/cn/asm/CryptonightR_template_win.inc
Normal file
@@ -0,0 +1,536 @@
|
||||
PUBLIC CryptonightR_template_part1
|
||||
PUBLIC CryptonightR_template_mainloop
|
||||
PUBLIC CryptonightR_template_part2
|
||||
PUBLIC CryptonightR_template_part3
|
||||
PUBLIC CryptonightR_template_end
|
||||
PUBLIC CryptonightR_template_double_part1
|
||||
PUBLIC CryptonightR_template_double_mainloop
|
||||
PUBLIC CryptonightR_template_double_part2
|
||||
PUBLIC CryptonightR_template_double_part3
|
||||
PUBLIC CryptonightR_template_double_part4
|
||||
PUBLIC CryptonightR_template_double_end
|
||||
|
||||
; Single-hash template, part 1: prologue and register setup.
; On entry rcx points at a pointer; the block that pointer refers to ("base"
; below) supplies every value loaded here. Execution falls through into
; CryptonightR_template_mainloop.
; Register state handed to the main loop:
;   rsp = [base+32]^[base]     (the real stack pointer is parked in xmm9)
;   r15 = [base+40]^[base+8]
;   xmm6 = ([base+48]^[base+16]) : ([base+56]^[base+24])   (low : high qword)
;   xmm7 = ([base+80]^[base+64]) : ([base+88]^[base+72])
;   r11 = pointer read from [base+224], base address of all loop accesses
;   r9d/r10d = low 32 bits of rsp masked with 2097136
;   ebx, esi, edi, ebp = the four dwords at [base+96..108]
;   r8d = 524288, the loop counter
ALIGN(64)
|
||||
CryptonightR_template_part1:
|
||||
mov rcx, [rcx]
|
||||
|
||||
; rbx/rbp/rsi go into the slots above the return address (caller-provided
; home space under the Microsoft x64 convention); part3 reloads them.
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rdi
|
||||
; 64 bytes of frame hold xmm6-xmm9 (stored further down).
sub rsp, 64
|
||||
mov r12, rcx
|
||||
mov r8, QWORD PTR [r12+32]
|
||||
; rdx keeps the base pointer because r12 is reused as scratch below.
mov rdx, r12
|
||||
xor r8, QWORD PTR [r12]
|
||||
mov r15, QWORD PTR [r12+40]
|
||||
mov r9, r8
|
||||
xor r15, QWORD PTR [r12+8]
|
||||
mov r11, QWORD PTR [r12+224]
|
||||
mov r12, QWORD PTR [r12+56]
|
||||
xor r12, QWORD PTR [rdx+24]
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm0, r12
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
movaps XMMWORD PTR [rsp], xmm9
|
||||
mov r12, QWORD PTR [rdx+88]
|
||||
xor r12, QWORD PTR [rdx+72]
|
||||
movq xmm6, rax
|
||||
mov rax, QWORD PTR [rdx+80]
|
||||
xor rax, QWORD PTR [rdx+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
; 2097136 = 0x1FFFF0: keeps bits 4..20, a 16-byte-aligned offset below 2 MiB.
and r9d, 2097136
|
||||
movq xmm0, r12
|
||||
movq xmm7, rax
|
||||
punpcklqdq xmm7, xmm0
|
||||
mov r10d, r9d
|
||||
; From here until part3 rsp is an ordinary data register; the real stack
; pointer lives in xmm9, so nothing below may push, pop or call.
movq xmm9, rsp
|
||||
mov rsp, r8
|
||||
; 524288 = 0x80000 loop iterations (r8d is decremented in part2).
mov r8d, 524288
|
||||
|
||||
mov ebx, [rdx+96]
|
||||
mov esi, [rdx+100]
|
||||
mov edi, [rdx+104]
|
||||
mov ebp, [rdx+108]
|
||||
|
||||
; Single-hash template, first half of one loop iteration.
; Inputs: r11 = buffer base, r9d = current 16-byte-aligned offset,
; rsp:r15 = 128-bit round key (rsp is data here, not the stack pointer),
; xmm6/xmm7 = values carried over from earlier iterations,
; ebx/esi/edi/ebp = four 32-bit working values.
; Falls through into CryptonightR_template_part2.
ALIGN(64)
|
||||
CryptonightR_template_mainloop:
|
||||
movdqa xmm5, XMMWORD PTR [r9+r11]
|
||||
movq xmm0, r15
|
||||
movq xmm4, rsp
|
||||
; xmm4 = rsp (low qword) : r15 (high qword)
punpcklqdq xmm4, xmm0
|
||||
; rdx remembers the address just read; the result is written back there below.
lea rdx, QWORD PTR [r9+r11]
|
||||
|
||||
aesenc xmm5, xmm4
|
||||
|
||||
; Offsets of the three other 16-byte chunks in the same 64-byte group:
; r13d = offset^16, eax = offset^32, r9d = offset^48.
mov r13d, r9d
|
||||
mov eax, r9d
|
||||
xor r9d, 48
|
||||
xor r13d, 16
|
||||
xor eax, 32
|
||||
movdqu xmm0, XMMWORD PTR [r9+r11]
|
||||
movaps xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [r13+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
; Fold all three neighbouring chunks into the AES result.
pxor xmm0, xmm2
|
||||
pxor xmm5, xmm1
|
||||
pxor xmm5, xmm0
|
||||
|
||||
; r12 = low qword of the result; r10d = next offset derived from its low dword.
movq r12, xmm5
|
||||
movd r10d, xmm5
|
||||
and r10d, 2097136
|
||||
|
||||
; Write the chunks back rotated: [^16] = chunk48 + xmm7,
; [^32] = chunk16 + xmm6, [^48] = chunk32 + xmm4.
paddq xmm3, xmm7
|
||||
paddq xmm2, xmm6
|
||||
paddq xmm1, xmm4
|
||||
movdqu XMMWORD PTR [r13+r11], xmm3
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movdqu XMMWORD PTR [r9+r11], xmm1
|
||||
|
||||
; Store result ^ xmm6 at the address the iteration started from.
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
|
||||
; r13 = (ebx+esi) | ((edi+ebp) << 32); both sums wrap at 32 bits.
lea r13d, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or r13, rdx
|
||||
|
||||
; NOTE(review): eax, edx and r9d loaded here are overwritten in part2 before
; being read. Presumably code spliced in at the part2 label consumes them -
; confirm with the code that assembles these template parts.
movd eax, xmm6
|
||||
movd edx, xmm7
|
||||
pextrd r9d, xmm7, 2
|
||||
|
||||
; Combine with the 16 bytes at the next offset: r13 ^= low qword, r14 = high qword.
xor r13, QWORD PTR [r10+r11]
|
||||
mov r14, QWORD PTR [r10+r11+8]
|
||||
|
||||
; Single-hash template, second half of one loop iteration.
; Inputs from the main loop: r10d = next offset, r11 = buffer base,
; r12/r13 = multiplicands, r14 = qword read at [next+8], xmm4 = round key,
; xmm5 = AES result, ebx/esi/edi/ebp = four 32-bit working values.
; rsp is still a data register here. Loops back to the main loop while
; r8d is non-zero, otherwise falls through into part3.
CryptonightR_template_part2:
|
||||
lea rcx, [r10+r11]
|
||||
|
||||
; rsp ^= edi | (ebp << 32)
mov eax, edi
|
||||
mov edx, ebp
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor rsp, rax
|
||||
|
||||
; r15 ^= ebx | (esi << 32)
mov eax, ebx
|
||||
mov edx, esi
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r15, rax
|
||||
|
||||
; 64x64 -> 128-bit multiply: the low half is added to r15, the high half to rsp.
mov rax, r13
|
||||
mul r12
|
||||
add r15, rax
|
||||
add rsp, rdx
|
||||
|
||||
; Same three-chunk rotation as in the main loop, at the new offset:
; r9d = offset^16, r12d = offset^32, r10d = offset^48.
mov r9d, r10d
|
||||
mov r12d, r10d
|
||||
xor r9d, 16
|
||||
xor r12d, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [r12+r11]
|
||||
movaps xmm3, xmm1
|
||||
movdqa xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqa xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm5, xmm0
|
||||
pxor xmm5, xmm1
|
||||
paddq xmm3, xmm4
|
||||
paddq xmm2, xmm6
|
||||
paddq xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
movdqu XMMWORD PTR [r12+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm3
|
||||
|
||||
; Age the carried values (xmm7 <- xmm6 <- xmm5), write rsp:r15 to the buffer,
; then derive the next key and offset from them.
movdqa xmm7, xmm6
|
||||
mov QWORD PTR [rcx], rsp
|
||||
xor rsp, r13
|
||||
mov r9d, esp
|
||||
mov QWORD PTR [rcx+8], r15
|
||||
and r9d, 2097136
|
||||
xor r15, r14
|
||||
movdqa xmm6, xmm5
|
||||
dec r8d
|
||||
jnz CryptonightR_template_mainloop
|
||||
|
||||
; Single-hash template, epilogue: mirrors part1.
CryptonightR_template_part3:
|
||||
; Bring back the real stack pointer that part1 parked in xmm9.
movq rsp, xmm9
|
||||
|
||||
; 136 = 64 (frame) + 7*8 (pushed registers) + 16: the slots above the
; return address where part1 stored rbx, rbp and rsi.
mov rbx, QWORD PTR [rsp+136]
|
||||
mov rbp, QWORD PTR [rsp+144]
|
||||
mov rsi, QWORD PTR [rsp+152]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+16]
|
||||
movaps xmm9, XMMWORD PTR [rsp]
|
||||
add rsp, 64
|
||||
; Pops are the exact reverse of the pushes in part1.
pop rdi
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
ret 0
|
||||
; End marker: labels the first byte after the single-hash template.
CryptonightR_template_end:
|
||||
|
||||
; Double-hash template, part 1: prologue and register setup for two blocks
; processed in an interleaved fashion.
; On entry rcx points at two consecutive pointers: block A = [rcx] and
; block B = [rcx+8]. Falls through into the double main loop.
; State handed to the main loop:
;   A: r14 = [A+32]^[A], r12 = [A+40]^[A+8], ebx = masked offset,
;      rsi and xmm14 = buffer pointer from [A+224],
;      xmm7 = ([A+48]^[A+16]):([A+56]^[A+24]), xmm9 = ([A+80]^[A+64]):([A+88]^[A+72])
;   B: r15 = [B+32]^[B], r13 = [B+40]^[B+8], r8d = masked offset,
;      rdi and xmm15 = buffer pointer from [B+224],
;      xmm8 = ([B+48]^[B+16]):([B+56]^[B+24]), xmm10 = ([B+80]^[B+64]):([B+88]^[B+72])
;   [rsp+16..31] = 16 bytes copied from [A+96], [rsp..15] = 16 bytes from [B+96]
;   r11d = 524288, the loop counter
ALIGN(64)
|
||||
CryptonightR_template_double_part1:
|
||||
mov rdx, [rcx+8]
|
||||
mov rcx, [rcx]
|
||||
|
||||
; rbx goes into a slot above the return address; part4 reloads it from [rsp+400].
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 320
|
||||
mov r14, QWORD PTR [rcx+32]
|
||||
; r8 keeps the block A pointer because rcx is reused as scratch below.
mov r8, rcx
|
||||
xor r14, QWORD PTR [rcx]
|
||||
mov r12, QWORD PTR [rcx+40]
|
||||
mov ebx, r14d
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
; 2097136 = 0x1FFFF0: 16-byte-aligned offset below 2 MiB.
and ebx, 2097136
|
||||
xor r12, QWORD PTR [rcx+8]
|
||||
mov rcx, QWORD PTR [rcx+56]
|
||||
xor rcx, QWORD PTR [r8+24]
|
||||
mov rax, QWORD PTR [r8+48]
|
||||
xor rax, QWORD PTR [r8+16]
|
||||
mov r15, QWORD PTR [rdx+32]
|
||||
xor r15, QWORD PTR [rdx]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r8+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
mov r13, QWORD PTR [rdx+40]
|
||||
mov rdi, QWORD PTR [rdx+224]
|
||||
xor r13, QWORD PTR [rdx+8]
|
||||
; Save xmm6-xmm15 in the frame; part4 restores them from the same offsets.
movaps XMMWORD PTR [rsp+160], xmm6
|
||||
movaps XMMWORD PTR [rsp+176], xmm7
|
||||
movaps XMMWORD PTR [rsp+192], xmm8
|
||||
movaps XMMWORD PTR [rsp+208], xmm9
|
||||
movaps XMMWORD PTR [rsp+224], xmm10
|
||||
movaps XMMWORD PTR [rsp+240], xmm11
|
||||
movaps XMMWORD PTR [rsp+256], xmm12
|
||||
movaps XMMWORD PTR [rsp+272], xmm13
|
||||
movaps XMMWORD PTR [rsp+288], xmm14
|
||||
movaps XMMWORD PTR [rsp+304], xmm15
|
||||
movq xmm7, rax
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
|
||||
; Copy the four dwords at offset 96 of each block into the frame:
; block B to [rsp], block A to [rsp+16].
movaps xmm1, XMMWORD PTR [rdx+96]
|
||||
movaps xmm2, XMMWORD PTR [r8+96]
|
||||
movaps XMMWORD PTR [rsp], xmm1
|
||||
movaps XMMWORD PTR [rsp+16], xmm2
|
||||
|
||||
; r8 stops being the block A pointer here; only rdx (block B) is read below.
mov r8d, r15d
|
||||
punpcklqdq xmm7, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [rdx+56]
|
||||
xor rcx, QWORD PTR [rdx+24]
|
||||
movq xmm9, rax
|
||||
mov QWORD PTR [rsp+128], rsi
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
punpcklqdq xmm9, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [rdx+88]
|
||||
xor rcx, QWORD PTR [rdx+72]
|
||||
movq xmm8, rax
|
||||
mov QWORD PTR [rsp+136], rdi
|
||||
mov rax, QWORD PTR [rdx+80]
|
||||
xor rax, QWORD PTR [rdx+64]
|
||||
punpcklqdq xmm8, xmm0
|
||||
and r8d, 2097136
|
||||
movq xmm0, rcx
|
||||
; 524288 = 0x80000 loop iterations (r11d is decremented in part3).
mov r11d, 524288
|
||||
movq xmm10, rax
|
||||
punpcklqdq xmm10, xmm0
|
||||
|
||||
; Keep both buffer pointers in xmm14/xmm15: rsi and rdi are overwritten
; inside the loop and re-created from these copies.
movq xmm14, QWORD PTR [rsp+128]
|
||||
movq xmm15, QWORD PTR [rsp+136]
|
||||
|
||||
; Double-hash template, first stage of one loop iteration: AES step and chunk
; rotation for block A (buffer rsi, offset ebx, key r14:r12) and then for
; block B (buffer rdi, offset r8d, key r15:r13), followed by the setup for
; the block A multiply stage. Falls through into double_part2.
ALIGN(64)
|
||||
CryptonightR_template_double_mainloop:
|
||||
; ---- block A ----
movdqu xmm6, XMMWORD PTR [rbx+rsi]
|
||||
movq xmm0, r12
|
||||
; ecx remembers the starting offset for the final store of this stage.
mov ecx, ebx
|
||||
movq xmm3, r14
|
||||
; xmm3 = r14 (low qword) : r12 (high qword)
punpcklqdq xmm3, xmm0
|
||||
xor ebx, 16
|
||||
aesenc xmm6, xmm3
|
||||
movq xmm4, r15
|
||||
; ebx walks offset^16, offset^32 (via ^48), then offset^48 (via ^16);
; each chunk is folded into xmm6 and another chunk plus an addend is
; written in its place.
movdqu xmm0, XMMWORD PTR [rbx+rsi]
|
||||
pxor xmm6, xmm0
|
||||
xor ebx, 48
|
||||
paddq xmm0, xmm7
|
||||
movdqu xmm1, XMMWORD PTR [rbx+rsi]
|
||||
pxor xmm6, xmm1
|
||||
movdqu XMMWORD PTR [rbx+rsi], xmm0
|
||||
paddq xmm1, xmm3
|
||||
xor ebx, 16
|
||||
mov eax, ebx
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [rbx+rsi]
|
||||
pxor xmm6, xmm0
|
||||
; rdx = low qword of the block A result (multiplicand in part2).
movq rdx, xmm6
|
||||
movdqu XMMWORD PTR [rbx+rsi], xmm1
|
||||
paddq xmm0, xmm9
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm0
|
||||
; rsi stops being the block A buffer pointer: it now holds the next A offset.
mov esi, edx
|
||||
; ---- block B ----
movdqu xmm5, XMMWORD PTR [r8+rdi]
|
||||
and esi, 2097136
|
||||
mov ecx, r8d
|
||||
movq xmm0, r13
|
||||
; xmm4 = r15 (low qword) : r13 (high qword)
punpcklqdq xmm4, xmm0
|
||||
xor r8d, 16
|
||||
aesenc xmm5, xmm4
|
||||
movdqu xmm0, XMMWORD PTR [r8+rdi]
|
||||
pxor xmm5, xmm0
|
||||
xor r8d, 48
|
||||
paddq xmm0, xmm8
|
||||
movdqu xmm1, XMMWORD PTR [r8+rdi]
|
||||
pxor xmm5, xmm1
|
||||
movdqu XMMWORD PTR [r8+rdi], xmm0
|
||||
paddq xmm1, xmm4
|
||||
xor r8d, 16
|
||||
mov eax, r8d
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [r8+rdi]
|
||||
pxor xmm5, xmm0
|
||||
movdqu XMMWORD PTR [r8+rdi], xmm1
|
||||
paddq xmm0, xmm10
|
||||
movdqu XMMWORD PTR [rax+rdi], xmm0
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rcx+rdi], xmm0
|
||||
; rdi stops being the block B buffer pointer: it now holds the low qword
; of the block B result.
movq rdi, xmm5
|
||||
; rcx = block A buffer pointer (copy kept in xmm14).
movq rcx, xmm14
|
||||
mov ebp, edi
|
||||
; Read the 16 bytes at the next block A offset: r8 = low qword, r10 = high
; qword, r9 = their address.
mov r8, QWORD PTR [rcx+rsi]
|
||||
mov r10, QWORD PTR [rcx+rsi+8]
|
||||
lea r9, QWORD PTR [rcx+rsi]
|
||||
xor esi, 16
|
||||
|
||||
; Park every register that is about to be reused (including the real rsp)
; in xmm registers / frame slots; part2 restores them.
movq xmm0, rsp
|
||||
movq xmm1, rsi
|
||||
movq xmm2, rdi
|
||||
movq xmm11, rbp
|
||||
movq xmm12, r15
|
||||
movq xmm13, rdx
|
||||
mov [rsp+104], rcx
|
||||
mov [rsp+112], r9
|
||||
|
||||
; Load the four 32-bit working values of block A from the frame.
mov ebx, DWORD PTR [rsp+16]
|
||||
mov esi, DWORD PTR [rsp+20]
|
||||
mov edi, DWORD PTR [rsp+24]
|
||||
mov ebp, DWORD PTR [rsp+28]
|
||||
|
||||
; r8 ^= (ebx+esi) | ((edi+ebp) << 32); both sums wrap at 32 bits.
lea eax, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r8, rax
|
||||
|
||||
; rsp is a data register from here until part2 restores it from xmm0.
; NOTE(review): esp, r15d, eax, edx and r9d loaded here are overwritten in
; part2 before being read. Presumably code spliced in at the part2 label
; consumes them - confirm with the code that assembles these template parts.
movd esp, xmm3
|
||||
pextrd r15d, xmm3, 2
|
||||
movd eax, xmm7
|
||||
movd edx, xmm9
|
||||
pextrd r9d, xmm9, 2
|
||||
|
||||
; Double-hash template, second stage: finishes block A for this iteration
; (key update, 64x64 multiply, chunk rotation, write-back) and then sets up
; the block B multiply stage. On entry ebx/esi/edi/ebp hold the four 32-bit
; working values of block A and rsp is not the stack pointer (the real one
; is in xmm0). Falls through into double_part3.
CryptonightR_template_double_part2:
|
||||
|
||||
; r14 ^= edi | (ebp << 32)
mov eax, edi
|
||||
mov edx, ebp
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r14, rax
|
||||
|
||||
; r12 ^= ebx | (esi << 32)
mov eax, ebx
|
||||
mov edx, esi
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r12, rax
|
||||
|
||||
; Restore the real stack pointer and store the block A working values back.
movq rsp, xmm0
|
||||
mov DWORD PTR [rsp+16], ebx
|
||||
mov DWORD PTR [rsp+20], esi
|
||||
mov DWORD PTR [rsp+24], edi
|
||||
mov DWORD PTR [rsp+28], ebp
|
||||
|
||||
; Restore the registers parked at the end of the main loop.
movq rsi, xmm1
|
||||
movq rdi, xmm2
|
||||
movq rbp, xmm11
|
||||
movq r15, xmm12
|
||||
movq rdx, xmm13
|
||||
mov rcx, [rsp+104]
|
||||
mov r9, [rsp+112]
|
||||
|
||||
; rbx keeps the pre-multiply value of r8 for the r14 update below.
mov rbx, r8
|
||||
mov rax, r8
|
||||
; rdx:rax = r8 * rdx (rdx = low qword of the block A AES result).
mul rdx
|
||||
and ebp, 2097136
|
||||
mov r8, rax
|
||||
; Chunk rotation for block A; rsi enters as offset^16 and rcx is the
; block A buffer pointer.
movdqu xmm1, XMMWORD PTR [rcx+rsi]
|
||||
pxor xmm6, xmm1
|
||||
xor esi, 48
|
||||
paddq xmm1, xmm7
|
||||
movdqu xmm2, XMMWORD PTR [rsi+rcx]
|
||||
pxor xmm6, xmm2
|
||||
paddq xmm2, xmm3
|
||||
movdqu XMMWORD PTR [rsi+rcx], xmm1
|
||||
xor esi, 16
|
||||
mov eax, esi
|
||||
; rsi becomes the block A buffer pointer again for the next iteration.
mov rsi, rcx
|
||||
movdqu xmm0, XMMWORD PTR [rax+rcx]
|
||||
pxor xmm6, xmm0
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm2
|
||||
paddq xmm0, xmm9
|
||||
; Low product half goes to r12, high half to r14.
add r12, r8
|
||||
xor rax, 32
|
||||
add r14, rdx
|
||||
; Age the carried block A values: xmm9 <- xmm7 <- xmm6.
movdqa xmm9, xmm7
|
||||
movdqa xmm7, xmm6
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm0
|
||||
; Write r14:r12 to the address saved in r9, then derive the next block A
; key (r14, r12) and offset (ebx).
mov QWORD PTR [r9+8], r12
|
||||
xor r12, r10
|
||||
mov QWORD PTR [r9], r14
|
||||
; rcx = block B buffer pointer (copy kept in xmm15).
movq rcx, xmm15
|
||||
xor r14, rbx
|
||||
mov r10d, ebp
|
||||
mov ebx, r14d
|
||||
xor ebp, 16
|
||||
and ebx, 2097136
|
||||
; Read the 16 bytes at the next block B offset (r10): r8 = low, r9 = high.
mov r8, QWORD PTR [r10+rcx]
|
||||
mov r9, QWORD PTR [r10+rcx+8]
|
||||
|
||||
; Park registers again for the block B stage; part3 restores them.
movq xmm0, rsp
|
||||
movq xmm1, rbx
|
||||
movq xmm2, rsi
|
||||
movq xmm11, rdi
|
||||
movq xmm12, rbp
|
||||
movq xmm13, r15
|
||||
mov [rsp+104], rcx
|
||||
mov [rsp+112], r9
|
||||
|
||||
; Load the four 32-bit working values of block B from the frame.
mov ebx, DWORD PTR [rsp]
|
||||
mov esi, DWORD PTR [rsp+4]
|
||||
mov edi, DWORD PTR [rsp+8]
|
||||
mov ebp, DWORD PTR [rsp+12]
|
||||
|
||||
; r8 ^= (ebx+esi) | ((edi+ebp) << 32); both sums wrap at 32 bits.
lea eax, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
|
||||
xor r8, rax
|
||||
; xmm3 keeps this r8 value; part3 reads it back after the multiply.
movq xmm3, r8
|
||||
|
||||
; rsp is a data register again until part3 restores it from xmm0.
; NOTE(review): esp, r15d, eax, edx and r9d loaded here are overwritten in
; part3 before being read. Presumably code spliced in at the part3 label
; consumes them - confirm with the code that assembles these template parts.
movd esp, xmm4
|
||||
pextrd r15d, xmm4, 2
|
||||
movd eax, xmm8
|
||||
movd edx, xmm10
|
||||
pextrd r9d, xmm10, 2
|
||||
|
||||
; Double-hash template, third stage: finishes block B for this iteration
; (key update, 64x64 multiply, chunk rotation, write-back) and closes the
; loop. On entry ebx/esi/edi/ebp hold the four 32-bit working values of
; block B and rsp is not the stack pointer (the real one is in xmm0).
; Jumps back to the double main loop while r11d is non-zero, otherwise falls
; through into double_part4.
CryptonightR_template_double_part3:
|
||||
|
||||
; r15 was used as scratch at the end of part2; xmm13 holds its real value.
movq r15, xmm13
|
||||
|
||||
; r15 ^= edi | (ebp << 32)
mov eax, edi
|
||||
mov edx, ebp
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r15, rax
|
||||
|
||||
; r13 ^= ebx | (esi << 32)
mov eax, ebx
|
||||
mov edx, esi
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r13, rax
|
||||
|
||||
; Restore the real stack pointer and store the block B working values back.
movq rsp, xmm0
|
||||
mov DWORD PTR [rsp], ebx
|
||||
mov DWORD PTR [rsp+4], esi
|
||||
mov DWORD PTR [rsp+8], edi
|
||||
mov DWORD PTR [rsp+12], ebp
|
||||
|
||||
; Restore the registers parked at the end of part2.
movq rbx, xmm1
|
||||
movq rsi, xmm2
|
||||
movq rdi, xmm11
|
||||
movq rbp, xmm12
|
||||
mov rcx, [rsp+104]
|
||||
mov r9, [rsp+112]
|
||||
|
||||
mov rax, r8
|
||||
; rdx:rax = r8 * rdi (rdi = low qword of the block B AES result).
mul rdi
|
||||
; rdi becomes the block B buffer pointer again for the next iteration.
mov rdi, rcx
|
||||
mov r8, rax
|
||||
; Chunk rotation for block B; rbp enters as offset^16.
movdqu xmm1, XMMWORD PTR [rbp+rcx]
|
||||
pxor xmm5, xmm1
|
||||
xor ebp, 48
|
||||
paddq xmm1, xmm8
|
||||
; Low product half goes to r13, high half (below) to r15.
add r13, r8
|
||||
movdqu xmm2, XMMWORD PTR [rbp+rcx]
|
||||
pxor xmm5, xmm2
|
||||
add r15, rdx
|
||||
movdqu XMMWORD PTR [rbp+rcx], xmm1
|
||||
paddq xmm2, xmm4
|
||||
xor ebp, 16
|
||||
mov eax, ebp
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [rbp+rcx]
|
||||
pxor xmm5, xmm0
|
||||
movdqu XMMWORD PTR [rbp+rcx], xmm2
|
||||
paddq xmm0, xmm10
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm0
|
||||
; rax = the pre-multiply r8 value that part2 saved in xmm3.
movq rax, xmm3
|
||||
; Age the carried block B values: xmm10 <- xmm8 <- xmm5.
movdqa xmm10, xmm8
|
||||
; Write r15:r13 at the block B offset held in r10, then derive the next
; block B key (r15, r13) and offset (r8d).
mov QWORD PTR [r10+rcx], r15
|
||||
movdqa xmm8, xmm5
|
||||
xor r15, rax
|
||||
mov QWORD PTR [r10+rcx+8], r13
|
||||
mov r8d, r15d
|
||||
xor r13, r9
|
||||
and r8d, 2097136
|
||||
dec r11d
|
||||
jnz CryptonightR_template_double_mainloop
|
||||
|
||||
; Double-hash template, epilogue: mirrors double_part1.
CryptonightR_template_double_part4:
|
||||
|
||||
; 400 = 320 (frame) + 7*8 (pushed registers) + 24: the slot above the
; return address where part1 stored rbx.
mov rbx, QWORD PTR [rsp+400]
|
||||
movaps xmm6, XMMWORD PTR [rsp+160]
|
||||
movaps xmm7, XMMWORD PTR [rsp+176]
|
||||
movaps xmm8, XMMWORD PTR [rsp+192]
|
||||
movaps xmm9, XMMWORD PTR [rsp+208]
|
||||
movaps xmm10, XMMWORD PTR [rsp+224]
|
||||
movaps xmm11, XMMWORD PTR [rsp+240]
|
||||
movaps xmm12, XMMWORD PTR [rsp+256]
|
||||
movaps xmm13, XMMWORD PTR [rsp+272]
|
||||
movaps xmm14, XMMWORD PTR [rsp+288]
|
||||
movaps xmm15, XMMWORD PTR [rsp+304]
|
||||
add rsp, 320
|
||||
; Pops are the exact reverse of the pushes in part1.
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
ret 0
|
||||
; End marker: labels the first byte after the double-hash template.
CryptonightR_template_double_end:
|
||||
268
src/crypto/cn/asm/CryptonightWOW_soft_aes_template.inc
Normal file
268
src/crypto/cn/asm/CryptonightWOW_soft_aes_template.inc
Normal file
@@ -0,0 +1,268 @@
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part1)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part2)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part3)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_end)
|
||||
|
||||
/* Soft-AES template, part 1: prologue and register setup.
   On entry rcx points at a pointer; the block that pointer refers to
   ("base" below) supplies every value loaded here. Falls through into the
   main loop.
   State handed to the main loop:
     r8 = [base+32]^[base], r9 = [base+40]^[base+8] (copy in [rsp+320])
     xmm4 = ([base+48]^[base+16]) : ([base+56]^[base+24])  (low : high qword)
     xmm5 = ([base+80]^[base+64]) : ([base+88]^[base+72])
     r11 and xmm12 = buffer pointer from [base+224]
     rax and [rsp+328] = r8 masked with 2097136
     [rsp+144..159] = the four dwords from [base+96..108]
     r10 = base, r12d = 524288 (loop counter) */
ALIGN(64)
|
||||
FN_PREFIX(CryptonightWOW_soft_aes_template_part1):
|
||||
mov rcx, [rcx]
|
||||
|
||||
/* Keep the base pointer in the slot above the return address; part2
   reloads it as [rsp+304] (304 = 232 + 8*8 + 8). */
mov QWORD PTR [rsp+8], rcx
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 232
|
||||
|
||||
/* Copy the four 32-bit working values into the frame. */
mov eax, [rcx+96]
|
||||
mov ebx, [rcx+100]
|
||||
mov esi, [rcx+104]
|
||||
mov edx, [rcx+108]
|
||||
mov [rsp+144], eax
|
||||
mov [rsp+148], ebx
|
||||
mov [rsp+152], esi
|
||||
mov [rsp+156], edx
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
/* r10 keeps the base pointer because rcx is reused as scratch below. */
mov r10, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+40]
|
||||
xor r9, QWORD PTR [rcx+8]
|
||||
movq xmm4, rax
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r11, QWORD PTR [rcx+224]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r10+72]
|
||||
mov rax, QWORD PTR [r10+80]
|
||||
movq xmm0, rdx
|
||||
xor rax, QWORD PTR [r10+64]
|
||||
|
||||
/* Save xmm6-xmm13 in the frame; part3 restores them from the same offsets. */
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+48], xmm8
|
||||
movaps XMMWORD PTR [rsp+64], xmm9
|
||||
movaps XMMWORD PTR [rsp+80], xmm10
|
||||
movaps XMMWORD PTR [rsp+96], xmm11
|
||||
movaps XMMWORD PTR [rsp+112], xmm12
|
||||
movaps XMMWORD PTR [rsp+128], xmm13
|
||||
|
||||
movq xmm5, rax
|
||||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
/* 2097136 = 0x1FFFF0: 16-byte-aligned offset below 2 MiB. */
and eax, 2097136
|
||||
movq xmm10, QWORD PTR [r10+96]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
xorps xmm9, xmm9
|
||||
/* [rsp+320] and [rsp+328] lie above the return address (232 + 8*8 = 296
   bytes below them); the loop uses them to carry r9 and the offset. */
mov QWORD PTR [rsp+328], rax
|
||||
movq xmm12, r11
|
||||
mov QWORD PTR [rsp+320], r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm13, rcx
|
||||
/* 524288 = 0x80000 loop iterations (decremented in part2). */
mov r12d, 524288
|
||||
|
||||
/* Soft-AES template, first half of one loop iteration.
   Inputs: rax = current offset, r11 = buffer base, r8:r9 = 128-bit key,
   r10 = base pointer (a table pointer is read from [r10+272]),
   r12d = loop counter, xmm4/xmm5 = values carried over from earlier
   iterations. The 16 bytes at the current offset are transformed with
   byte-indexed lookups into four consecutive 1024-byte tables instead of
   an AES instruction. NOTE(review): presumably this is a table-driven AES
   round, as the label name suggests - confirm against the table builder.
   Falls through into part2. */
ALIGN(64)
|
||||
FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop):
|
||||
/* r12 is needed as the table pointer, so the counter waits in xmm11. */
movd xmm11, r12d
|
||||
mov r12, QWORD PTR [r10+272]
|
||||
lea r13, QWORD PTR [rax+r11]
|
||||
/* Load the four input dwords: esi, r10d, r14d, ebp = dwords 0, 1, 2, 3. */
mov esi, DWORD PTR [r13]
|
||||
movq xmm0, r9
|
||||
mov r10d, DWORD PTR [r13+4]
|
||||
movq xmm7, r8
|
||||
mov ebp, DWORD PTR [r13+12]
|
||||
mov r14d, DWORD PTR [r13+8]
|
||||
mov rdx, QWORD PTR [rsp+328]
|
||||
/* First table (offset 0): indexed by the lowest byte of each dword. */
movzx ecx, sil
|
||||
shr esi, 8
|
||||
/* xmm7 = r8 (low qword) : r9 (high qword) */
punpcklqdq xmm7, xmm0
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
/* Second table (offset 1024): indexed by the second byte of the next dword. */
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
/* Advance to the third table; adding 256 to an index selects the fourth. */
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
movd xmm1, r11d
|
||||
add ebp, 256
|
||||
/* r11 was used as scratch above; xmm12 holds the buffer base. */
movq r11, xmm12
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
mov rcx, rdx
|
||||
xor eax, r15d
|
||||
/* Assemble xmm6 = { eax, r9d, r10d, r11d } (dword 0..3), then xor with the key. */
punpckldq xmm2, xmm1
|
||||
xor rcx, 16
|
||||
movd xmm6, eax
|
||||
mov rax, rdx
|
||||
punpckldq xmm6, xmm0
|
||||
xor rax, 32
|
||||
punpckldq xmm6, xmm2
|
||||
xor rdx, 48
|
||||
/* Rotate the three other 16-byte chunks of the 64-byte group
   (rcx = offset^16, rax = offset^32, rdx = offset^48):
   [^16] = chunk48 + xmm5, [^32] = chunk16 + xmm4, [^48] = chunk32 + xmm7. */
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm7
|
||||
paddq xmm2, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||
paddq xmm0, xmm5
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm0
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movq rcx, xmm13
|
||||
paddq xmm1, xmm7
|
||||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
/* rdi = low qword of the result; r10 = next offset derived from it. */
movq rdi, xmm6
|
||||
mov r10, rdi
|
||||
and r10d, 2097136
|
||||
/* Store result ^ xmm4 at the address the iteration started from. */
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [r13], xmm0
|
||||
|
||||
/* rbx = (d0+d1) | ((d2+d3) << 32) from the four working dwords in the
   frame; both sums wrap at 32 bits. */
mov ebx, [rsp+144]
|
||||
mov ebp, [rsp+152]
|
||||
add ebx, [rsp+148]
|
||||
add ebp, [rsp+156]
|
||||
shl rbp, 32
|
||||
or rbx, rbp
|
||||
|
||||
/* Combine with the 16 bytes at the next offset: rbx ^= low qword,
   rbp = high qword, r14 = their address. */
xor rbx, QWORD PTR [r10+r11]
|
||||
lea r14, QWORD PTR [r10+r11]
|
||||
mov rbp, QWORD PTR [r14+8]
|
||||
|
||||
/* Park rbx, rdi, rbp, r10 in the frame and the real stack pointer in r10;
   part2 restores all of them. */
mov [rsp+160], rbx
|
||||
mov [rsp+168], rdi
|
||||
mov [rsp+176], rbp
|
||||
mov [rsp+184], r10
|
||||
mov r10, rsp
|
||||
|
||||
/* Load the four 32-bit working values into registers. */
mov ebx, [rsp+144]
|
||||
mov esi, [rsp+148]
|
||||
mov edi, [rsp+152]
|
||||
mov ebp, [rsp+156]
|
||||
|
||||
/* rsp is a data register from here until part2 restores it from r10.
   NOTE(review): esp, r15d, eax and edx loaded here are overwritten in
   part2 before being read. Presumably code spliced in at the part2 label
   consumes them - confirm with the code that assembles these template
   parts. */
movd esp, xmm7
|
||||
movaps xmm0, xmm7
|
||||
psrldq xmm0, 8
|
||||
movd r15d, xmm0
|
||||
movd eax, xmm4
|
||||
movd edx, xmm5
|
||||
|
||||
/* Soft-AES template, second half of one loop iteration.
   On entry r10 holds the real stack pointer, ebx/esi/edi/ebp hold the four
   32-bit working values, r11 = buffer base, r14 = address of the 16 bytes
   at the next offset, xmm4/xmm5/xmm6/xmm7 carry values from the first half.
   Jumps back to the main loop while the counter is non-zero, otherwise
   falls through into part3. */
FN_PREFIX(CryptonightWOW_soft_aes_template_part2):
|
||||
/* Restore the real stack pointer and store the working values back. */
mov rsp, r10
|
||||
mov [rsp+144], ebx
|
||||
mov [rsp+148], esi
|
||||
mov [rsp+152], edi
|
||||
mov [rsp+156], ebp
|
||||
|
||||
/* Restore the registers parked at the end of the main loop. */
mov rbx, [rsp+160]
|
||||
mov rdi, [rsp+168]
|
||||
mov rbp, [rsp+176]
|
||||
mov r10, [rsp+184]
|
||||
|
||||
/* r9 = offset^16, rcx = offset^32, r10 = offset^48. */
mov r9, r10
|
||||
xor r9, 16
|
||||
mov rcx, r10
|
||||
xor rcx, 32
|
||||
xor r10, 48
|
||||
mov rax, rbx
|
||||
/* rdx:rax = rbx * rdi */
mul rdi
|
||||
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||
paddq xmm1, xmm7
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
/* Both product halves are xored with the chunk at offset^32 before use. */
xor rax, QWORD PTR [r11+rcx+8]
|
||||
xor rdx, QWORD PTR [rcx+r11]
|
||||
/* xmm3 = high product half (low qword) : low product half (high qword),
   taken before the two xors above. */
punpcklqdq xmm3, xmm0
|
||||
add r8, rdx
|
||||
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm2, xmm3
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
/* Age the carried values: xmm5 <- xmm4 <- xmm6. */
movdqa xmm5, xmm4
|
||||
mov r9, QWORD PTR [rsp+320]
|
||||
movdqa xmm4, xmm6
|
||||
add r9, rax
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
/* 304 = 232 (frame) + 8*8 (pushed registers) + 8: the base pointer stored
   by part1 above the return address. */
mov r10, QWORD PTR [rsp+304]
|
||||
/* Bring the loop counter back from xmm11. */
movd r12d, xmm11
|
||||
/* Write r8:r9 to the buffer, then derive the next key and offset. */
mov QWORD PTR [r14], r8
|
||||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
and eax, 2097136
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+320], r9
|
||||
mov QWORD PTR [rsp+328], rax
|
||||
sub r12d, 1
|
||||
jne FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop)
|
||||
|
||||
/* Soft-AES template, epilogue: mirrors part1 (restores xmm6-xmm13, releases
   the 232-byte frame, pops the eight pushed registers in reverse order). */
FN_PREFIX(CryptonightWOW_soft_aes_template_part3):
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||
|
||||
add rsp, 232
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
ret
|
||||
/* End marker: labels the first byte after the template. */
FN_PREFIX(CryptonightWOW_soft_aes_template_end):
|
||||
268
src/crypto/cn/asm/CryptonightWOW_soft_aes_template_win.inc
Normal file
268
src/crypto/cn/asm/CryptonightWOW_soft_aes_template_win.inc
Normal file
@@ -0,0 +1,268 @@
|
||||
PUBLIC CryptonightWOW_soft_aes_template_part1
|
||||
PUBLIC CryptonightWOW_soft_aes_template_mainloop
|
||||
PUBLIC CryptonightWOW_soft_aes_template_part2
|
||||
PUBLIC CryptonightWOW_soft_aes_template_part3
|
||||
PUBLIC CryptonightWOW_soft_aes_template_end
|
||||
|
||||
; Soft-AES template, part 1: prologue and register setup.
; On entry rcx points at a pointer; the block that pointer refers to
; ("base" below) supplies every value loaded here. Falls through into the
; main loop.
; State handed to the main loop:
;   r8 = [base+32]^[base], r9 = [base+40]^[base+8] (copy in [rsp+320])
;   xmm4 = ([base+48]^[base+16]) : ([base+56]^[base+24])  (low : high qword)
;   xmm5 = ([base+80]^[base+64]) : ([base+88]^[base+72])
;   r11 and xmm12 = buffer pointer from [base+224]
;   rax and [rsp+328] = r8 masked with 2097136
;   [rsp+144..159] = the four dwords from [base+96..108]
;   r10 = base, r12d = 524288 (loop counter)
ALIGN(64)
|
||||
CryptonightWOW_soft_aes_template_part1:
|
||||
mov rcx, [rcx]
|
||||
|
||||
; Keep the base pointer in the slot above the return address; part2
; reloads it as [rsp+304] (304 = 232 + 8*8 + 8).
mov QWORD PTR [rsp+8], rcx
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 232
|
||||
|
||||
; Copy the four 32-bit working values into the frame.
mov eax, [rcx+96]
|
||||
mov ebx, [rcx+100]
|
||||
mov esi, [rcx+104]
|
||||
mov edx, [rcx+108]
|
||||
mov [rsp+144], eax
|
||||
mov [rsp+148], ebx
|
||||
mov [rsp+152], esi
|
||||
mov [rsp+156], edx
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
; r10 keeps the base pointer because rcx is reused as scratch below.
mov r10, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+40]
|
||||
xor r9, QWORD PTR [rcx+8]
|
||||
movq xmm4, rax
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r11, QWORD PTR [rcx+224]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r10+72]
|
||||
mov rax, QWORD PTR [r10+80]
|
||||
movq xmm0, rdx
|
||||
xor rax, QWORD PTR [r10+64]
|
||||
|
||||
; Save xmm6-xmm13 in the frame; part3 restores them from the same offsets.
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+48], xmm8
|
||||
movaps XMMWORD PTR [rsp+64], xmm9
|
||||
movaps XMMWORD PTR [rsp+80], xmm10
|
||||
movaps XMMWORD PTR [rsp+96], xmm11
|
||||
movaps XMMWORD PTR [rsp+112], xmm12
|
||||
movaps XMMWORD PTR [rsp+128], xmm13
|
||||
|
||||
movq xmm5, rax
|
||||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
; 2097136 = 0x1FFFF0: 16-byte-aligned offset below 2 MiB.
and eax, 2097136
|
||||
movq xmm10, QWORD PTR [r10+96]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
xorps xmm9, xmm9
|
||||
; [rsp+320] and [rsp+328] lie above the return address (232 + 8*8 = 296
; bytes below them); the loop uses them to carry r9 and the offset.
mov QWORD PTR [rsp+328], rax
|
||||
movq xmm12, r11
|
||||
mov QWORD PTR [rsp+320], r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm13, rcx
|
||||
; 524288 = 0x80000 loop iterations (decremented in part2).
mov r12d, 524288
|
||||
|
||||
; Soft-AES template, first half of one loop iteration.
; Inputs: rax = current offset, r11 = buffer base, r8:r9 = 128-bit key,
; r10 = base pointer (a table pointer is read from [r10+272]),
; r12d = loop counter, xmm4/xmm5 = values carried over from earlier
; iterations. The 16 bytes at the current offset are transformed with
; byte-indexed lookups into four consecutive 1024-byte tables instead of an
; AES instruction. NOTE(review): presumably this is a table-driven AES
; round, as the label name suggests - confirm against the table builder.
; Falls through into part2.
ALIGN(64)
|
||||
CryptonightWOW_soft_aes_template_mainloop:
|
||||
; r12 is needed as the table pointer, so the counter waits in xmm11.
movd xmm11, r12d
|
||||
mov r12, QWORD PTR [r10+272]
|
||||
lea r13, QWORD PTR [rax+r11]
|
||||
; Load the four input dwords: esi, r10d, r14d, ebp = dwords 0, 1, 2, 3.
mov esi, DWORD PTR [r13]
|
||||
movq xmm0, r9
|
||||
mov r10d, DWORD PTR [r13+4]
|
||||
movq xmm7, r8
|
||||
mov ebp, DWORD PTR [r13+12]
|
||||
mov r14d, DWORD PTR [r13+8]
|
||||
mov rdx, QWORD PTR [rsp+328]
|
||||
; First table (offset 0): indexed by the lowest byte of each dword.
movzx ecx, sil
|
||||
shr esi, 8
|
||||
; xmm7 = r8 (low qword) : r9 (high qword)
punpcklqdq xmm7, xmm0
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
; Second table (offset 1024): indexed by the second byte of the next dword.
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
; Advance to the third table; adding 256 to an index selects the fourth.
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
movd xmm1, r11d
|
||||
add ebp, 256
|
||||
; r11 was used as scratch above; xmm12 holds the buffer base.
movq r11, xmm12
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
mov rcx, rdx
|
||||
xor eax, r15d
|
||||
; Assemble xmm6 = { eax, r9d, r10d, r11d } (dword 0..3), then xor with the key.
punpckldq xmm2, xmm1
|
||||
xor rcx, 16
|
||||
movd xmm6, eax
|
||||
mov rax, rdx
|
||||
punpckldq xmm6, xmm0
|
||||
xor rax, 32
|
||||
punpckldq xmm6, xmm2
|
||||
xor rdx, 48
|
||||
; Rotate the three other 16-byte chunks of the 64-byte group
; (rcx = offset^16, rax = offset^32, rdx = offset^48):
; [^16] = chunk48 + xmm5, [^32] = chunk16 + xmm4, [^48] = chunk32 + xmm7.
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm7
|
||||
paddq xmm2, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||
paddq xmm0, xmm5
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm0
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movq rcx, xmm13
|
||||
paddq xmm1, xmm7
|
||||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
; rdi = low qword of the result; r10 = next offset derived from it.
movq rdi, xmm6
|
||||
mov r10, rdi
|
||||
and r10d, 2097136
|
||||
; Store result ^ xmm4 at the address the iteration started from.
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [r13], xmm0
|
||||
|
||||
; rbx = (d0+d1) | ((d2+d3) << 32) from the four working dwords in the frame;
; both sums wrap at 32 bits.
mov ebx, [rsp+144]
|
||||
mov ebp, [rsp+152]
|
||||
add ebx, [rsp+148]
|
||||
add ebp, [rsp+156]
|
||||
shl rbp, 32
|
||||
or rbx, rbp
|
||||
|
||||
; Combine with the 16 bytes at the next offset: rbx ^= low qword,
; rbp = high qword, r14 = their address.
xor rbx, QWORD PTR [r10+r11]
|
||||
lea r14, QWORD PTR [r10+r11]
|
||||
mov rbp, QWORD PTR [r14+8]
|
||||
|
||||
; Park rbx, rdi, rbp, r10 in the frame and the real stack pointer in r10;
; part2 restores all of them.
mov [rsp+160], rbx
|
||||
mov [rsp+168], rdi
|
||||
mov [rsp+176], rbp
|
||||
mov [rsp+184], r10
|
||||
mov r10, rsp
|
||||
|
||||
; Load the four 32-bit working values into registers.
mov ebx, [rsp+144]
|
||||
mov esi, [rsp+148]
|
||||
mov edi, [rsp+152]
|
||||
mov ebp, [rsp+156]
|
||||
|
||||
; rsp is a data register from here until part2 restores it from r10.
; NOTE(review): esp, r15d, eax and edx loaded here are overwritten in part2
; before being read. Presumably code spliced in at the part2 label consumes
; them - confirm with the code that assembles these template parts.
movd esp, xmm7
|
||||
movaps xmm0, xmm7
|
||||
psrldq xmm0, 8
|
||||
movd r15d, xmm0
|
||||
movd eax, xmm4
|
||||
movd edx, xmm5
|
||||
|
||||
; Soft-AES template, second half of one loop iteration.
; On entry r10 holds the real stack pointer, ebx/esi/edi/ebp hold the four
; 32-bit working values, r11 = buffer base, r14 = address of the 16 bytes at
; the next offset, xmm4/xmm5/xmm6/xmm7 carry values from the first half.
; Jumps back to the main loop while the counter is non-zero, otherwise falls
; through into part3.
CryptonightWOW_soft_aes_template_part2:
|
||||
; Restore the real stack pointer and store the working values back.
mov rsp, r10
|
||||
mov [rsp+144], ebx
|
||||
mov [rsp+148], esi
|
||||
mov [rsp+152], edi
|
||||
mov [rsp+156], ebp
|
||||
|
||||
; Restore the registers parked at the end of the main loop.
mov rbx, [rsp+160]
|
||||
mov rdi, [rsp+168]
|
||||
mov rbp, [rsp+176]
|
||||
mov r10, [rsp+184]
|
||||
|
||||
; r9 = offset^16, rcx = offset^32, r10 = offset^48.
mov r9, r10
|
||||
xor r9, 16
|
||||
mov rcx, r10
|
||||
xor rcx, 32
|
||||
xor r10, 48
|
||||
mov rax, rbx
|
||||
; rdx:rax = rbx * rdi
mul rdi
|
||||
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||
paddq xmm1, xmm7
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
; Both product halves are xored with the chunk at offset^32 before use.
xor rax, QWORD PTR [r11+rcx+8]
|
||||
xor rdx, QWORD PTR [rcx+r11]
|
||||
; xmm3 = high product half (low qword) : low product half (high qword),
; taken before the two xors above.
punpcklqdq xmm3, xmm0
|
||||
add r8, rdx
|
||||
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm2, xmm3
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
; Age the carried values: xmm5 <- xmm4 <- xmm6.
movdqa xmm5, xmm4
|
||||
mov r9, QWORD PTR [rsp+320]
|
||||
movdqa xmm4, xmm6
|
||||
add r9, rax
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
; 304 = 232 (frame) + 8*8 (pushed registers) + 8: the base pointer stored by
; part1 above the return address.
mov r10, QWORD PTR [rsp+304]
|
||||
; Bring the loop counter back from xmm11.
movd r12d, xmm11
|
||||
; Write r8:r9 to the buffer, then derive the next key and offset.
mov QWORD PTR [r14], r8
|
||||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
and eax, 2097136
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+320], r9
|
||||
mov QWORD PTR [rsp+328], rax
|
||||
sub r12d, 1
|
||||
jne CryptonightWOW_soft_aes_template_mainloop
|
||||
|
||||
; Soft-AES template, epilogue: mirrors part1 (restores xmm6-xmm13, releases
; the 232-byte frame, pops the eight pushed registers in reverse order).
CryptonightWOW_soft_aes_template_part3:
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||
|
||||
add rsp, 232
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
ret
|
||||
; End marker: labels the first byte after the template.
CryptonightWOW_soft_aes_template_end:
|
||||
491
src/crypto/cn/asm/CryptonightWOW_template.inc
Normal file
491
src/crypto/cn/asm/CryptonightWOW_template.inc
Normal file
@@ -0,0 +1,491 @@
|
||||
/* Export every label of the CryptonightWOW template code:
 * part1 .. end delimit the single-hash routine,
 * double_part1 .. double_end delimit the two-hash routine. */
PUBLIC FN_PREFIX(CryptonightWOW_template_part1)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_template_mainloop)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_template_part2)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_template_part3)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_template_end)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part1)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_template_double_mainloop)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part2)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part3)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part4)
|
||||
PUBLIC FN_PREFIX(CryptonightWOW_template_double_end)
|
||||
|
||||
/* Single-hash routine: entry and register setup.
 * In: rcx = address of a slot holding the context pointer (dereferenced first).
 * rbx/rbp/rsi are saved in stack slots above the return address (presumably
 * the caller-provided home space of the Windows x64 convention - confirm);
 * r10-r15 and rdi are pushed; xmm6-xmm9 are spilled into a 64-byte frame.
 * State handed to the main loop (ctx = context pointer):
 *   r11     = qword [ctx+224], base address of every indexed access
 *   rsp     = [ctx+32] ^ [ctx]      (the real rsp is parked in xmm9)
 *   r15     = [ctx+40] ^ [ctx+8]
 *   xmm6    = ([ctx+48] ^ [ctx+16], [ctx+56] ^ [ctx+24])
 *   xmm7    = ([ctx+80] ^ [ctx+64], [ctx+88] ^ [ctx+72])
 *   r9d     = low dword of rsp value masked with 2097136 (0x1FFFF0)
 *   r8d     = 524288, iteration counter
 *   ebx, esi, edi, ebp = dwords at [ctx+96], [ctx+100], [ctx+104], [ctx+108] */
ALIGN(64)
|
||||
FN_PREFIX(CryptonightWOW_template_part1):
|
||||
mov rcx, [rcx]
|
||||
|
||||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rdi
|
||||
sub rsp, 64
|
||||
mov r12, rcx
|
||||
mov r8, QWORD PTR [r12+32]
|
||||
mov rdx, r12
|
||||
xor r8, QWORD PTR [r12]
|
||||
mov r15, QWORD PTR [r12+40]
|
||||
mov r9, r8
|
||||
xor r15, QWORD PTR [r12+8]
|
||||
mov r11, QWORD PTR [r12+224]
|
||||
mov r12, QWORD PTR [r12+56]
|
||||
xor r12, QWORD PTR [rdx+24]
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm0, r12
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
movaps XMMWORD PTR [rsp], xmm9
|
||||
mov r12, QWORD PTR [rdx+88]
|
||||
xor r12, QWORD PTR [rdx+72]
|
||||
movq xmm6, rax
|
||||
mov rax, QWORD PTR [rdx+80]
|
||||
xor rax, QWORD PTR [rdx+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
and r9d, 2097136
|
||||
movq xmm0, r12
|
||||
movq xmm7, rax
|
||||
punpcklqdq xmm7, xmm0
|
||||
mov r10d, r9d
|
||||
/* From here until part3 rsp holds data, not a stack address. */
movq xmm9, rsp
|
||||
mov rsp, r8
|
||||
/* Iteration counter, decremented at the end of part2. */
mov r8d, 524288
|
||||
|
||||
mov ebx, [rdx+96]
|
||||
mov esi, [rdx+100]
|
||||
mov edi, [rdx+104]
|
||||
mov ebp, [rdx+108]
|
||||
|
||||
/* Single-hash routine: first half of one iteration.
 *   xmm5 = aesenc(chunk at [r11+r9], round key xmm4 = (rsp, r15))
 *   r10d = low dword of xmm5 masked with 2097136, index for the second half
 *   r12  = low qword of xmm5
 * The three neighbouring 16-byte chunks are read, summed and written back
 * to a different neighbour:
 *   [idx^16] + xmm6 -> [idx^32]
 *   [idx^32] + xmm4 -> [idx^48]
 *   [idx^48] + xmm7 -> [idx^16]
 * xmm5 ^ xmm6 is stored at the original index (address kept in rdx).
 *   r13 = ((ebx+esi) | ((edi+ebp) << 32)) ^ qword [r11+r10]
 *   r14 = qword [r11+r10+8] */
ALIGN(64)
|
||||
FN_PREFIX(CryptonightWOW_template_mainloop):
|
||||
movdqa xmm5, XMMWORD PTR [r9+r11]
|
||||
movq xmm0, r15
|
||||
movq xmm4, rsp
|
||||
punpcklqdq xmm4, xmm0
|
||||
lea rdx, QWORD PTR [r9+r11]
|
||||
|
||||
aesenc xmm5, xmm4
|
||||
movd r10d, xmm5
|
||||
and r10d, 2097136
|
||||
|
||||
mov r12d, r9d
|
||||
mov eax, r9d
|
||||
xor r9d, 48
|
||||
xor r12d, 16
|
||||
xor eax, 32
|
||||
movdqu xmm0, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm2, XMMWORD PTR [r12+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm2, xmm6
|
||||
paddq xmm1, xmm4
|
||||
movdqu XMMWORD PTR [r12+r11], xmm0
|
||||
movq r12, xmm5
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movdqu XMMWORD PTR [r9+r11], xmm1
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
|
||||
/* 32-bit lea zero-extends, so r13/rdx hold clean 32-bit sums before the merge. */
lea r13d, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or r13, rdx
|
||||
|
||||
xor r13, QWORD PTR [r10+r11]
|
||||
mov r14, QWORD PTR [r10+r11+8]
|
||||
|
||||
/* NOTE(review): eax, edx and r9d loaded below are overwritten at the top of
 * part2 before being read there; presumably code inserted between the two
 * labels at runtime consumes them - confirm against the code that uses
 * these templates. */
movd eax, xmm6
|
||||
movd edx, xmm7
|
||||
pextrd r9d, xmm7, 2
|
||||
|
||||
/* Single-hash routine: second half of one iteration.
 *   rdx:rax = r13 * r12 (unsigned 64x64 -> 128); xmm3 = (rdx, rax)
 * With idx = r10d, the neighbouring chunks are updated as:
 *   [idx^16] <- xmm7 + old [idx^48]
 *   [idx^32] <- (xmm3 ^ old [idx^16]) + xmm6
 *   [idx^48] <- old [idx^32] + xmm4
 * rdx/rax are xored with the two qwords of old [idx^32], then
 *   rsp += rdx, r15 += rax, both stored at [r11+idx], [r11+idx+8]
 *   rsp ^= r13, r15 ^= r14
 *   r9d  = esp masked with 2097136, index for the next iteration
 *   xmm7 <- xmm6 <- xmm5
 * Loops back to mainloop until r8d reaches zero. */
FN_PREFIX(CryptonightWOW_template_part2):
|
||||
mov rax, r13
|
||||
mul r12
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
punpcklqdq xmm3, xmm0
|
||||
|
||||
mov r9d, r10d
|
||||
mov r12d, r10d
|
||||
xor r9d, 16
|
||||
xor r12d, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [r12+r11]
|
||||
xor rdx, QWORD PTR [r12+r11]
|
||||
xor rax, QWORD PTR [r11+r12+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+r11]
|
||||
pxor xmm3, xmm2
|
||||
paddq xmm7, XMMWORD PTR [r10+r11]
|
||||
paddq xmm1, xmm4
|
||||
paddq xmm3, xmm6
|
||||
movdqu XMMWORD PTR [r9+r11], xmm7
|
||||
movdqu XMMWORD PTR [r12+r11], xmm3
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
|
||||
movdqa xmm7, xmm6
|
||||
add r15, rax
|
||||
add rsp, rdx
|
||||
/* Undo the earlier xor with 48 so r10 is the unmodified index again. */
xor r10, 48
|
||||
mov QWORD PTR [r10+r11], rsp
|
||||
xor rsp, r13
|
||||
mov r9d, esp
|
||||
mov QWORD PTR [r10+r11+8], r15
|
||||
and r9d, 2097136
|
||||
xor r15, r14
|
||||
movdqa xmm6, xmm5
|
||||
dec r8d
|
||||
jnz FN_PREFIX(CryptonightWOW_template_mainloop)
|
||||
|
||||
/* Single-hash routine: epilogue.
 * Recovers the real stack pointer from xmm9, reloads rbx/rbp/rsi from the
 * slots written in part1 (7 pushes + 64-byte frame = 120 bytes below the
 * entry rsp, hence offsets 136/144/152), restores xmm6-xmm9, pops the
 * pushed registers in reverse order and returns. The trailing _end label
 * marks the end of the routine. */
FN_PREFIX(CryptonightWOW_template_part3):
|
||||
movq rsp, xmm9
|
||||
|
||||
mov rbx, QWORD PTR [rsp+136]
|
||||
mov rbp, QWORD PTR [rsp+144]
|
||||
mov rsi, QWORD PTR [rsp+152]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+16]
|
||||
movaps xmm9, XMMWORD PTR [rsp]
|
||||
add rsp, 64
|
||||
pop rdi
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
ret 0
|
||||
FN_PREFIX(CryptonightWOW_template_end):
|
||||
|
||||
/* Two-hash routine: entry and register setup.
 * In: rcx = address of two consecutive context pointers
 *     ([rcx] = ctx0, [rcx+8] = ctx1).
 * rbx is saved at [rsp+24] above the return address; rbp, rsi, rdi and
 * r12-r15 are pushed; a 320-byte frame is reserved:
 *   [rsp+0..15]    copy of [ctx1+96..111]
 *   [rsp+16..31]   copy of [ctx0+96..111]
 *   [rsp+104/112]  scratch slots used inside the loop
 *   [rsp+128/136]  base0 / base1
 *   [rsp+160..319] saved xmm6-xmm15
 * State handed to the main loop:
 *   hash 0: rsi = base0 = [ctx0+224], r14 = [ctx0+32]^[ctx0],
 *           r12 = [ctx0+40]^[ctx0+8], ebx = r14d & 2097136,
 *           xmm7 = ([ctx0+48]^[ctx0+16], [ctx0+56]^[ctx0+24]),
 *           xmm9 = ([ctx0+80]^[ctx0+64], [ctx0+88]^[ctx0+72])
 *   hash 1: rdi = base1 = [ctx1+224], r15, r13, r8d, xmm8, xmm10 built the
 *           same way from ctx1
 *   xmm14 / xmm15 = base0 / base1, r11d = 524288 iteration counter */
ALIGN(64)
|
||||
FN_PREFIX(CryptonightWOW_template_double_part1):
|
||||
mov rdx, [rcx+8]
|
||||
mov rcx, [rcx]
|
||||
|
||||
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 320
|
||||
mov r14, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r14, QWORD PTR [rcx]
|
||||
mov r12, QWORD PTR [rcx+40]
|
||||
mov ebx, r14d
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
and ebx, 2097136
|
||||
xor r12, QWORD PTR [rcx+8]
|
||||
mov rcx, QWORD PTR [rcx+56]
|
||||
xor rcx, QWORD PTR [r8+24]
|
||||
mov rax, QWORD PTR [r8+48]
|
||||
xor rax, QWORD PTR [r8+16]
|
||||
mov r15, QWORD PTR [rdx+32]
|
||||
xor r15, QWORD PTR [rdx]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r8+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
mov r13, QWORD PTR [rdx+40]
|
||||
mov rdi, QWORD PTR [rdx+224]
|
||||
xor r13, QWORD PTR [rdx+8]
|
||||
movaps XMMWORD PTR [rsp+160], xmm6
|
||||
movaps XMMWORD PTR [rsp+176], xmm7
|
||||
movaps XMMWORD PTR [rsp+192], xmm8
|
||||
movaps XMMWORD PTR [rsp+208], xmm9
|
||||
movaps XMMWORD PTR [rsp+224], xmm10
|
||||
movaps XMMWORD PTR [rsp+240], xmm11
|
||||
movaps XMMWORD PTR [rsp+256], xmm12
|
||||
movaps XMMWORD PTR [rsp+272], xmm13
|
||||
movaps XMMWORD PTR [rsp+288], xmm14
|
||||
movaps XMMWORD PTR [rsp+304], xmm15
|
||||
movq xmm7, rax
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
|
||||
/* Copy the 16 bytes at offset 96 of each context into the frame. */
movaps xmm1, XMMWORD PTR [rdx+96]
|
||||
movaps xmm2, XMMWORD PTR [r8+96]
|
||||
movaps XMMWORD PTR [rsp], xmm1
|
||||
movaps XMMWORD PTR [rsp+16], xmm2
|
||||
|
||||
/* r8 stops being the ctx0 pointer here; the remaining loads use rdx (ctx1). */
mov r8d, r15d
|
||||
punpcklqdq xmm7, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [rdx+56]
|
||||
xor rcx, QWORD PTR [rdx+24]
|
||||
movq xmm9, rax
|
||||
mov QWORD PTR [rsp+128], rsi
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
punpcklqdq xmm9, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [rdx+88]
|
||||
xor rcx, QWORD PTR [rdx+72]
|
||||
movq xmm8, rax
|
||||
mov QWORD PTR [rsp+136], rdi
|
||||
mov rax, QWORD PTR [rdx+80]
|
||||
xor rax, QWORD PTR [rdx+64]
|
||||
punpcklqdq xmm8, xmm0
|
||||
and r8d, 2097136
|
||||
movq xmm0, rcx
|
||||
mov r11d, 524288
|
||||
movq xmm10, rax
|
||||
punpcklqdq xmm10, xmm0
|
||||
|
||||
movq xmm14, QWORD PTR [rsp+128]
|
||||
movq xmm15, QWORD PTR [rsp+136]
|
||||
|
||||
/* Two-hash routine: first half of one iteration for both hashes.
 * Hash 0 (base rsi, index ebx, round key xmm3 = (r14, r12)):
 *   xmm6 = aesenc([rsi+idx], xmm3); rdx = low qword of xmm6
 *   [idx^16] + xmm7 -> [idx^32], [idx^32] + xmm3 -> [idx^48],
 *   [idx^48] + xmm9 -> [idx^16], xmm6 ^ xmm7 -> [idx]
 *   esi = edx & 2097136 (rsi stops being base0; xmm14 still holds it)
 * Hash 1 (base rdi, index r8d, round key xmm4 = (r15, r13)):
 *   same steps with xmm5, xmm8, xmm4, xmm10; rdi = low qword of xmm5
 * Then, for hash 0: r8 = qword [base0+esi], r10 = qword [base0+esi+8],
 * r9 = address base0+esi, esi ^= 16.
 * rsp, rsi, rdi, rbp, r15, rdx are parked in xmm0, xmm1, xmm2, xmm11,
 * xmm12, xmm13 and rcx, r9 in [rsp+104], [rsp+112]; ebx/esi/edi/ebp are
 * loaded from [rsp+16..28] and
 *   r8 ^= (ebx+esi) | ((edi+ebp) << 32) */
ALIGN(64)
|
||||
FN_PREFIX(CryptonightWOW_template_double_mainloop):
|
||||
movdqu xmm6, XMMWORD PTR [rbx+rsi]
|
||||
movq xmm0, r12
|
||||
mov ecx, ebx
|
||||
movq xmm3, r14
|
||||
punpcklqdq xmm3, xmm0
|
||||
xor ebx, 16
|
||||
aesenc xmm6, xmm3
|
||||
movq rdx, xmm6
|
||||
movq xmm4, r15
|
||||
movdqu xmm0, XMMWORD PTR [rbx+rsi]
|
||||
xor ebx, 48
|
||||
paddq xmm0, xmm7
|
||||
movdqu xmm1, XMMWORD PTR [rbx+rsi]
|
||||
movdqu XMMWORD PTR [rbx+rsi], xmm0
|
||||
paddq xmm1, xmm3
|
||||
xor ebx, 16
|
||||
mov eax, ebx
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [rbx+rsi]
|
||||
movdqu XMMWORD PTR [rbx+rsi], xmm1
|
||||
paddq xmm0, xmm9
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm0
|
||||
mov esi, edx
|
||||
movdqu xmm5, XMMWORD PTR [r8+rdi]
|
||||
and esi, 2097136
|
||||
mov ecx, r8d
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm4, xmm0
|
||||
xor r8d, 16
|
||||
aesenc xmm5, xmm4
|
||||
movdqu xmm0, XMMWORD PTR [r8+rdi]
|
||||
xor r8d, 48
|
||||
paddq xmm0, xmm8
|
||||
movdqu xmm1, XMMWORD PTR [r8+rdi]
|
||||
movdqu XMMWORD PTR [r8+rdi], xmm0
|
||||
paddq xmm1, xmm4
|
||||
xor r8d, 16
|
||||
mov eax, r8d
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [r8+rdi]
|
||||
movdqu XMMWORD PTR [r8+rdi], xmm1
|
||||
paddq xmm0, xmm10
|
||||
movdqu XMMWORD PTR [rax+rdi], xmm0
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rcx+rdi], xmm0
|
||||
movq rdi, xmm5
|
||||
movq rcx, xmm14
|
||||
mov ebp, edi
|
||||
mov r8, QWORD PTR [rcx+rsi]
|
||||
mov r10, QWORD PTR [rcx+rsi+8]
|
||||
lea r9, QWORD PTR [rcx+rsi]
|
||||
xor esi, 16
|
||||
|
||||
movq xmm0, rsp
|
||||
movq xmm1, rsi
|
||||
movq xmm2, rdi
|
||||
movq xmm11, rbp
|
||||
movq xmm12, r15
|
||||
movq xmm13, rdx
|
||||
mov [rsp+104], rcx
|
||||
mov [rsp+112], r9
|
||||
|
||||
mov ebx, DWORD PTR [rsp+16]
|
||||
mov esi, DWORD PTR [rsp+20]
|
||||
mov edi, DWORD PTR [rsp+24]
|
||||
mov ebp, DWORD PTR [rsp+28]
|
||||
|
||||
lea eax, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r8, rax
|
||||
|
||||
/* NOTE(review): esp, r15d, eax, edx and r9d loaded below are all replaced
 * at the top of double_part2 before being read there; presumably code
 * inserted between the two labels at runtime consumes them - confirm
 * against the code that uses these templates. */
movd esp, xmm3
|
||||
pextrd r15d, xmm3, 2
|
||||
movd eax, xmm7
|
||||
movd edx, xmm9
|
||||
pextrd r9d, xmm9, 2
|
||||
|
||||
/* Two-hash routine: second half of the iteration for hash 0, then the
 * register hand-over for hash 1.
 * Restores rsp from xmm0, writes ebx/esi/edi/ebp back to [rsp+16..28] and
 * reloads rsi, rdi, rbp, r15, rdx, rcx, r9 from where the main loop parked
 * them (rcx = base0, esi = idx^16, r9 = address base0+idx).
 *   rdx:rax = r8 * rdx; xmm1 = (rdx, rax)
 *   [idx^32] <- (xmm1 ^ old [idx^16]) + xmm7
 *   [idx^48] <- old [idx^32] + xmm3
 *   [idx^16] <- old [idx^48] + xmm9
 *   r14 += rdx ^ old [idx^32] qword 0, stored at [r9]
 *   r12 += rax ^ old [idx^32] qword 1, stored at [r9+8]
 *   r14 ^= multiplier (kept in rbx), r12 ^= r10
 *   ebx = r14d & 2097136, next index for hash 0; rsi = base0 again
 *   xmm9 <- xmm7 <- xmm6
 * Hash 1 hand-over: rcx = base1 (xmm15), r10d = ebp & 2097136,
 * r8/r9 = the two qwords at [base1+r10], ebp = r10d ^ 16; registers are
 * parked again, ebx/esi/edi/ebp are loaded from [rsp+0..12] and
 *   r8 ^= (ebx+esi) | ((edi+ebp) << 32), copy kept in xmm3 */
FN_PREFIX(CryptonightWOW_template_double_part2):
|
||||
|
||||
movq rsp, xmm0
|
||||
mov DWORD PTR [rsp+16], ebx
|
||||
mov DWORD PTR [rsp+20], esi
|
||||
mov DWORD PTR [rsp+24], edi
|
||||
mov DWORD PTR [rsp+28], ebp
|
||||
|
||||
movq rsi, xmm1
|
||||
movq rdi, xmm2
|
||||
movq rbp, xmm11
|
||||
movq r15, xmm12
|
||||
movq rdx, xmm13
|
||||
mov rcx, [rsp+104]
|
||||
mov r9, [rsp+112]
|
||||
|
||||
mov rbx, r8
|
||||
mov rax, r8
|
||||
mul rdx
|
||||
and ebp, 2097136
|
||||
mov r8, rax
|
||||
movq xmm1, rdx
|
||||
movq xmm0, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
pxor xmm1, XMMWORD PTR [rcx+rsi]
|
||||
xor esi, 48
|
||||
paddq xmm1, xmm7
|
||||
movdqu xmm2, XMMWORD PTR [rsi+rcx]
|
||||
xor rdx, QWORD PTR [rsi+rcx]
|
||||
paddq xmm2, xmm3
|
||||
xor r8, QWORD PTR [rsi+rcx+8]
|
||||
movdqu XMMWORD PTR [rsi+rcx], xmm1
|
||||
xor esi, 16
|
||||
mov eax, esi
|
||||
mov rsi, rcx
|
||||
movdqu xmm0, XMMWORD PTR [rax+rcx]
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm2
|
||||
paddq xmm0, xmm9
|
||||
add r12, r8
|
||||
xor rax, 32
|
||||
add r14, rdx
|
||||
movdqa xmm9, xmm7
|
||||
movdqa xmm7, xmm6
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm0
|
||||
mov QWORD PTR [r9+8], r12
|
||||
xor r12, r10
|
||||
mov QWORD PTR [r9], r14
|
||||
movq rcx, xmm15
|
||||
xor r14, rbx
|
||||
mov r10d, ebp
|
||||
mov ebx, r14d
|
||||
xor ebp, 16
|
||||
and ebx, 2097136
|
||||
mov r8, QWORD PTR [r10+rcx]
|
||||
mov r9, QWORD PTR [r10+rcx+8]
|
||||
|
||||
movq xmm0, rsp
|
||||
movq xmm1, rbx
|
||||
movq xmm2, rsi
|
||||
movq xmm11, rdi
|
||||
movq xmm12, rbp
|
||||
movq xmm13, r15
|
||||
mov [rsp+104], rcx
|
||||
mov [rsp+112], r9
|
||||
|
||||
mov ebx, DWORD PTR [rsp]
|
||||
mov esi, DWORD PTR [rsp+4]
|
||||
mov edi, DWORD PTR [rsp+8]
|
||||
mov ebp, DWORD PTR [rsp+12]
|
||||
|
||||
lea eax, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
|
||||
xor r8, rax
|
||||
movq xmm3, r8
|
||||
|
||||
/* NOTE(review): esp, r15d, eax, edx and r9d loaded below are all replaced
 * at the top of double_part3 before being read there; presumably inputs
 * for code inserted at runtime - confirm. */
movd esp, xmm4
|
||||
pextrd r15d, xmm4, 2
|
||||
movd eax, xmm8
|
||||
movd edx, xmm10
|
||||
pextrd r9d, xmm10, 2
|
||||
|
||||
/* Two-hash routine: second half of the iteration for hash 1 and loop end.
 * Restores rsp from xmm0, writes ebx/esi/edi/ebp back to [rsp+0..12] and
 * reloads rbx, rsi, rdi, rbp, r15, rcx, r9 (rcx = base1, ebp = idx^16,
 * r10 = idx, rdi = low qword of the hash 1 AES result).
 *   rdx:rax = r8 * rdi; xmm1 = (rdx, rax); rdi = base1 again
 *   [idx^32] <- (xmm1 ^ old [idx^16]) + xmm8
 *   [idx^48] <- old [idx^32] + xmm4
 *   [idx^16] <- old [idx^48] + xmm10
 *   r15 += rdx ^ old [idx^32] qword 0, stored at [base1+idx]
 *   r13 += rax ^ old [idx^32] qword 1, stored at [base1+idx+8]
 *   r15 ^= multiplier (kept in xmm3), r13 ^= r9
 *   r8d = r15d & 2097136, next index for hash 1
 *   xmm10 <- xmm8 <- xmm5
 * Loops back to double_mainloop until r11d reaches zero. */
FN_PREFIX(CryptonightWOW_template_double_part3):
|
||||
|
||||
movq rsp, xmm0
|
||||
mov DWORD PTR [rsp], ebx
|
||||
mov DWORD PTR [rsp+4], esi
|
||||
mov DWORD PTR [rsp+8], edi
|
||||
mov DWORD PTR [rsp+12], ebp
|
||||
|
||||
movq rbx, xmm1
|
||||
movq rsi, xmm2
|
||||
movq rdi, xmm11
|
||||
movq rbp, xmm12
|
||||
movq r15, xmm13
|
||||
mov rcx, [rsp+104]
|
||||
mov r9, [rsp+112]
|
||||
|
||||
mov rax, r8
|
||||
mul rdi
|
||||
movq xmm1, rdx
|
||||
movq xmm0, rax
|
||||
punpcklqdq xmm1, xmm0
|
||||
mov rdi, rcx
|
||||
mov r8, rax
|
||||
pxor xmm1, XMMWORD PTR [rbp+rcx]
|
||||
xor ebp, 48
|
||||
paddq xmm1, xmm8
|
||||
xor r8, QWORD PTR [rbp+rcx+8]
|
||||
xor rdx, QWORD PTR [rbp+rcx]
|
||||
add r13, r8
|
||||
movdqu xmm2, XMMWORD PTR [rbp+rcx]
|
||||
add r15, rdx
|
||||
movdqu XMMWORD PTR [rbp+rcx], xmm1
|
||||
paddq xmm2, xmm4
|
||||
xor ebp, 16
|
||||
mov eax, ebp
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [rbp+rcx]
|
||||
movdqu XMMWORD PTR [rbp+rcx], xmm2
|
||||
paddq xmm0, xmm10
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm0
|
||||
movq rax, xmm3
|
||||
movdqa xmm10, xmm8
|
||||
mov QWORD PTR [r10+rcx], r15
|
||||
movdqa xmm8, xmm5
|
||||
xor r15, rax
|
||||
mov QWORD PTR [r10+rcx+8], r13
|
||||
mov r8d, r15d
|
||||
xor r13, r9
|
||||
and r8d, 2097136
|
||||
dec r11d
|
||||
jnz FN_PREFIX(CryptonightWOW_template_double_mainloop)
|
||||
|
||||
/* Two-hash routine: epilogue.
 * Reloads rbx from the slot written in double_part1 (7 pushes + 320-byte
 * frame = 376 bytes below the entry rsp, hence offset 400), restores
 * xmm6-xmm15 from the frame, releases it, pops the pushed registers in
 * reverse order and returns. The trailing _double_end label marks the end
 * of the routine. */
FN_PREFIX(CryptonightWOW_template_double_part4):
|
||||
|
||||
mov rbx, QWORD PTR [rsp+400]
|
||||
movaps xmm6, XMMWORD PTR [rsp+160]
|
||||
movaps xmm7, XMMWORD PTR [rsp+176]
|
||||
movaps xmm8, XMMWORD PTR [rsp+192]
|
||||
movaps xmm9, XMMWORD PTR [rsp+208]
|
||||
movaps xmm10, XMMWORD PTR [rsp+224]
|
||||
movaps xmm11, XMMWORD PTR [rsp+240]
|
||||
movaps xmm12, XMMWORD PTR [rsp+256]
|
||||
movaps xmm13, XMMWORD PTR [rsp+272]
|
||||
movaps xmm14, XMMWORD PTR [rsp+288]
|
||||
movaps xmm15, XMMWORD PTR [rsp+304]
|
||||
add rsp, 320
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
ret 0
|
||||
FN_PREFIX(CryptonightWOW_template_double_end):
|
||||
491
src/crypto/cn/asm/CryptonightWOW_template_win.inc
Normal file
491
src/crypto/cn/asm/CryptonightWOW_template_win.inc
Normal file
@@ -0,0 +1,491 @@
|
||||
; Export every label of the CryptonightWOW template code:
; part1 .. end delimit the single-hash routine,
; double_part1 .. double_end delimit the two-hash routine.
PUBLIC CryptonightWOW_template_part1
|
||||
PUBLIC CryptonightWOW_template_mainloop
|
||||
PUBLIC CryptonightWOW_template_part2
|
||||
PUBLIC CryptonightWOW_template_part3
|
||||
PUBLIC CryptonightWOW_template_end
|
||||
PUBLIC CryptonightWOW_template_double_part1
|
||||
PUBLIC CryptonightWOW_template_double_mainloop
|
||||
PUBLIC CryptonightWOW_template_double_part2
|
||||
PUBLIC CryptonightWOW_template_double_part3
|
||||
PUBLIC CryptonightWOW_template_double_part4
|
||||
PUBLIC CryptonightWOW_template_double_end
|
||||
|
||||
; Single-hash routine: entry and register setup.
; In: rcx = address of a slot holding the context pointer (dereferenced first).
; rbx/rbp/rsi are saved in stack slots above the return address (presumably
; the caller-provided home space of the Windows x64 convention - confirm);
; r10-r15 and rdi are pushed; xmm6-xmm9 are spilled into a 64-byte frame.
; State handed to the main loop (ctx = context pointer):
;   r11     = qword [ctx+224], base address of every indexed access
;   rsp     = [ctx+32] ^ [ctx]      (the real rsp is parked in xmm9)
;   r15     = [ctx+40] ^ [ctx+8]
;   xmm6    = ([ctx+48] ^ [ctx+16], [ctx+56] ^ [ctx+24])
;   xmm7    = ([ctx+80] ^ [ctx+64], [ctx+88] ^ [ctx+72])
;   r9d     = low dword of rsp value masked with 2097136 (0x1FFFF0)
;   r8d     = 524288, iteration counter
;   ebx, esi, edi, ebp = dwords at [ctx+96], [ctx+100], [ctx+104], [ctx+108]
ALIGN(64)
|
||||
CryptonightWOW_template_part1:
|
||||
mov rcx, [rcx]
|
||||
|
||||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rdi
|
||||
sub rsp, 64
|
||||
mov r12, rcx
|
||||
mov r8, QWORD PTR [r12+32]
|
||||
mov rdx, r12
|
||||
xor r8, QWORD PTR [r12]
|
||||
mov r15, QWORD PTR [r12+40]
|
||||
mov r9, r8
|
||||
xor r15, QWORD PTR [r12+8]
|
||||
mov r11, QWORD PTR [r12+224]
|
||||
mov r12, QWORD PTR [r12+56]
|
||||
xor r12, QWORD PTR [rdx+24]
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm0, r12
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
movaps XMMWORD PTR [rsp], xmm9
|
||||
mov r12, QWORD PTR [rdx+88]
|
||||
xor r12, QWORD PTR [rdx+72]
|
||||
movq xmm6, rax
|
||||
mov rax, QWORD PTR [rdx+80]
|
||||
xor rax, QWORD PTR [rdx+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
and r9d, 2097136
|
||||
movq xmm0, r12
|
||||
movq xmm7, rax
|
||||
punpcklqdq xmm7, xmm0
|
||||
mov r10d, r9d
|
||||
; From here until part3 rsp holds data, not a stack address.
movq xmm9, rsp
|
||||
mov rsp, r8
|
||||
; Iteration counter, decremented at the end of part2.
mov r8d, 524288
|
||||
|
||||
mov ebx, [rdx+96]
|
||||
mov esi, [rdx+100]
|
||||
mov edi, [rdx+104]
|
||||
mov ebp, [rdx+108]
|
||||
|
||||
; Single-hash routine: first half of one iteration.
;   xmm5 = aesenc(chunk at [r11+r9], round key xmm4 = (rsp, r15))
;   r10d = low dword of xmm5 masked with 2097136, index for the second half
;   r12  = low qword of xmm5
; The three neighbouring 16-byte chunks are read, summed and written back
; to a different neighbour:
;   [idx^16] + xmm6 -> [idx^32]
;   [idx^32] + xmm4 -> [idx^48]
;   [idx^48] + xmm7 -> [idx^16]
; xmm5 ^ xmm6 is stored at the original index (address kept in rdx).
;   r13 = ((ebx+esi) | ((edi+ebp) << 32)) ^ qword [r11+r10]
;   r14 = qword [r11+r10+8]
ALIGN(64)
|
||||
CryptonightWOW_template_mainloop:
|
||||
movdqa xmm5, XMMWORD PTR [r9+r11]
|
||||
movq xmm0, r15
|
||||
movq xmm4, rsp
|
||||
punpcklqdq xmm4, xmm0
|
||||
lea rdx, QWORD PTR [r9+r11]
|
||||
|
||||
aesenc xmm5, xmm4
|
||||
movd r10d, xmm5
|
||||
and r10d, 2097136
|
||||
|
||||
mov r12d, r9d
|
||||
mov eax, r9d
|
||||
xor r9d, 48
|
||||
xor r12d, 16
|
||||
xor eax, 32
|
||||
movdqu xmm0, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm2, XMMWORD PTR [r12+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm2, xmm6
|
||||
paddq xmm1, xmm4
|
||||
movdqu XMMWORD PTR [r12+r11], xmm0
|
||||
movq r12, xmm5
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movdqu XMMWORD PTR [r9+r11], xmm1
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
|
||||
; 32-bit lea zero-extends, so r13/rdx hold clean 32-bit sums before the merge.
lea r13d, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or r13, rdx
|
||||
|
||||
xor r13, QWORD PTR [r10+r11]
|
||||
mov r14, QWORD PTR [r10+r11+8]
|
||||
|
||||
; NOTE(review): eax, edx and r9d loaded below are overwritten at the top of
; part2 before being read there; presumably code inserted between the two
; labels at runtime consumes them - confirm against the code that uses
; these templates.
movd eax, xmm6
|
||||
movd edx, xmm7
|
||||
pextrd r9d, xmm7, 2
|
||||
|
||||
; Single-hash routine: second half of one iteration.
;   rdx:rax = r13 * r12 (unsigned 64x64 -> 128); xmm3 = (rdx, rax)
; With idx = r10d, the neighbouring chunks are updated as:
;   [idx^16] <- xmm7 + old [idx^48]
;   [idx^32] <- (xmm3 ^ old [idx^16]) + xmm6
;   [idx^48] <- old [idx^32] + xmm4
; rdx/rax are xored with the two qwords of old [idx^32], then
;   rsp += rdx, r15 += rax, both stored at [r11+idx], [r11+idx+8]
;   rsp ^= r13, r15 ^= r14
;   r9d  = esp masked with 2097136, index for the next iteration
;   xmm7 <- xmm6 <- xmm5
; Loops back to mainloop until r8d reaches zero.
CryptonightWOW_template_part2:
|
||||
mov rax, r13
|
||||
mul r12
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
punpcklqdq xmm3, xmm0
|
||||
|
||||
mov r9d, r10d
|
||||
mov r12d, r10d
|
||||
xor r9d, 16
|
||||
xor r12d, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [r12+r11]
|
||||
xor rdx, QWORD PTR [r12+r11]
|
||||
xor rax, QWORD PTR [r11+r12+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+r11]
|
||||
pxor xmm3, xmm2
|
||||
paddq xmm7, XMMWORD PTR [r10+r11]
|
||||
paddq xmm1, xmm4
|
||||
paddq xmm3, xmm6
|
||||
movdqu XMMWORD PTR [r9+r11], xmm7
|
||||
movdqu XMMWORD PTR [r12+r11], xmm3
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
|
||||
movdqa xmm7, xmm6
|
||||
add r15, rax
|
||||
add rsp, rdx
|
||||
; Undo the earlier xor with 48 so r10 is the unmodified index again.
xor r10, 48
|
||||
mov QWORD PTR [r10+r11], rsp
|
||||
xor rsp, r13
|
||||
mov r9d, esp
|
||||
mov QWORD PTR [r10+r11+8], r15
|
||||
and r9d, 2097136
|
||||
xor r15, r14
|
||||
movdqa xmm6, xmm5
|
||||
dec r8d
|
||||
jnz CryptonightWOW_template_mainloop
|
||||
|
||||
; Single-hash routine: epilogue.
; Recovers the real stack pointer from xmm9, reloads rbx/rbp/rsi from the
; slots written in part1 (7 pushes + 64-byte frame = 120 bytes below the
; entry rsp, hence offsets 136/144/152), restores xmm6-xmm9, pops the
; pushed registers in reverse order and returns. The trailing _end label
; marks the end of the routine.
CryptonightWOW_template_part3:
|
||||
movq rsp, xmm9
|
||||
|
||||
mov rbx, QWORD PTR [rsp+136]
|
||||
mov rbp, QWORD PTR [rsp+144]
|
||||
mov rsi, QWORD PTR [rsp+152]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+16]
|
||||
movaps xmm9, XMMWORD PTR [rsp]
|
||||
add rsp, 64
|
||||
pop rdi
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
ret 0
|
||||
CryptonightWOW_template_end:
|
||||
|
||||
; Two-hash routine: entry and register setup.
; In: rcx = address of two consecutive context pointers
;     ([rcx] = ctx0, [rcx+8] = ctx1).
; rbx is saved at [rsp+24] above the return address; rbp, rsi, rdi and
; r12-r15 are pushed; a 320-byte frame is reserved:
;   [rsp+0..15]    copy of [ctx1+96..111]
;   [rsp+16..31]   copy of [ctx0+96..111]
;   [rsp+104/112]  scratch slots used inside the loop
;   [rsp+128/136]  base0 / base1
;   [rsp+160..319] saved xmm6-xmm15
; State handed to the main loop:
;   hash 0: rsi = base0 = [ctx0+224], r14 = [ctx0+32]^[ctx0],
;           r12 = [ctx0+40]^[ctx0+8], ebx = r14d & 2097136,
;           xmm7 = ([ctx0+48]^[ctx0+16], [ctx0+56]^[ctx0+24]),
;           xmm9 = ([ctx0+80]^[ctx0+64], [ctx0+88]^[ctx0+72])
;   hash 1: rdi = base1 = [ctx1+224], r15, r13, r8d, xmm8, xmm10 built the
;           same way from ctx1
;   xmm14 / xmm15 = base0 / base1, r11d = 524288 iteration counter
ALIGN(64)
|
||||
CryptonightWOW_template_double_part1:
|
||||
mov rdx, [rcx+8]
|
||||
mov rcx, [rcx]
|
||||
|
||||
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 320
|
||||
mov r14, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r14, QWORD PTR [rcx]
|
||||
mov r12, QWORD PTR [rcx+40]
|
||||
mov ebx, r14d
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
and ebx, 2097136
|
||||
xor r12, QWORD PTR [rcx+8]
|
||||
mov rcx, QWORD PTR [rcx+56]
|
||||
xor rcx, QWORD PTR [r8+24]
|
||||
mov rax, QWORD PTR [r8+48]
|
||||
xor rax, QWORD PTR [r8+16]
|
||||
mov r15, QWORD PTR [rdx+32]
|
||||
xor r15, QWORD PTR [rdx]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r8+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
mov r13, QWORD PTR [rdx+40]
|
||||
mov rdi, QWORD PTR [rdx+224]
|
||||
xor r13, QWORD PTR [rdx+8]
|
||||
movaps XMMWORD PTR [rsp+160], xmm6
|
||||
movaps XMMWORD PTR [rsp+176], xmm7
|
||||
movaps XMMWORD PTR [rsp+192], xmm8
|
||||
movaps XMMWORD PTR [rsp+208], xmm9
|
||||
movaps XMMWORD PTR [rsp+224], xmm10
|
||||
movaps XMMWORD PTR [rsp+240], xmm11
|
||||
movaps XMMWORD PTR [rsp+256], xmm12
|
||||
movaps XMMWORD PTR [rsp+272], xmm13
|
||||
movaps XMMWORD PTR [rsp+288], xmm14
|
||||
movaps XMMWORD PTR [rsp+304], xmm15
|
||||
movq xmm7, rax
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
|
||||
; Copy the 16 bytes at offset 96 of each context into the frame.
movaps xmm1, XMMWORD PTR [rdx+96]
|
||||
movaps xmm2, XMMWORD PTR [r8+96]
|
||||
movaps XMMWORD PTR [rsp], xmm1
|
||||
movaps XMMWORD PTR [rsp+16], xmm2
|
||||
|
||||
; r8 stops being the ctx0 pointer here; the remaining loads use rdx (ctx1).
mov r8d, r15d
|
||||
punpcklqdq xmm7, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [rdx+56]
|
||||
xor rcx, QWORD PTR [rdx+24]
|
||||
movq xmm9, rax
|
||||
mov QWORD PTR [rsp+128], rsi
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
punpcklqdq xmm9, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [rdx+88]
|
||||
xor rcx, QWORD PTR [rdx+72]
|
||||
movq xmm8, rax
|
||||
mov QWORD PTR [rsp+136], rdi
|
||||
mov rax, QWORD PTR [rdx+80]
|
||||
xor rax, QWORD PTR [rdx+64]
|
||||
punpcklqdq xmm8, xmm0
|
||||
and r8d, 2097136
|
||||
movq xmm0, rcx
|
||||
mov r11d, 524288
|
||||
movq xmm10, rax
|
||||
punpcklqdq xmm10, xmm0
|
||||
|
||||
movq xmm14, QWORD PTR [rsp+128]
|
||||
movq xmm15, QWORD PTR [rsp+136]
|
||||
|
||||
; Two-hash routine: first half of one iteration for both hashes.
; Hash 0 (base rsi, index ebx, round key xmm3 = (r14, r12)):
;   xmm6 = aesenc([rsi+idx], xmm3); rdx = low qword of xmm6
;   [idx^16] + xmm7 -> [idx^32], [idx^32] + xmm3 -> [idx^48],
;   [idx^48] + xmm9 -> [idx^16], xmm6 ^ xmm7 -> [idx]
;   esi = edx & 2097136 (rsi stops being base0; xmm14 still holds it)
; Hash 1 (base rdi, index r8d, round key xmm4 = (r15, r13)):
;   same steps with xmm5, xmm8, xmm4, xmm10; rdi = low qword of xmm5
; Then, for hash 0: r8 = qword [base0+esi], r10 = qword [base0+esi+8],
; r9 = address base0+esi, esi ^= 16.
; rsp, rsi, rdi, rbp, r15, rdx are parked in xmm0, xmm1, xmm2, xmm11,
; xmm12, xmm13 and rcx, r9 in [rsp+104], [rsp+112]; ebx/esi/edi/ebp are
; loaded from [rsp+16..28] and
;   r8 ^= (ebx+esi) | ((edi+ebp) << 32)
ALIGN(64)
|
||||
CryptonightWOW_template_double_mainloop:
|
||||
movdqu xmm6, XMMWORD PTR [rbx+rsi]
|
||||
movq xmm0, r12
|
||||
mov ecx, ebx
|
||||
movq xmm3, r14
|
||||
punpcklqdq xmm3, xmm0
|
||||
xor ebx, 16
|
||||
aesenc xmm6, xmm3
|
||||
movq rdx, xmm6
|
||||
movq xmm4, r15
|
||||
movdqu xmm0, XMMWORD PTR [rbx+rsi]
|
||||
xor ebx, 48
|
||||
paddq xmm0, xmm7
|
||||
movdqu xmm1, XMMWORD PTR [rbx+rsi]
|
||||
movdqu XMMWORD PTR [rbx+rsi], xmm0
|
||||
paddq xmm1, xmm3
|
||||
xor ebx, 16
|
||||
mov eax, ebx
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [rbx+rsi]
|
||||
movdqu XMMWORD PTR [rbx+rsi], xmm1
|
||||
paddq xmm0, xmm9
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm0
|
||||
mov esi, edx
|
||||
movdqu xmm5, XMMWORD PTR [r8+rdi]
|
||||
and esi, 2097136
|
||||
mov ecx, r8d
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm4, xmm0
|
||||
xor r8d, 16
|
||||
aesenc xmm5, xmm4
|
||||
movdqu xmm0, XMMWORD PTR [r8+rdi]
|
||||
xor r8d, 48
|
||||
paddq xmm0, xmm8
|
||||
movdqu xmm1, XMMWORD PTR [r8+rdi]
|
||||
movdqu XMMWORD PTR [r8+rdi], xmm0
|
||||
paddq xmm1, xmm4
|
||||
xor r8d, 16
|
||||
mov eax, r8d
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [r8+rdi]
|
||||
movdqu XMMWORD PTR [r8+rdi], xmm1
|
||||
paddq xmm0, xmm10
|
||||
movdqu XMMWORD PTR [rax+rdi], xmm0
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rcx+rdi], xmm0
|
||||
movq rdi, xmm5
|
||||
movq rcx, xmm14
|
||||
mov ebp, edi
|
||||
mov r8, QWORD PTR [rcx+rsi]
|
||||
mov r10, QWORD PTR [rcx+rsi+8]
|
||||
lea r9, QWORD PTR [rcx+rsi]
|
||||
xor esi, 16
|
||||
|
||||
movq xmm0, rsp
|
||||
movq xmm1, rsi
|
||||
movq xmm2, rdi
|
||||
movq xmm11, rbp
|
||||
movq xmm12, r15
|
||||
movq xmm13, rdx
|
||||
mov [rsp+104], rcx
|
||||
mov [rsp+112], r9
|
||||
|
||||
mov ebx, DWORD PTR [rsp+16]
|
||||
mov esi, DWORD PTR [rsp+20]
|
||||
mov edi, DWORD PTR [rsp+24]
|
||||
mov ebp, DWORD PTR [rsp+28]
|
||||
|
||||
lea eax, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r8, rax
|
||||
|
||||
; NOTE(review): esp, r15d, eax, edx and r9d loaded below are all replaced
; at the top of double_part2 before being read there; presumably code
; inserted between the two labels at runtime consumes them - confirm
; against the code that uses these templates.
movd esp, xmm3
|
||||
pextrd r15d, xmm3, 2
|
||||
movd eax, xmm7
|
||||
movd edx, xmm9
|
||||
pextrd r9d, xmm9, 2
|
||||
|
||||
; Two-hash routine: second half of the iteration for hash 0, then the
; register hand-over for hash 1.
; Restores rsp from xmm0, writes ebx/esi/edi/ebp back to [rsp+16..28] and
; reloads rsi, rdi, rbp, r15, rdx, rcx, r9 from where the main loop parked
; them (rcx = base0, esi = idx^16, r9 = address base0+idx).
;   rdx:rax = r8 * rdx; xmm1 = (rdx, rax)
;   [idx^32] <- (xmm1 ^ old [idx^16]) + xmm7
;   [idx^48] <- old [idx^32] + xmm3
;   [idx^16] <- old [idx^48] + xmm9
;   r14 += rdx ^ old [idx^32] qword 0, stored at [r9]
;   r12 += rax ^ old [idx^32] qword 1, stored at [r9+8]
;   r14 ^= multiplier (kept in rbx), r12 ^= r10
;   ebx = r14d & 2097136, next index for hash 0; rsi = base0 again
;   xmm9 <- xmm7 <- xmm6
; Hash 1 hand-over: rcx = base1 (xmm15), r10d = ebp & 2097136,
; r8/r9 = the two qwords at [base1+r10], ebp = r10d ^ 16; registers are
; parked again, ebx/esi/edi/ebp are loaded from [rsp+0..12] and
;   r8 ^= (ebx+esi) | ((edi+ebp) << 32), copy kept in xmm3
CryptonightWOW_template_double_part2:
|
||||
|
||||
movq rsp, xmm0
|
||||
mov DWORD PTR [rsp+16], ebx
|
||||
mov DWORD PTR [rsp+20], esi
|
||||
mov DWORD PTR [rsp+24], edi
|
||||
mov DWORD PTR [rsp+28], ebp
|
||||
|
||||
movq rsi, xmm1
|
||||
movq rdi, xmm2
|
||||
movq rbp, xmm11
|
||||
movq r15, xmm12
|
||||
movq rdx, xmm13
|
||||
mov rcx, [rsp+104]
|
||||
mov r9, [rsp+112]
|
||||
|
||||
mov rbx, r8
|
||||
mov rax, r8
|
||||
mul rdx
|
||||
and ebp, 2097136
|
||||
mov r8, rax
|
||||
movq xmm1, rdx
|
||||
movq xmm0, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
pxor xmm1, XMMWORD PTR [rcx+rsi]
|
||||
xor esi, 48
|
||||
paddq xmm1, xmm7
|
||||
movdqu xmm2, XMMWORD PTR [rsi+rcx]
|
||||
xor rdx, QWORD PTR [rsi+rcx]
|
||||
paddq xmm2, xmm3
|
||||
xor r8, QWORD PTR [rsi+rcx+8]
|
||||
movdqu XMMWORD PTR [rsi+rcx], xmm1
|
||||
xor esi, 16
|
||||
mov eax, esi
|
||||
mov rsi, rcx
|
||||
movdqu xmm0, XMMWORD PTR [rax+rcx]
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm2
|
||||
paddq xmm0, xmm9
|
||||
add r12, r8
|
||||
xor rax, 32
|
||||
add r14, rdx
|
||||
movdqa xmm9, xmm7
|
||||
movdqa xmm7, xmm6
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm0
|
||||
mov QWORD PTR [r9+8], r12
|
||||
xor r12, r10
|
||||
mov QWORD PTR [r9], r14
|
||||
movq rcx, xmm15
|
||||
xor r14, rbx
|
||||
mov r10d, ebp
|
||||
mov ebx, r14d
|
||||
xor ebp, 16
|
||||
and ebx, 2097136
|
||||
mov r8, QWORD PTR [r10+rcx]
|
||||
mov r9, QWORD PTR [r10+rcx+8]
|
||||
|
||||
movq xmm0, rsp
|
||||
movq xmm1, rbx
|
||||
movq xmm2, rsi
|
||||
movq xmm11, rdi
|
||||
movq xmm12, rbp
|
||||
movq xmm13, r15
|
||||
mov [rsp+104], rcx
|
||||
mov [rsp+112], r9
|
||||
|
||||
mov ebx, DWORD PTR [rsp]
|
||||
mov esi, DWORD PTR [rsp+4]
|
||||
mov edi, DWORD PTR [rsp+8]
|
||||
mov ebp, DWORD PTR [rsp+12]
|
||||
|
||||
lea eax, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
|
||||
xor r8, rax
|
||||
movq xmm3, r8
|
||||
|
||||
; NOTE(review): esp, r15d, eax, edx and r9d loaded below are all replaced
; at the top of double_part3 before being read there; presumably inputs
; for code inserted at runtime - confirm.
movd esp, xmm4
|
||||
pextrd r15d, xmm4, 2
|
||||
movd eax, xmm8
|
||||
movd edx, xmm10
|
||||
pextrd r9d, xmm10, 2
|
||||
|
||||
; Two-hash routine: second half of the iteration for hash 1 and loop end.
; Restores rsp from xmm0, writes ebx/esi/edi/ebp back to [rsp+0..12] and
; reloads rbx, rsi, rdi, rbp, r15, rcx, r9 (rcx = base1, ebp = idx^16,
; r10 = idx, rdi = low qword of the hash 1 AES result).
;   rdx:rax = r8 * rdi; xmm1 = (rdx, rax); rdi = base1 again
;   [idx^32] <- (xmm1 ^ old [idx^16]) + xmm8
;   [idx^48] <- old [idx^32] + xmm4
;   [idx^16] <- old [idx^48] + xmm10
;   r15 += rdx ^ old [idx^32] qword 0, stored at [base1+idx]
;   r13 += rax ^ old [idx^32] qword 1, stored at [base1+idx+8]
;   r15 ^= multiplier (kept in xmm3), r13 ^= r9
;   r8d = r15d & 2097136, next index for hash 1
;   xmm10 <- xmm8 <- xmm5
; Loops back to double_mainloop until r11d reaches zero.
CryptonightWOW_template_double_part3:
|
||||
|
||||
movq rsp, xmm0
|
||||
mov DWORD PTR [rsp], ebx
|
||||
mov DWORD PTR [rsp+4], esi
|
||||
mov DWORD PTR [rsp+8], edi
|
||||
mov DWORD PTR [rsp+12], ebp
|
||||
|
||||
movq rbx, xmm1
|
||||
movq rsi, xmm2
|
||||
movq rdi, xmm11
|
||||
movq rbp, xmm12
|
||||
movq r15, xmm13
|
||||
mov rcx, [rsp+104]
|
||||
mov r9, [rsp+112]
|
||||
|
||||
mov rax, r8
|
||||
mul rdi
|
||||
movq xmm1, rdx
|
||||
movq xmm0, rax
|
||||
punpcklqdq xmm1, xmm0
|
||||
mov rdi, rcx
|
||||
mov r8, rax
|
||||
pxor xmm1, XMMWORD PTR [rbp+rcx]
|
||||
xor ebp, 48
|
||||
paddq xmm1, xmm8
|
||||
xor r8, QWORD PTR [rbp+rcx+8]
|
||||
xor rdx, QWORD PTR [rbp+rcx]
|
||||
add r13, r8
|
||||
movdqu xmm2, XMMWORD PTR [rbp+rcx]
|
||||
add r15, rdx
|
||||
movdqu XMMWORD PTR [rbp+rcx], xmm1
|
||||
paddq xmm2, xmm4
|
||||
xor ebp, 16
|
||||
mov eax, ebp
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [rbp+rcx]
|
||||
movdqu XMMWORD PTR [rbp+rcx], xmm2
|
||||
paddq xmm0, xmm10
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm0
|
||||
movq rax, xmm3
|
||||
movdqa xmm10, xmm8
|
||||
mov QWORD PTR [r10+rcx], r15
|
||||
movdqa xmm8, xmm5
|
||||
xor r15, rax
|
||||
mov QWORD PTR [r10+rcx+8], r13
|
||||
mov r8d, r15d
|
||||
xor r13, r9
|
||||
and r8d, 2097136
|
||||
dec r11d
|
||||
jnz CryptonightWOW_template_double_mainloop
|
||||
|
||||
; Two-hash routine: epilogue.
; Reloads rbx from the slot written in double_part1 (7 pushes + 320-byte
; frame = 376 bytes below the entry rsp, hence offset 400), restores
; xmm6-xmm15 from the frame, releases it, pops the pushed registers in
; reverse order and returns. The trailing _double_end label marks the end
; of the routine.
CryptonightWOW_template_double_part4:
|
||||
|
||||
mov rbx, QWORD PTR [rsp+400]
|
||||
movaps xmm6, XMMWORD PTR [rsp+160]
|
||||
movaps xmm7, XMMWORD PTR [rsp+176]
|
||||
movaps xmm8, XMMWORD PTR [rsp+192]
|
||||
movaps xmm9, XMMWORD PTR [rsp+208]
|
||||
movaps xmm10, XMMWORD PTR [rsp+224]
|
||||
movaps xmm11, XMMWORD PTR [rsp+240]
|
||||
movaps xmm12, XMMWORD PTR [rsp+256]
|
||||
movaps xmm13, XMMWORD PTR [rsp+272]
|
||||
movaps xmm14, XMMWORD PTR [rsp+288]
|
||||
movaps xmm15, XMMWORD PTR [rsp+304]
|
||||
add rsp, 320
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
ret 0
|
||||
CryptonightWOW_template_double_end:
|
||||
413
src/crypto/cn/asm/cn2/cnv2_double_main_loop_sandybridge.inc
Normal file
413
src/crypto/cn/asm/cn2/cnv2_double_main_loop_sandybridge.inc
Normal file
@@ -0,0 +1,413 @@
|
||||
/*
 * Setup for the two-lane CNv2 main loop (file name says Sandy Bridge variant).
 *
 * Entry: rcx points at an array of two pointers; [rcx] and [rcx+8] are loaded as the
 * per-lane state pointers (presumably cryptonight_ctx* -- confirm against the caller).
 *
 * Frame: rax keeps the entry rsp; rbx, rbp, rsi, rdi, r12-r15 are pushed (64 bytes) and
 * 184 bytes of locals are reserved, so the return address sits at [rsp+248].
 *   [rsp+16 .. rsp+175]  saved xmm15 .. xmm6 (rax-88/-104/-120 == rsp+160/144/128)
 *   [rsp]                spill of r13 (lane-0 buffer base)
 *   [rsp+8]              written by the loop, not here
 *   [rsp+256],[rsp+264]  lane-0 r10/r11 spill; [rsp+272] saved MXCSR; [rsp+276] new MXCSR
 * The +256..+279 slots lie above the return address, i.e. in caller-owned stack
 * (presumably the Win64 home area -- confirm).
 *
 * MXCSR is replaced with 24448 (0x5F80): exception mask bits 7-12 set, bit 14 set
 * (rounding-control field = 10b).
 *
 * Register state handed to the loop:
 *   r13 / rsi   = [ctx0+224] / [ctx1+224]  (buffer bases; presumably the scratchpads)
 *   r10:r11     = ctx0 qwords [32]^[0] : [40]^[8];   rdi:rbp = same for ctx1
 *   xmm7, xmm3  = ctx0 {[48]^[16],[56]^[24]}, {[80]^[64],[88]^[72]}
 *   xmm6, xmm8  = the same two values for ctx1
 *   xmm4, xmm5  = {ctx0[96], ctx1[96]}, {ctx0[104], ctx1[104]}
 *   edx / ecx   = r10 & 2097136 / rdi & 2097136 (0x1FFFF0: 16-byte aligned, below 2 MiB)
 *   r9 / r8     = address of that 16-byte line in each buffer; xmm15 / xmm11 = its contents
 *   r14d        = 524288 (0x80000) loop count;  xmm13 = 0
 *   xmm14       = 1<<52 in both qwords;  xmm12 = 1023<<52 (0x3FF0000000000000) in both qwords
 */
mov rdx, [rcx+8]
|
||||
mov rcx, [rcx]
|
||||
|
||||
mov rax, rsp
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 184
|
||||
|
||||
stmxcsr DWORD PTR [rsp+272]
|
||||
mov DWORD PTR [rsp+276], 24448
|
||||
ldmxcsr DWORD PTR [rsp+276]
|
||||
|
||||
mov r13, QWORD PTR [rcx+224]
|
||||
mov r9, rdx
|
||||
mov r10, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r10, QWORD PTR [rcx]
|
||||
mov r14d, 524288
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rsi, QWORD PTR [rdx+224]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov rdi, QWORD PTR [r9+32]
|
||||
xor rdi, QWORD PTR [r9]
|
||||
mov rbp, QWORD PTR [r9+40]
|
||||
xor rbp, QWORD PTR [r9+8]
|
||||
movq xmm0, rdx
|
||||
movaps XMMWORD PTR [rax-88], xmm6
|
||||
movaps XMMWORD PTR [rax-104], xmm7
|
||||
movaps XMMWORD PTR [rax-120], xmm8
|
||||
movaps XMMWORD PTR [rsp+112], xmm9
|
||||
movaps XMMWORD PTR [rsp+96], xmm10
|
||||
movaps XMMWORD PTR [rsp+80], xmm11
|
||||
movaps XMMWORD PTR [rsp+64], xmm12
|
||||
movaps XMMWORD PTR [rsp+48], xmm13
|
||||
movaps XMMWORD PTR [rsp+32], xmm14
|
||||
movaps XMMWORD PTR [rsp+16], xmm15
|
||||
mov rdx, r10
|
||||
movq xmm4, QWORD PTR [r8+96]
|
||||
and edx, 2097136
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xorps xmm13, xmm13
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
movq xmm5, QWORD PTR [r8+104]
|
||||
movq xmm7, rax
|
||||
|
||||
|
||||
/* xmm14 = 1<<52 broadcast to both qwords. */
mov eax, 1
|
||||
shl rax, 52
|
||||
movq xmm14, rax
|
||||
punpcklqdq xmm14, xmm14
|
||||
|
||||
/* xmm12 = 1023<<52 broadcast to both qwords. */
mov eax, 1023
|
||||
shl rax, 52
|
||||
movq xmm12, rax
|
||||
punpcklqdq xmm12, xmm12
|
||||
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
punpcklqdq xmm7, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r9+56]
|
||||
xor rcx, QWORD PTR [r9+24]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [r9+48]
|
||||
xor rax, QWORD PTR [r9+16]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp], r13
|
||||
mov rcx, QWORD PTR [r9+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm6, rax
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp+256], r10
|
||||
mov rcx, rdi
|
||||
mov QWORD PTR [rsp+264], r11
|
||||
movq xmm8, rax
|
||||
and ecx, 2097136
|
||||
punpcklqdq xmm8, xmm0
|
||||
movq xmm0, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, QWORD PTR [r9+104]
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
movdqu xmm11, XMMWORD PTR [r8]
|
||||
punpcklqdq xmm5, xmm0
|
||||
lea r9, QWORD PTR [rdx+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r9]
|
||||
|
||||
ALIGN(64)
|
||||
/*
 * Loop body: one iteration advances both lanes; r14d counts down from the value set
 * in the setup code.
 *
 * Per lane, in order:
 *   1. aesenc of the current 16-byte line (xmm15 / xmm11) with the lane's r10:r11 /
 *      rdi:rbp pair; result kept in xmm9 / xmm10.
 *   2. The three lines at offset^16, ^32, ^48 are loaded, paddq'd with xmm7/xmm2/xmm3
 *      (lane 1: xmm6/xmm11/xmm8) and stored back to different slots.
 *   3. The aesenc result XOR xmm7 (xmm6) is written to the line that was just read.
 *   4. 64x64->128 mul of the next line's low qword (after XOR with values taken from
 *      xmm4/xmm5) by the low qword of the aesenc result, followed by a second
 *      load/add/store shuffle of the three neighbouring lines.
 *   5. Division and square-root values for both lanes are produced together with
 *      divpd / sqrtpd and corrected out of line (div_fix_*, sqrt_fix_*); the results
 *      land in xmm4 (division) and xmm5 (square root) for the next iteration.
 * Every buffer offset is masked with 2097136 (0x1FFFF0).
 */
main_loop_double_sandybridge:
|
||||
movdqu xmm9, xmm15
|
||||
mov eax, edx
|
||||
mov ebx, edx
|
||||
xor eax, 16
|
||||
xor ebx, 32
|
||||
xor edx, 48
|
||||
|
||||
movq xmm0, r11
|
||||
movq xmm2, r10
|
||||
punpcklqdq xmm2, xmm0
|
||||
aesenc xmm9, xmm2
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||
|
||||
movq r11, xmm9
|
||||
mov edx, r11d
|
||||
and edx, 2097136
|
||||
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9], xmm0
|
||||
|
||||
lea rbx, QWORD PTR [rdx+r13]
|
||||
mov r10, QWORD PTR [rdx+r13]
|
||||
|
||||
movdqu xmm10, xmm11
|
||||
movq xmm0, rbp
|
||||
movq xmm11, rdi
|
||||
punpcklqdq xmm11, xmm0
|
||||
aesenc xmm10, xmm11
|
||||
|
||||
mov eax, ecx
|
||||
mov r12d, ecx
|
||||
xor eax, 16
|
||||
xor r12d, 32
|
||||
xor ecx, 48
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||
paddq xmm0, xmm6
|
||||
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||
paddq xmm1, xmm11
|
||||
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
paddq xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
|
||||
movq rcx, xmm10
|
||||
and ecx, 2097136
|
||||
|
||||
movdqa xmm0, xmm10
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [r8], xmm0
|
||||
mov r12, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov r9, QWORD PTR [rbx+8]
|
||||
|
||||
xor edx, 16
|
||||
mov r8d, edx
|
||||
mov r15d, edx
|
||||
|
||||
movq rdx, xmm5
|
||||
shl rdx, 32
|
||||
movq rax, xmm4
|
||||
xor rdx, rax
|
||||
xor r10, rdx
|
||||
mov rax, r10
|
||||
mul r11
|
||||
mov r11d, r8d
|
||||
xor r11d, 48
|
||||
movq xmm0, rdx
|
||||
xor rdx, [r11+r13]
|
||||
movq xmm1, rax
|
||||
xor rax, [r11+r13+8]
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||
xor r8d, 32
|
||||
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [r15+r13], xmm0
|
||||
|
||||
mov r11, QWORD PTR [rsp+256]
|
||||
add r11, rdx
|
||||
mov rdx, QWORD PTR [rsp+264]
|
||||
add rdx, rax
|
||||
mov QWORD PTR [rbx], r11
|
||||
xor r11, r10
|
||||
mov QWORD PTR [rbx+8], rdx
|
||||
xor rdx, r9
|
||||
mov QWORD PTR [rsp+256], r11
|
||||
and r11d, 2097136
|
||||
mov QWORD PTR [rsp+264], rdx
|
||||
mov QWORD PTR [rsp+8], r11
|
||||
lea r15, QWORD PTR [r11+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||
lea r13, QWORD PTR [rsi+rcx]
|
||||
/*
 * From here r13 addresses the lane-1 line (rsi+rcx); the lane-0 base is reloaded from
 * [rsp] at the bottom of the loop. Division step: dividends are the high qwords of
 * xmm9/xmm10, halved after +1; divisors are the low dwords of (xmm5<<1 + {xmm9.lo,
 * xmm10.lo}) OR 0x80000001. paddq with xmm14 adds 1 to each quotient's exponent field.
 */
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movaps xmm2, xmm13
|
||||
movq r10, xmm0
|
||||
psllq xmm5, 1
|
||||
shl r10, 32
|
||||
movdqa xmm0, xmm9
|
||||
psrldq xmm0, 8
|
||||
movdqa xmm1, xmm10
|
||||
movq r11, xmm0
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
psrldq xmm4, 8
|
||||
movaps xmm0, xmm13
|
||||
movq rax, xmm4
|
||||
xor r10, rax
|
||||
movaps xmm1, xmm13
|
||||
xor r10, r12
|
||||
lea rax, QWORD PTR [r11+1]
|
||||
shr rax, 1
|
||||
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm3, xmm10
|
||||
paddq xmm5, xmm3
|
||||
movq rdx, xmm5
|
||||
psrldq xmm5, 8
|
||||
cvtsi2sd xmm2, rax
|
||||
or edx, -2147483647
|
||||
lea rax, QWORD PTR [r8+1]
|
||||
shr rax, 1
|
||||
movq r9, xmm5
|
||||
cvtsi2sd xmm0, rax
|
||||
or r9d, -2147483647
|
||||
cvtsi2sd xmm1, rdx
|
||||
unpcklpd xmm2, xmm0
|
||||
movaps xmm0, xmm13
|
||||
cvtsi2sd xmm0, r9
|
||||
unpcklpd xmm1, xmm0
|
||||
divpd xmm2, xmm1
|
||||
paddq xmm2, xmm14
|
||||
cvttsd2si rax, xmm2
|
||||
psrldq xmm2, 8
|
||||
mov rbx, rax
|
||||
imul rax, rdx
|
||||
sub r11, rax
|
||||
js div_fix_1_sandybridge
|
||||
div_fix_1_ret_sandybridge:
|
||||
|
||||
cvttsd2si rdx, xmm2
|
||||
mov rax, rdx
|
||||
imul rax, r9
|
||||
movd xmm2, r11d
|
||||
movd xmm4, ebx
|
||||
sub r8, rax
|
||||
js div_fix_2_sandybridge
|
||||
div_fix_2_ret_sandybridge:
|
||||
|
||||
movd xmm1, r8d
|
||||
movd xmm0, edx
|
||||
punpckldq xmm2, xmm1
|
||||
punpckldq xmm4, xmm0
|
||||
punpckldq xmm4, xmm2
|
||||
paddq xmm3, xmm4
|
||||
/*
 * Square-root step: sqrtpd of ((xmm3 >> 12) + 1023<<52) for both lanes. xmm5 receives
 * the raw results >> 19; a lane whose raw result has its low 19 bits (524287) clear
 * is recomputed out of line.
 */
movdqa xmm0, xmm3
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm12
|
||||
sqrtpd xmm1, xmm0
|
||||
movq r9, xmm1
|
||||
movdqa xmm5, xmm1
|
||||
psrlq xmm5, 19
|
||||
test r9, 524287
|
||||
je sqrt_fix_1_sandybridge
|
||||
sqrt_fix_1_ret_sandybridge:
|
||||
|
||||
movq r9, xmm10
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
test r8, 524287
|
||||
je sqrt_fix_2_sandybridge
|
||||
sqrt_fix_2_ret_sandybridge:
|
||||
|
||||
mov r12d, ecx
|
||||
mov r8d, ecx
|
||||
xor r12d, 16
|
||||
xor r8d, 32
|
||||
xor ecx, 48
|
||||
mov rax, r10
|
||||
mul r9
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
punpcklqdq xmm3, xmm0
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||
pxor xmm0, xmm3
|
||||
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||
xor rdx, [r8+rsi]
|
||||
xor rax, [r8+rsi+8]
|
||||
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||
paddq xmm0, xmm6
|
||||
paddq xmm1, xmm11
|
||||
paddq xmm3, xmm8
|
||||
movdqu XMMWORD PTR [r8+rsi], xmm0
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm3
|
||||
|
||||
add rdi, rdx
|
||||
mov QWORD PTR [r13], rdi
|
||||
xor rdi, r10
|
||||
mov ecx, edi
|
||||
and ecx, 2097136
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov rdx, QWORD PTR [r13+8]
|
||||
add rbp, rax
|
||||
mov QWORD PTR [r13+8], rbp
|
||||
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||
xor rbp, rdx
|
||||
/* Reload lane-0 state spilled earlier and rotate the xmm3<-xmm7<-xmm9 / xmm8<-xmm6<-xmm10 history. */
mov r13, QWORD PTR [rsp]
|
||||
movdqa xmm3, xmm7
|
||||
mov rdx, QWORD PTR [rsp+8]
|
||||
movdqa xmm8, xmm6
|
||||
mov r10, QWORD PTR [rsp+256]
|
||||
movdqa xmm7, xmm9
|
||||
mov r11, QWORD PTR [rsp+264]
|
||||
movdqa xmm6, xmm10
|
||||
mov r9, r15
|
||||
dec r14d
|
||||
jne main_loop_double_sandybridge
|
||||
|
||||
/*
 * Epilogue: restore the MXCSR value saved at [rsp+272], reload xmm6-xmm15 from the
 * slots written in the setup code (r11 = rsp+184, so r11-24 .. r11-120 are
 * rsp+160 .. rsp+64), release the 184-byte local area, pop the GPRs in reverse push
 * order and jump over the out-of-line fixups to the end label.
 */
ldmxcsr DWORD PTR [rsp+272]
|
||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+184]
|
||||
movaps xmm6, XMMWORD PTR [r11-24]
|
||||
movaps xmm7, XMMWORD PTR [r11-40]
|
||||
movaps xmm8, XMMWORD PTR [r11-56]
|
||||
movaps xmm9, XMMWORD PTR [r11-72]
|
||||
movaps xmm10, XMMWORD PTR [r11-88]
|
||||
movaps xmm11, XMMWORD PTR [r11-104]
|
||||
movaps xmm12, XMMWORD PTR [r11-120]
|
||||
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_double_mainloop_asm_sandybridge_endp
|
||||
|
||||
/*
 * Lane-0 division correction, entered when the remainder in r11 went negative:
 * take one off the quotient (rbx) and add the divisor (rdx) back to the remainder.
 */
div_fix_1_sandybridge:
|
||||
dec rbx
|
||||
add r11, rdx
|
||||
jmp div_fix_1_ret_sandybridge
|
||||
|
||||
/*
 * Lane-1 division correction, entered when the remainder in r8 went negative:
 * take one off the quotient (rdx) and add the divisor (r9) back to the remainder.
 */
div_fix_2_sandybridge:
|
||||
dec rdx
|
||||
add r8, r9
|
||||
jmp div_fix_2_ret_sandybridge
|
||||
|
||||
/*
 * Lane-0 square-root recomputation, entered when the raw sqrtpd result in r9 has its
 * low 19 bits clear.
 *   r8   = low qword of xmm3 (the value the square root was taken from, before >>12)
 *   xmm0 = high qword of xmm5, preserved so lane 1 is not disturbed
 *   r11  = 0xFFFFFC0200000000 (mov r11d zero-extends -1022, then << 32)
 * With s = r9-1: r9 = s>>19, rax = s>>20, rdx = (r9 - rax + r11 + 1) * (rax + r11).
 * "sub rdx, r8" sets CF when rdx < r8 (unsigned) and "adc r9, 0" folds that into the
 * result. xmm5 is rebuilt as {r9, saved high qword}. Clobbers rax, rdx, r8, r11.
 */
sqrt_fix_1_sandybridge:
|
||||
movq r8, xmm3
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
dec r9
|
||||
mov r11d, -1022
|
||||
shl r11, 32
|
||||
mov rax, r9
|
||||
shr r9, 19
|
||||
shr rax, 20
|
||||
mov rdx, r9
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+r11+1]
|
||||
add rax, r11
|
||||
imul rdx, rax
|
||||
sub rdx, r8
|
||||
adc r9, 0
|
||||
movq xmm5, r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_1_ret_sandybridge
|
||||
|
||||
/*
 * Lane-1 square-root recomputation, entered when the raw sqrtpd result in r8 (high
 * qword) has its low 19 bits clear.
 *   r11 = high qword of xmm3 (xmm3 is shifted down destructively to fetch it)
 *   rbx = 0xFFFFFC0200000000 (mov ebx zero-extends -1022, then << 32)
 * Same arithmetic as the lane-0 routine, with r8 as the result register. The final
 * punpcklqdq keeps the low qword of xmm5 and installs r8 as its high qword.
 * Clobbers rax, rbx, rdx, r8, r11, xmm0, xmm3.
 */
sqrt_fix_2_sandybridge:
|
||||
psrldq xmm3, 8
|
||||
movq r11, xmm3
|
||||
dec r8
|
||||
mov ebx, -1022
|
||||
shl rbx, 32
|
||||
mov rax, r8
|
||||
shr r8, 19
|
||||
shr rax, 20
|
||||
mov rdx, r8
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+rbx+1]
|
||||
add rax, rbx
|
||||
imul rdx, rax
|
||||
sub rdx, r11
|
||||
adc r8, 0
|
||||
movq xmm0, r8
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_2_ret_sandybridge
|
||||
|
||||
cnv2_double_mainloop_asm_sandybridge_endp:
|
||||
182
src/crypto/cn/asm/cn2/cnv2_main_loop_bulldozer.inc
Normal file
182
src/crypto/cn/asm/cn2/cnv2_main_loop_bulldozer.inc
Normal file
@@ -0,0 +1,182 @@
|
||||
/*
 * Setup for the single-lane CNv2 main loop (file name says Bulldozer variant).
 *
 * Entry: rcx points at a pointer; [rcx] is loaded as the state pointer (presumably
 * cryptonight_ctx* -- confirm against the caller).
 *
 * Frame: rbx, rbp, rsi are stored at entry [rsp+16], [rsp+24], [rsp+32], i.e. above the
 * return address (presumably the Win64 home area -- confirm). rdi, r12-r15 are pushed
 * and 64 bytes of locals reserved:
 *   [rsp]      saved MXCSR        [rsp+4]   new MXCSR
 *   [rsp+16]   saved xmm8         [rsp+32]  saved xmm7        [rsp+48]  saved xmm6
 * MXCSR is replaced with 24448 (0x5F80): exception mask bits 7-12 set, bit 14 set
 * (rounding-control field = 10b).
 *
 * Register state handed to the loop:
 *   rbx      = [ctx+224] (buffer base; presumably the scratchpad)
 *   r8:r11   = ctx qwords [32]^[0] : [40]^[8];   r10d = r8 & 2097136 (0x1FFFF0)
 *   xmm3     = {[48]^[16], [56]^[24]};   xmm4 = {[80]^[64], [88]^[72]}
 *   rdi      = [ctx+104];   r15 = [ctx+96];   ebp = 524288 (0x80000) loop count
 *   xmm8     = 0;   xmm7 = 1023<<52
 * "mov ax, 1023" writes only the low 16 bits of rax; the following shl by 52 shifts
 * every stale upper bit out, so the result is exactly 1023<<52.
 * NOTE(review): the loop and fixup below never read xmm7 -- the Bulldozer loop
 * rebuilds the constant in rdi each iteration; confirm whether xmm7 is still needed.
 */
mov rcx, [rcx]
|
||||
|
||||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 524288
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN(64)
|
||||
/*
 * Loop body; ebp counts down from the value set in the setup code. All 16-byte buffer
 * accesses use movdqa, so rbx plus any masked offset must be 16-byte aligned.
 *
 *   1. xmm5 = aesenc(line at r10+rbx, {r8, r11}); the key pair is built with pinsrq.
 *   2. The lines at offset^16, ^32, ^48 are loaded, paddq'd with xmm3/xmm6/xmm4 and
 *      stored back to different slots; xmm5 XOR xmm3 overwrites the line just read.
 *   3. rsi = r15 XOR (rdi<<32) XOR low qword of the next line (offset = xmm5.lo &
 *      2097136).
 *   4. Division: integer "div r9" of xmm5's high qword (rdx cleared) by
 *      ((2*rdi + xmm5.lo) as 32 bits) OR 0x80000001. r15 = low 32 bits of the quotient
 *      plus (remainder << 32).
 *   5. Square root: sqrtsd of (((xmm5.lo + r15) >> 12) + 1023<<52). rdi = result >> 19,
 *      unless the low 19 bits (524287) are clear, in which case the fixup recomputes it.
 *   6. 64x64->128 mul of rsi by xmm5.lo, a second shuffle of the three neighbouring
 *      lines, then r8:r11 are updated, stored to the line and XORed for the next round.
 */
cnv2_main_loop_bulldozer:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm6, r8
|
||||
pinsrq xmm6, r11, 1
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
|
||||
/* rdi is free again here; reuse it for the 1023<<52 constant added before sqrtsd. */
mov edi, 1023
|
||||
shl rdi, 52
|
||||
|
||||
movq r14, xmm5
|
||||
pextrq rax, xmm5, 1
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
div r9
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
lea r15, [rax+rdx]
|
||||
lea rax, [r14+r15]
|
||||
shr rax, 12
|
||||
add rax, rdi
|
||||
movq xmm0, rax
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287
|
||||
je sqrt_fixup_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_bulldozer_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_bulldozer
|
||||
|
||||
/*
 * Epilogue: restore MXCSR from [rsp] and xmm6-xmm8 from their save slots. With
 * r11 = rsp+64 the entry rsp is r11+40, so r11+56/+64/+72 are the entry
 * [rsp+16]/[rsp+24]/[rsp+32] slots holding rbx/rbp/rsi. Then release the locals,
 * pop rdi and r12-r15 in reverse push order and jump over the fixup to the end label.
 */
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_bulldozer_endp
|
||||
|
||||
/*
 * Square-root recomputation, entered when the raw sqrtsd result in rdi has its low
 * 19 bits clear.
 *   r9  = low qword of xmm5 + r15 (the value the square root was taken from, before >>12)
 *   rdx = 0xFFFFFC0200000000 (mov edx zero-extends -1022, then << 32)
 * With s = rdi-1: rdi = s>>19, rax = s>>20, rcx = (rdi - rax + rdx + 1) * (rax + rdx).
 * "sub rcx, r9" sets CF when rcx < r9 (unsigned) and "adc rdi, 0" folds that into the
 * result left in rdi. Clobbers rax, rcx, rdx, r9.
 */
sqrt_fixup_bulldozer:
|
||||
movq r9, xmm5
|
||||
add r9, r15
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_bulldozer_endp:
|
||||
188
src/crypto/cn/asm/cn2/cnv2_main_loop_ivybridge.inc
Normal file
188
src/crypto/cn/asm/cn2/cnv2_main_loop_ivybridge.inc
Normal file
@@ -0,0 +1,188 @@
|
||||
/*
 * Setup for the single-lane CNv2 main loop (file name says Ivy Bridge variant).
 *
 * Entry: rcx points at a pointer; [rcx] is loaded as the state pointer (presumably
 * cryptonight_ctx* -- confirm against the caller).
 *
 * Frame: rbx is stored at entry [rsp+24], above the return address (presumably the
 * Win64 home area -- confirm). rbp, rsi, rdi, r12-r15 are pushed and 80 bytes of
 * locals reserved, which puts that rbx slot at [rsp+160]:
 *   [rsp]      saved MXCSR        [rsp+4]   new MXCSR
 *   [rsp+16]   qword zero, read by the loop's psubq
 *   [rsp+32]   saved xmm8         [rsp+48]  saved xmm7        [rsp+64]  saved xmm6
 * MXCSR is replaced with 24448 (0x5F80): exception mask bits 7-12 set, bit 14 set
 * (rounding-control field = 10b).
 *
 * Register state handed to the loop:
 *   rbx      = [ctx+224] (buffer base; presumably the scratchpad)
 *   r8:r11   = ctx qwords [32]^[0] : [40]^[8];   r10d = r8 & 2097136 (0x1FFFF0)
 *   xmm4     = {[48]^[16], [56]^[24]};   xmm5 = {[80]^[64], [88]^[72]}
 *   xmm3     = [ctx+104] in the low qword;   r15 = [ctx+96]
 *   r13d     = 0x80000001 (-2147483647);   esi = 524288 (0x80000) loop count
 *   xmm8     = 1023<<52 in the low qword;   xmm6 = the first 16-byte line
 * "mov ax, 1023" writes only the low 16 bits of rax (zero after "xor eax, eax");
 * the shl by 52 then yields exactly 1023<<52.
 */
mov rcx, [rcx]
|
||||
|
||||
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 80
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov esi, 524288
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
mov r13d, -2147483647
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm4, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm3, QWORD PTR [r9+104]
|
||||
movaps XMMWORD PTR [rsp+64], xmm6
|
||||
movaps XMMWORD PTR [rsp+48], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
and r10d, 2097136
|
||||
movq xmm5, rax
|
||||
|
||||
xor eax, eax
|
||||
mov QWORD PTR [rsp+16], rax
|
||||
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm8, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm5, xmm0
|
||||
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||
|
||||
ALIGN(64)
|
||||
/*
 * Loop body; rsi counts down from the value set in the setup code. xmm6 holds the
 * current 16-byte line on entry and is reloaded at the bottom.
 *
 *   1. xmm6 = aesenc(xmm6, {r8, r11}); rbp = low qword, r9d = rbp & 2097136 (0x1FFFF0).
 *   2. The lines at offset^16, ^32, ^48 are loaded, paddq'd with xmm4/xmm7/xmm5 and
 *      stored back to different slots; xmm6 XOR xmm4 overwrites the line just read.
 *   3. rdi = r15 XOR (xmm3.lo << 32) XOR low qword of the next line.
 *   4. Division: integer "div r9" of xmm6's high qword (rdx cleared) by
 *      ((2*ecx + ebp) as 32 bits) OR r13d (0x80000001). r15 = low 32 bits of the
 *      quotient plus (remainder << 32).
 *   5. Square root: sqrtsd of (((r15 + rbp) >> 12) + xmm8). xmm3 = result >> 19, unless
 *      the low 19 bits (524287) are clear, in which case the fixup recomputes it.
 *      The psubq reads 16 bytes at [rsp+16], of which only the low qword was zeroed;
 *      only the low qword of xmm3 is consumed afterwards.
 *   6. 64x64->128 mul of rdi by rbp, a second shuffle of the three neighbouring lines,
 *      then r8:r11 are updated, stored to the line and XORed for the next round.
 */
main_loop_ivybridge:
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
mov rdi, r15
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
movq xmm0, r11
|
||||
movq xmm7, r8
|
||||
punpcklqdq xmm7, xmm0
|
||||
aesenc xmm6, xmm7
|
||||
movq rbp, xmm6
|
||||
mov r9, rbp
|
||||
and r9d, 2097136
|
||||
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm1, xmm7
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
mov r10, r9
|
||||
xor r10d, 32
|
||||
movq rcx, xmm3
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
xor rdi, rax
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
xor rdi, QWORD PTR [r9+rbx]
|
||||
lea r14, QWORD PTR [r9+rbx]
|
||||
mov r12, QWORD PTR [r14+8]
|
||||
xor edx, edx
|
||||
lea r9d, DWORD PTR [ecx+ecx]
|
||||
add r9d, ebp
|
||||
movdqa xmm0, xmm6
|
||||
psrldq xmm0, 8
|
||||
or r9d, r13d
|
||||
movq rax, xmm0
|
||||
div r9
|
||||
xorps xmm3, xmm3
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rbp]
|
||||
mov r15, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm3, xmm0
|
||||
psubq xmm3, XMMWORD PTR [rsp+16]
|
||||
movq rdx, xmm3
|
||||
test edx, 524287
|
||||
je sqrt_fixup_ivybridge
|
||||
psrlq xmm3, 19
|
||||
sqrt_fixup_ivybridge_ret:
|
||||
|
||||
mov ecx, r10d
|
||||
mov rax, rdi
|
||||
mul rbp
|
||||
movq xmm2, rdx
|
||||
xor rdx, [rcx+rbx]
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rdi
|
||||
mov edi, r8d
|
||||
and edi, 2097136
|
||||
movq xmm0, rax
|
||||
xor rax, [rcx+rbx+8]
|
||||
add r11, rax
|
||||
mov QWORD PTR [r14+8], r11
|
||||
punpcklqdq xmm2, xmm0
|
||||
|
||||
mov r9d, r10d
|
||||
xor r9d, 48
|
||||
xor r10d, 16
|
||||
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm0, xmm5
|
||||
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||
paddq xmm2, xmm4
|
||||
paddq xmm1, xmm7
|
||||
movdqa xmm5, xmm4
|
||||
movdqu XMMWORD PTR [r9+rbx], xmm0
|
||||
movdqa xmm4, xmm6
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
movdqu xmm6, [rdi+rbx]
|
||||
mov r10d, edi
|
||||
xor r11, r12
|
||||
dec rsi
|
||||
jne main_loop_ivybridge
|
||||
|
||||
/*
 * Epilogue: restore MXCSR from [rsp], rbx from [rsp+160] (the entry [rsp+24] slot:
 * 80 bytes of locals + 56 bytes of pushes + 24) and xmm6-xmm8 from their save slots.
 * Then release the locals, pop the GPRs in reverse push order and jump over the
 * fixup to the end label.
 */
ldmxcsr DWORD PTR [rsp]
|
||||
mov rbx, QWORD PTR [rsp+160]
|
||||
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
add rsp, 80
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
jmp cnv2_main_loop_ivybridge_endp
|
||||
|
||||
/*
 * Square-root recomputation, entered when the raw sqrtsd result in rdx has its low
 * 19 bits clear. r9 still holds the value the square root was taken from (before >>12).
 *   r13 is borrowed for 0xFFFFFC0200000000 (mov r13d zero-extends -1022, then << 32)
 *   and set back to 0x80000001 before returning, because the loop ORs with r13d.
 * With s = rdx-1: rdx = s>>19, rax = s>>20. "not r13; sub rcx, r13" adds r13+1, so
 * rcx = (rdx - rax + r13 + 1) * (rax + r13). "sub rcx, r9" sets CF when rcx < r9
 * (unsigned) and "adc rdx, 0" folds that into the result, which is moved to xmm3.
 * Clobbers rax, rcx, rdx.
 */
sqrt_fixup_ivybridge:
|
||||
dec rdx
|
||||
mov r13d, -1022
|
||||
shl r13, 32
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
add rax, r13
|
||||
not r13
|
||||
sub rcx, r13
|
||||
mov r13d, -2147483647
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm3, rdx
|
||||
jmp sqrt_fixup_ivybridge_ret
|
||||
|
||||
cnv2_main_loop_ivybridge_endp:
|
||||
181
src/crypto/cn/asm/cn2/cnv2_main_loop_ryzen.inc
Normal file
181
src/crypto/cn/asm/cn2/cnv2_main_loop_ryzen.inc
Normal file
@@ -0,0 +1,181 @@
|
||||
/*
 * Setup for the single-lane CNv2 main loop (file name says Ryzen variant).
 *
 * Entry: rcx points at a pointer; [rcx] is loaded as the state pointer (presumably
 * cryptonight_ctx* -- confirm against the caller).
 *
 * Frame: rbx, rbp, rsi are stored at entry [rsp+16], [rsp+24], [rsp+32], i.e. above the
 * return address (presumably the Win64 home area -- confirm). rdi, r12-r15 are pushed
 * and 64 bytes of locals reserved:
 *   [rsp]      saved MXCSR        [rsp+4]   new MXCSR
 *   [rsp+16]   saved xmm8         [rsp+32]  saved xmm7        [rsp+48]  saved xmm6
 * MXCSR is replaced with 24448 (0x5F80): exception mask bits 7-12 set, bit 14 set
 * (rounding-control field = 10b).
 *
 * Register state handed to the loop:
 *   rbx      = [ctx+224] (buffer base; presumably the scratchpad)
 *   r8:r11   = ctx qwords [32]^[0] : [40]^[8];   r10d = r8 & 2097136 (0x1FFFF0)
 *   xmm3     = {[48]^[16], [56]^[24]};   xmm4 = {[80]^[64], [88]^[72]}
 *   rdi      = [ctx+104];   r15 = [ctx+96];   ebp = 524288 (0x80000) loop count
 *   xmm8     = 0;   xmm7 = 1023<<52 in the low qword
 * "mov ax, 1023" writes only the low 16 bits of rax; the following shl by 52 shifts
 * every stale upper bit out, so the result is exactly 1023<<52.
 */
mov rcx, [rcx]
|
||||
|
||||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 524288
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN(64)
|
||||
/*
 * Loop body; ebp counts down from the value set in the setup code. All 16-byte buffer
 * accesses use movdqa, so rbx plus any masked offset must be 16-byte aligned.
 *
 *   1. xmm5 = aesenc(line at r10+rbx, {r8, r11}).
 *   2. The lines at offset^16, ^32, ^48 are loaded, paddq'd with xmm3/xmm6/xmm4 and
 *      stored back to different slots; xmm5 XOR xmm3 overwrites the line just read.
 *   3. rsi = r15 XOR (rdi<<32) XOR low qword of the next line (offset = xmm5.lo &
 *      2097136).
 *   4. Division: integer "div r9" of xmm5's high qword (rdx cleared) by
 *      ((2*rdi + xmm5.lo) as 32 bits) OR 0x80000001. punpckldq packs quotient and
 *      remainder low dwords into one qword, kept in r15.
 *   5. Square root: sqrtsd of (((that qword + xmm5.lo) >> 12) + xmm7). xmm2 keeps the
 *      un-shifted sum for the fixup. rdi = result >> 19, unless the low 19 bits
 *      (524287) are clear, in which case the fixup recomputes it.
 *   6. 64x64->128 mul of rsi by xmm5.lo, a second shuffle of the three neighbouring
 *      lines, then r8:r11 are updated, stored to the line and XORed for the next round.
 */
main_loop_ryzen:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm0, r11
|
||||
movq xmm6, r8
|
||||
punpcklqdq xmm6, xmm0
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
movq r14, xmm5
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movq rax, xmm0
|
||||
|
||||
div r9
|
||||
movq xmm0, rax
|
||||
movq xmm1, rdx
|
||||
punpckldq xmm0, xmm1
|
||||
movq r15, xmm0
|
||||
paddq xmm0, xmm5
|
||||
movdqa xmm2, xmm0
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm7
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287
|
||||
je sqrt_fixup_ryzen
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_ryzen_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne main_loop_ryzen
|
||||
|
||||
/*
 * Epilogue: restore MXCSR from [rsp] and xmm6-xmm8 from their save slots. With
 * r11 = rsp+64 the entry rsp is r11+40, so r11+56/+64/+72 are the entry
 * [rsp+16]/[rsp+24]/[rsp+32] slots holding rbx/rbp/rsi. Then release the locals,
 * pop rdi and r12-r15 in reverse push order and jump over the fixup to the end label.
 */
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_ryzen_endp
|
||||
|
||||
/*
 * Square-root recomputation, entered when the raw sqrtsd result in rdi has its low
 * 19 bits clear.
 *   r9  = low qword of xmm2 (the value the square root was taken from, before >>12)
 *   rdx = 0xFFFFFC0200000000 (mov edx zero-extends -1022, then << 32)
 * With s = rdi-1: rdi = s>>19, rax = s>>20, rcx = (rdi - rax + rdx + 1) * (rax + rdx).
 * "sub rcx, r9" sets CF when rcx < r9 (unsigned) and "adc rdi, 0" folds that into the
 * result left in rdi. Clobbers rax, rcx, rdx, r9.
 */
sqrt_fixup_ryzen:
|
||||
movq r9, xmm2
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_ryzen_ret
|
||||
|
||||
cnv2_main_loop_ryzen_endp:
|
||||
413
src/crypto/cn/asm/cn2/cnv2_rwz_double_main_loop.inc
Normal file
413
src/crypto/cn/asm/cn2/cnv2_rwz_double_main_loop.inc
Normal file
@@ -0,0 +1,413 @@
|
||||
/*
 * Setup for the two-lane "rwz" CNv2 main loop.
 *
 * Entry: rcx points at an array of two pointers; [rcx] and [rcx+8] are loaded as the
 * per-lane state pointers (presumably cryptonight_ctx* -- confirm against the caller).
 *
 * Frame: rax keeps the entry rsp; rbx, rbp, rsi, rdi, r12-r15 are pushed (64 bytes) and
 * 184 bytes of locals are reserved, so the return address sits at [rsp+248].
 *   [rsp+16 .. rsp+175]  saved xmm15 .. xmm6 (rax-88/-104/-120 == rsp+160/144/128)
 *   [rsp]                spill of r13 (lane-0 buffer base)
 *   [rsp+8]              written by the loop, not here
 *   [rsp+256],[rsp+264]  lane-0 r10/r11 spill; [rsp+272] saved MXCSR; [rsp+276] new MXCSR
 * The +256..+279 slots lie above the return address, i.e. in caller-owned stack
 * (presumably the Win64 home area -- confirm).
 *
 * MXCSR is replaced with 24448 (0x5F80): exception mask bits 7-12 set, bit 14 set
 * (rounding-control field = 10b).
 *
 * Register state handed to the loop:
 *   r13 / rsi   = [ctx0+224] / [ctx1+224]  (buffer bases; presumably the scratchpads)
 *   r10:r11     = ctx0 qwords [32]^[0] : [40]^[8];   rdi:rbp = same for ctx1
 *   xmm7, xmm3  = ctx0 {[48]^[16],[56]^[24]}, {[80]^[64],[88]^[72]}
 *   xmm6, xmm8  = the same two values for ctx1
 *   xmm4, xmm5  = {ctx0[96], ctx1[96]}, {ctx0[104], ctx1[104]}
 *   edx / ecx   = r10 & 2097136 / rdi & 2097136 (0x1FFFF0: 16-byte aligned, below 2 MiB)
 *   r9 / r8     = address of that 16-byte line in each buffer; xmm15 / xmm11 = its contents
 *   r14d        = 393216 (0x60000) loop count;  xmm13 = 0
 *   xmm14       = 1<<52 in both qwords;  xmm12 = 1023<<52 (0x3FF0000000000000) in both qwords
 */
mov rdx, [rcx+8]
|
||||
mov rcx, [rcx]
|
||||
|
||||
mov rax, rsp
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 184
|
||||
|
||||
stmxcsr DWORD PTR [rsp+272]
|
||||
mov DWORD PTR [rsp+276], 24448
|
||||
ldmxcsr DWORD PTR [rsp+276]
|
||||
|
||||
mov r13, QWORD PTR [rcx+224]
|
||||
mov r9, rdx
|
||||
mov r10, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r10, QWORD PTR [rcx]
|
||||
mov r14d, 393216
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rsi, QWORD PTR [rdx+224]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov rdi, QWORD PTR [r9+32]
|
||||
xor rdi, QWORD PTR [r9]
|
||||
mov rbp, QWORD PTR [r9+40]
|
||||
xor rbp, QWORD PTR [r9+8]
|
||||
movq xmm0, rdx
|
||||
movaps XMMWORD PTR [rax-88], xmm6
|
||||
movaps XMMWORD PTR [rax-104], xmm7
|
||||
movaps XMMWORD PTR [rax-120], xmm8
|
||||
movaps XMMWORD PTR [rsp+112], xmm9
|
||||
movaps XMMWORD PTR [rsp+96], xmm10
|
||||
movaps XMMWORD PTR [rsp+80], xmm11
|
||||
movaps XMMWORD PTR [rsp+64], xmm12
|
||||
movaps XMMWORD PTR [rsp+48], xmm13
|
||||
movaps XMMWORD PTR [rsp+32], xmm14
|
||||
movaps XMMWORD PTR [rsp+16], xmm15
|
||||
mov rdx, r10
|
||||
movq xmm4, QWORD PTR [r8+96]
|
||||
and edx, 2097136
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xorps xmm13, xmm13
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
movq xmm5, QWORD PTR [r8+104]
|
||||
movq xmm7, rax
|
||||
|
||||
|
||||
/* xmm14 = 1<<52 broadcast to both qwords. */
mov eax, 1
|
||||
shl rax, 52
|
||||
movq xmm14, rax
|
||||
punpcklqdq xmm14, xmm14
|
||||
|
||||
/* xmm12 = 1023<<52 broadcast to both qwords. */
mov eax, 1023
|
||||
shl rax, 52
|
||||
movq xmm12, rax
|
||||
punpcklqdq xmm12, xmm12
|
||||
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
punpcklqdq xmm7, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r9+56]
|
||||
xor rcx, QWORD PTR [r9+24]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [r9+48]
|
||||
xor rax, QWORD PTR [r9+16]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp], r13
|
||||
mov rcx, QWORD PTR [r9+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm6, rax
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp+256], r10
|
||||
mov rcx, rdi
|
||||
mov QWORD PTR [rsp+264], r11
|
||||
movq xmm8, rax
|
||||
and ecx, 2097136
|
||||
punpcklqdq xmm8, xmm0
|
||||
movq xmm0, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, QWORD PTR [r9+104]
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
movdqu xmm11, XMMWORD PTR [r8]
|
||||
punpcklqdq xmm5, xmm0
|
||||
lea r9, QWORD PTR [rdx+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r9]
|
||||
|
||||
ALIGN(64)
|
||||
/*
 * Loop body: one iteration advances both lanes; r14d counts down from the value set
 * in the setup code.
 *
 * Per lane, in order:
 *   1. aesenc of the current 16-byte line (xmm15 / xmm11) with the lane's r10:r11 /
 *      rdi:rbp pair; result kept in xmm9 / xmm10.
 *   2. The three lines at offset^16, ^32, ^48 are loaded, paddq'd and stored back to
 *      different slots. Here the ^48 line takes xmm7 (xmm6) and the ^16 line takes
 *      xmm3 (xmm8) and is stored back in place.
 *   3. The aesenc result XOR xmm7 (xmm6) is written to the line that was just read.
 *   4. 64x64->128 mul of the next line's low qword (after XOR with values taken from
 *      xmm4/xmm5) by the low qword of the aesenc result, followed by a second
 *      load/add/store shuffle of the three neighbouring lines.
 *   5. Division and square-root values for both lanes are produced together with
 *      divpd / sqrtpd and corrected out of line (rwz_div_fix_*, rwz_sqrt_fix_*); the
 *      results land in xmm4 (division) and xmm5 (square root) for the next iteration.
 * Every buffer offset is masked with 2097136 (0x1FFFF0).
 */
rwz_main_loop_double:
|
||||
movdqu xmm9, xmm15
|
||||
mov eax, edx
|
||||
mov ebx, edx
|
||||
xor eax, 16
|
||||
xor ebx, 32
|
||||
xor edx, 48
|
||||
|
||||
movq xmm0, r11
|
||||
movq xmm2, r10
|
||||
punpcklqdq xmm2, xmm0
|
||||
aesenc xmm9, xmm2
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||
|
||||
movq r11, xmm9
|
||||
mov edx, r11d
|
||||
and edx, 2097136
|
||||
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9], xmm0
|
||||
|
||||
lea rbx, QWORD PTR [rdx+r13]
|
||||
mov r10, QWORD PTR [rdx+r13]
|
||||
|
||||
movdqu xmm10, xmm11
|
||||
movq xmm0, rbp
|
||||
movq xmm11, rdi
|
||||
punpcklqdq xmm11, xmm0
|
||||
aesenc xmm10, xmm11
|
||||
|
||||
mov eax, ecx
|
||||
mov r12d, ecx
|
||||
xor eax, 16
|
||||
xor r12d, 32
|
||||
xor ecx, 48
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||
paddq xmm0, xmm6
|
||||
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||
paddq xmm1, xmm11
|
||||
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
paddq xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
|
||||
movq rcx, xmm10
|
||||
and ecx, 2097136
|
||||
|
||||
movdqa xmm0, xmm10
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [r8], xmm0
|
||||
mov r12, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov r9, QWORD PTR [rbx+8]
|
||||
|
||||
xor edx, 16
|
||||
mov r8d, edx
|
||||
mov r15d, edx
|
||||
|
||||
movq rdx, xmm5
|
||||
shl rdx, 32
|
||||
movq rax, xmm4
|
||||
xor rdx, rax
|
||||
xor r10, rdx
|
||||
mov rax, r10
|
||||
mul r11
|
||||
mov r11d, r8d
|
||||
xor r11d, 48
|
||||
movq xmm0, rdx
|
||||
xor rdx, [r11+r13]
|
||||
movq xmm1, rax
|
||||
xor rax, [r11+r13+8]
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||
paddq xmm0, xmm3
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [r8+r13], xmm0
|
||||
xor r8d, 32
|
||||
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||
paddq xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||
|
||||
mov r11, QWORD PTR [rsp+256]
|
||||
add r11, rdx
|
||||
mov rdx, QWORD PTR [rsp+264]
|
||||
add rdx, rax
|
||||
mov QWORD PTR [rbx], r11
|
||||
xor r11, r10
|
||||
mov QWORD PTR [rbx+8], rdx
|
||||
xor rdx, r9
|
||||
mov QWORD PTR [rsp+256], r11
|
||||
and r11d, 2097136
|
||||
mov QWORD PTR [rsp+264], rdx
|
||||
mov QWORD PTR [rsp+8], r11
|
||||
lea r15, QWORD PTR [r11+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||
lea r13, QWORD PTR [rsi+rcx]
|
||||
/*
 * From here r13 addresses the lane-1 line (rsi+rcx); the lane-0 base is reloaded from
 * [rsp] at the bottom of the loop. Division step: dividends are the high qwords of
 * xmm9/xmm10, halved after +1; divisors are the low dwords of (xmm5<<1 + {xmm9.lo,
 * xmm10.lo}) OR 0x80000001. paddq with xmm14 adds 1 to each quotient's exponent field.
 */
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movaps xmm2, xmm13
|
||||
movq r10, xmm0
|
||||
psllq xmm5, 1
|
||||
shl r10, 32
|
||||
movdqa xmm0, xmm9
|
||||
psrldq xmm0, 8
|
||||
movdqa xmm1, xmm10
|
||||
movq r11, xmm0
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
psrldq xmm4, 8
|
||||
movaps xmm0, xmm13
|
||||
movq rax, xmm4
|
||||
xor r10, rax
|
||||
movaps xmm1, xmm13
|
||||
xor r10, r12
|
||||
lea rax, QWORD PTR [r11+1]
|
||||
shr rax, 1
|
||||
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm3, xmm10
|
||||
paddq xmm5, xmm3
|
||||
movq rdx, xmm5
|
||||
psrldq xmm5, 8
|
||||
cvtsi2sd xmm2, rax
|
||||
or edx, -2147483647
|
||||
lea rax, QWORD PTR [r8+1]
|
||||
shr rax, 1
|
||||
movq r9, xmm5
|
||||
cvtsi2sd xmm0, rax
|
||||
or r9d, -2147483647
|
||||
cvtsi2sd xmm1, rdx
|
||||
unpcklpd xmm2, xmm0
|
||||
movaps xmm0, xmm13
|
||||
cvtsi2sd xmm0, r9
|
||||
unpcklpd xmm1, xmm0
|
||||
divpd xmm2, xmm1
|
||||
paddq xmm2, xmm14
|
||||
cvttsd2si rax, xmm2
|
||||
psrldq xmm2, 8
|
||||
mov rbx, rax
|
||||
imul rax, rdx
|
||||
sub r11, rax
|
||||
js rwz_div_fix_1
|
||||
rwz_div_fix_1_ret:
|
||||
|
||||
cvttsd2si rdx, xmm2
|
||||
mov rax, rdx
|
||||
imul rax, r9
|
||||
movd xmm2, r11d
|
||||
movd xmm4, ebx
|
||||
sub r8, rax
|
||||
js rwz_div_fix_2
|
||||
rwz_div_fix_2_ret:
|
||||
|
||||
movd xmm1, r8d
|
||||
movd xmm0, edx
|
||||
punpckldq xmm2, xmm1
|
||||
punpckldq xmm4, xmm0
|
||||
punpckldq xmm4, xmm2
|
||||
paddq xmm3, xmm4
|
||||
/*
 * Square-root step: sqrtpd of ((xmm3 >> 12) + 1023<<52) for both lanes. xmm5 receives
 * the raw results >> 19; a lane whose raw result has its low 19 bits (524287) clear
 * is recomputed out of line.
 */
movdqa xmm0, xmm3
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm12
|
||||
sqrtpd xmm1, xmm0
|
||||
movq r9, xmm1
|
||||
movdqa xmm5, xmm1
|
||||
psrlq xmm5, 19
|
||||
test r9, 524287
|
||||
je rwz_sqrt_fix_1
|
||||
rwz_sqrt_fix_1_ret:
|
||||
|
||||
movq r9, xmm10
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
test r8, 524287
|
||||
je rwz_sqrt_fix_2
|
||||
rwz_sqrt_fix_2_ret:
|
||||
|
||||
mov r12d, ecx
|
||||
mov r8d, ecx
|
||||
xor r12d, 16
|
||||
xor r8d, 32
|
||||
xor ecx, 48
|
||||
mov rax, r10
|
||||
mul r9
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
punpcklqdq xmm3, xmm0
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||
pxor xmm0, xmm3
|
||||
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||
xor rdx, [r8+rsi]
|
||||
xor rax, [r8+rsi+8]
|
||||
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||
paddq xmm3, xmm6
|
||||
paddq xmm1, xmm11
|
||||
paddq xmm0, xmm8
|
||||
movdqu XMMWORD PTR [r8+rsi], xmm3
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||
|
||||
add rdi, rdx
|
||||
mov QWORD PTR [r13], rdi
|
||||
xor rdi, r10
|
||||
mov ecx, edi
|
||||
and ecx, 2097136
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov rdx, QWORD PTR [r13+8]
|
||||
add rbp, rax
|
||||
mov QWORD PTR [r13+8], rbp
|
||||
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||
xor rbp, rdx
|
||||
/* Reload lane-0 state spilled earlier and rotate the xmm3<-xmm7<-xmm9 / xmm8<-xmm6<-xmm10 history. */
mov r13, QWORD PTR [rsp]
|
||||
movdqa xmm3, xmm7
|
||||
mov rdx, QWORD PTR [rsp+8]
|
||||
movdqa xmm8, xmm6
|
||||
mov r10, QWORD PTR [rsp+256]
|
||||
movdqa xmm7, xmm9
|
||||
mov r11, QWORD PTR [rsp+264]
|
||||
movdqa xmm6, xmm10
|
||||
mov r9, r15
|
||||
dec r14d
|
||||
jne rwz_main_loop_double
|
||||
|
||||
/*
 * Epilogue: restore the MXCSR value saved at [rsp+272], reload xmm6-xmm15 from the
 * slots written in the setup code (r11 = rsp+184, so r11-24 .. r11-120 are
 * rsp+160 .. rsp+64), release the 184-byte local area, pop the GPRs in reverse push
 * order and jump over the out-of-line fixups to the end label.
 */
ldmxcsr DWORD PTR [rsp+272]
|
||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+184]
|
||||
movaps xmm6, XMMWORD PTR [r11-24]
|
||||
movaps xmm7, XMMWORD PTR [r11-40]
|
||||
movaps xmm8, XMMWORD PTR [r11-56]
|
||||
movaps xmm9, XMMWORD PTR [r11-72]
|
||||
movaps xmm10, XMMWORD PTR [r11-88]
|
||||
movaps xmm11, XMMWORD PTR [r11-104]
|
||||
movaps xmm12, XMMWORD PTR [r11-120]
|
||||
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp rwz_cnv2_double_mainloop_asm_endp
|
||||
|
||||
/*
 * Lane-0 division correction, entered when the remainder in r11 went negative:
 * take one off the quotient (rbx) and add the divisor (rdx) back to the remainder.
 */
rwz_div_fix_1:
|
||||
dec rbx
|
||||
add r11, rdx
|
||||
jmp rwz_div_fix_1_ret
|
||||
|
||||
/*
 * Lane-1 division correction, entered when the remainder in r8 went negative:
 * take one off the quotient (rdx) and add the divisor (r9) back to the remainder.
 */
rwz_div_fix_2:
|
||||
dec rdx
|
||||
add r8, r9
|
||||
jmp rwz_div_fix_2_ret
|
||||
|
||||
/*
 * Lane-0 square-root recomputation, entered when the raw sqrtpd result in r9 has its
 * low 19 bits clear.
 *   r8   = low qword of xmm3 (the value the square root was taken from, before >>12)
 *   xmm0 = high qword of xmm5, preserved so lane 1 is not disturbed
 *   r11  = 0xFFFFFC0200000000 (mov r11d zero-extends -1022, then << 32)
 * With s = r9-1: r9 = s>>19, rax = s>>20, rdx = (r9 - rax + r11 + 1) * (rax + r11).
 * "sub rdx, r8" sets CF when rdx < r8 (unsigned) and "adc r9, 0" folds that into the
 * result. xmm5 is rebuilt as {r9, saved high qword}. Clobbers rax, rdx, r8, r11.
 */
rwz_sqrt_fix_1:
|
||||
movq r8, xmm3
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
dec r9
|
||||
mov r11d, -1022
|
||||
shl r11, 32
|
||||
mov rax, r9
|
||||
shr r9, 19
|
||||
shr rax, 20
|
||||
mov rdx, r9
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+r11+1]
|
||||
add rax, r11
|
||||
imul rdx, rax
|
||||
sub rdx, r8
|
||||
adc r9, 0
|
||||
movq xmm5, r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp rwz_sqrt_fix_1_ret
|
||||
|
||||
/*
 * Lane-1 square-root recomputation, entered when the raw sqrtpd result in r8 (high
 * qword) has its low 19 bits clear.
 *   r11 = high qword of xmm3 (xmm3 is shifted down destructively to fetch it)
 *   rbx = 0xFFFFFC0200000000 (mov ebx zero-extends -1022, then << 32)
 * Same arithmetic as the lane-0 routine, with r8 as the result register. The final
 * punpcklqdq keeps the low qword of xmm5 and installs r8 as its high qword.
 * Clobbers rax, rbx, rdx, r8, r11, xmm0, xmm3.
 */
rwz_sqrt_fix_2:
|
||||
psrldq xmm3, 8
|
||||
movq r11, xmm3
|
||||
dec r8
|
||||
mov ebx, -1022
|
||||
shl rbx, 32
|
||||
mov rax, r8
|
||||
shr r8, 19
|
||||
shr rax, 20
|
||||
mov rdx, r8
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+rbx+1]
|
||||
add rax, rbx
|
||||
imul rdx, rax
|
||||
sub rdx, r11
|
||||
adc r8, 0
|
||||
movq xmm0, r8
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp rwz_sqrt_fix_2_ret
|
||||
|
||||
rwz_cnv2_double_mainloop_asm_endp:
|
||||
188
src/crypto/cn/asm/cn2/cnv2_rwz_main_loop.inc
Normal file
188
src/crypto/cn/asm/cn2/cnv2_rwz_main_loop.inc
Normal file
@@ -0,0 +1,188 @@
|
||||
mov rcx, [rcx]
|
||||
|
||||
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 80
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov esi, 393216
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
mov r13d, -2147483647
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm4, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm3, QWORD PTR [r9+104]
|
||||
movaps XMMWORD PTR [rsp+64], xmm6
|
||||
movaps XMMWORD PTR [rsp+48], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
and r10d, 2097136
|
||||
movq xmm5, rax
|
||||
|
||||
xor eax, eax
|
||||
mov QWORD PTR [rsp+16], rax
|
||||
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm8, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm5, xmm0
|
||||
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||
|
||||
ALIGN(64)
|
||||
rwz_main_loop:
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
mov rdi, r15
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
movq xmm0, r11
|
||||
movq xmm7, r8
|
||||
punpcklqdq xmm7, xmm0
|
||||
aesenc xmm6, xmm7
|
||||
movq rbp, xmm6
|
||||
mov r9, rbp
|
||||
and r9d, 2097136
|
||||
movdqu xmm0, XMMWORD PTR [rcx+rbx]
|
||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqu xmm2, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm1, xmm7
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
mov r10, r9
|
||||
xor r10d, 32
|
||||
movq rcx, xmm3
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
xor rdi, rax
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
xor rdi, QWORD PTR [r9+rbx]
|
||||
lea r14, QWORD PTR [r9+rbx]
|
||||
mov r12, QWORD PTR [r14+8]
|
||||
xor edx, edx
|
||||
lea r9d, DWORD PTR [ecx+ecx]
|
||||
add r9d, ebp
|
||||
movdqa xmm0, xmm6
|
||||
psrldq xmm0, 8
|
||||
or r9d, r13d
|
||||
movq rax, xmm0
|
||||
div r9
|
||||
xorps xmm3, xmm3
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rbp]
|
||||
mov r15, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm3, xmm0
|
||||
psubq xmm3, XMMWORD PTR [rsp+16]
|
||||
movq rdx, xmm3
|
||||
test edx, 524287
|
||||
je rwz_sqrt_fixup
|
||||
psrlq xmm3, 19
|
||||
rwz_sqrt_fixup_ret:
|
||||
|
||||
mov ecx, r10d
|
||||
mov rax, rdi
|
||||
mul rbp
|
||||
movq xmm2, rdx
|
||||
xor rdx, [rcx+rbx]
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rdi
|
||||
mov edi, r8d
|
||||
and edi, 2097136
|
||||
movq xmm0, rax
|
||||
xor rax, [rcx+rbx+8]
|
||||
add r11, rax
|
||||
mov QWORD PTR [r14+8], r11
|
||||
punpcklqdq xmm2, xmm0
|
||||
|
||||
mov r9d, r10d
|
||||
xor r9d, 48
|
||||
xor r10d, 16
|
||||
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm0, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||
paddq xmm2, xmm5
|
||||
paddq xmm1, xmm7
|
||||
movdqa xmm5, xmm4
|
||||
movdqu XMMWORD PTR [r9+rbx], xmm2
|
||||
movdqa xmm4, xmm6
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
movdqu xmm6, [rdi+rbx]
|
||||
mov r10d, edi
|
||||
xor r11, r12
|
||||
dec rsi
|
||||
jne rwz_main_loop
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
mov rbx, QWORD PTR [rsp+160]
|
||||
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
add rsp, 80
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
jmp cnv2_rwz_main_loop_endp
|
||||
|
||||
rwz_sqrt_fixup:
|
||||
dec rdx
|
||||
mov r13d, -1022
|
||||
shl r13, 32
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
add rax, r13
|
||||
not r13
|
||||
sub rcx, r13
|
||||
mov r13d, -2147483647
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm3, rdx
|
||||
jmp rwz_sqrt_fixup_ret
|
||||
|
||||
cnv2_rwz_main_loop_endp:
|
||||
73
src/crypto/cn/asm/cn_main_loop.S
Normal file
73
src/crypto/cn/asm/cn_main_loop.S
Normal file
@@ -0,0 +1,73 @@
|
||||
/*
 * Assembler prologue shared by the CryptoNight v2 main-loop entry points
 * defined further down. This is GAS source in Intel syntax that is run
 * through the C preprocessor first.
 */
/*
 * ALIGN(x) ignores its argument: both expansions request a fixed alignment.
 * NOTE(review): presumably ".align 6" is used on Apple because that assembler
 * reads the operand as a power of two (2^6 = 64) - confirm.
 */
#ifdef __APPLE__
|
||||
# define ALIGN(x) .align 6
|
||||
#else
|
||||
# define ALIGN(x) .align 64
|
||||
#endif
|
||||
.intel_syntax noprefix
/*
 * Emit an empty .note.GNU-stack section on Linux/ELF. Without it GNU ld
 * assumes an object built from assembly may need an executable stack and
 * marks the stack of the final binary executable. The section is declared
 * here, ahead of the switch to the text section below, so all code that
 * follows still lands in .text.
 */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
|
||||
/* FN_PREFIX(fn) prepends an underscore to exported symbol names on Apple. */
#ifdef __APPLE__
|
||||
# define FN_PREFIX(fn) _ ## fn
|
||||
.text
|
||||
#else
|
||||
# define FN_PREFIX(fn) fn
|
||||
.section .text
|
||||
#endif
|
||||
/* Exported entry points; each one is defined below in this file. */
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_rwz_mainloop_asm)
|
||||
.global FN_PREFIX(cnv2_rwz_double_mainloop_asm)
|
||||
|
||||
/*
 * cnv2_mainloop_ivybridge_asm: entry point that runs the body from
 * cn2/cnv2_main_loop_ivybridge.inc inline.
 * It reserves 48 bytes of stack, copies rdi into rcx, executes the included
 * body, releases the 48 bytes and returns.
 * NOTE(review): the included body is not visible here; presumably it expects
 * its argument in rcx and writes to slots just above rsp on entry, which is
 * what the 48-byte reservation leaves room for - confirm.
 */
ALIGN(64)
|
||||
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
|
||||
sub rsp, 48 /* reserve 48 bytes below the return address */
|
||||
mov rcx, rdi /* hand the argument in rdi to the included body via rcx */
|
||||
#include "cn2/cnv2_main_loop_ivybridge.inc"
|
||||
add rsp, 48 /* release the reservation made above */
|
||||
ret 0
|
||||
mov eax, 3735929054 /* never executed (follows ret); 3735929054 == 0xDEADC0DE. NOTE(review): presumably an end-of-routine marker - confirm */
|
||||
|
||||
/*
 * cnv2_mainloop_ryzen_asm: entry point that runs the body from
 * cn2/cnv2_main_loop_ryzen.inc inline.
 * It reserves 48 bytes of stack, copies rdi into rcx, executes the included
 * body, releases the 48 bytes and returns.
 * NOTE(review): the included body is not visible here; presumably it expects
 * its argument in rcx and writes to slots just above rsp on entry, which is
 * what the 48-byte reservation leaves room for - confirm.
 */
ALIGN(64)
|
||||
FN_PREFIX(cnv2_mainloop_ryzen_asm):
|
||||
sub rsp, 48 /* reserve 48 bytes below the return address */
|
||||
mov rcx, rdi /* hand the argument in rdi to the included body via rcx */
|
||||
#include "cn2/cnv2_main_loop_ryzen.inc"
|
||||
add rsp, 48 /* release the reservation made above */
|
||||
ret 0
|
||||
mov eax, 3735929054 /* never executed (follows ret); 3735929054 == 0xDEADC0DE. NOTE(review): presumably an end-of-routine marker - confirm */
|
||||
|
||||
/*
 * cnv2_mainloop_bulldozer_asm: entry point that runs the body from
 * cn2/cnv2_main_loop_bulldozer.inc inline.
 * It reserves 48 bytes of stack, copies rdi into rcx, executes the included
 * body, releases the 48 bytes and returns.
 * NOTE(review): the included body is not visible here; presumably it expects
 * its argument in rcx and writes to slots just above rsp on entry, which is
 * what the 48-byte reservation leaves room for - confirm.
 */
ALIGN(64)
|
||||
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
|
||||
sub rsp, 48 /* reserve 48 bytes below the return address */
|
||||
mov rcx, rdi /* hand the argument in rdi to the included body via rcx */
|
||||
#include "cn2/cnv2_main_loop_bulldozer.inc"
|
||||
add rsp, 48 /* release the reservation made above */
|
||||
ret 0
|
||||
mov eax, 3735929054 /* never executed (follows ret); 3735929054 == 0xDEADC0DE. NOTE(review): presumably an end-of-routine marker - confirm */
|
||||
|
||||
/*
 * cnv2_double_mainloop_sandybridge_asm: entry point that runs the body from
 * cn2/cnv2_double_main_loop_sandybridge.inc inline.
 * It reserves 48 bytes of stack, copies rdi into rcx, executes the included
 * body, releases the 48 bytes and returns.
 * NOTE(review): the included body is not visible here; presumably it expects
 * its argument in rcx and writes to slots just above rsp on entry, which is
 * what the 48-byte reservation leaves room for - confirm.
 */
ALIGN(64)
|
||||
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
|
||||
sub rsp, 48 /* reserve 48 bytes below the return address */
|
||||
mov rcx, rdi /* hand the argument in rdi to the included body via rcx */
|
||||
#include "cn2/cnv2_double_main_loop_sandybridge.inc"
|
||||
add rsp, 48 /* release the reservation made above */
|
||||
ret 0
|
||||
mov eax, 3735929054 /* never executed (follows ret); 3735929054 == 0xDEADC0DE. NOTE(review): presumably an end-of-routine marker - confirm */
|
||||
|
||||
/*
 * cnv2_rwz_mainloop_asm: entry point that runs the body from
 * cn2/cnv2_rwz_main_loop.inc inline.
 * It reserves 48 bytes of stack, copies rdi into rcx, executes the included
 * body, releases the 48 bytes and returns.
 * The included body starts by dereferencing rcx and by storing rbx at
 * [rsp+24] before it pushes anything, so it needs its argument in rcx and
 * writable space above rsp; the 48-byte reservation keeps that store below
 * this function's return address.
 */
ALIGN(64)
|
||||
FN_PREFIX(cnv2_rwz_mainloop_asm):
|
||||
sub rsp, 48 /* reserve 48 bytes below the return address */
|
||||
mov rcx, rdi /* hand the argument in rdi to the included body via rcx */
|
||||
#include "cn2/cnv2_rwz_main_loop.inc"
|
||||
add rsp, 48 /* release the reservation made above */
|
||||
ret 0
|
||||
mov eax, 3735929054 /* never executed (follows ret); 3735929054 == 0xDEADC0DE. NOTE(review): presumably an end-of-routine marker - confirm */
|
||||
|
||||
/*
 * cnv2_rwz_double_mainloop_asm: entry point that runs the body from
 * cn2/cnv2_rwz_double_main_loop.inc inline.
 * It reserves 48 bytes of stack, copies rdi into rcx, executes the included
 * body, releases the 48 bytes and returns.
 * NOTE(review): the start of the included body is not visible here;
 * presumably it expects its argument in rcx and writes to slots just above
 * rsp on entry, which is what the 48-byte reservation leaves room for -
 * confirm.
 */
ALIGN(64)
|
||||
FN_PREFIX(cnv2_rwz_double_mainloop_asm):
|
||||
sub rsp, 48 /* reserve 48 bytes below the return address */
|
||||
mov rcx, rdi /* hand the argument in rdi to the included body via rcx */
|
||||
#include "cn2/cnv2_rwz_double_main_loop.inc"
|
||||
add rsp, 48 /* release the reservation made above */
|
||||
ret 0
|
||||
mov eax, 3735929054 /* never executed (follows ret); 3735929054 == 0xDEADC0DE. NOTE(review): presumably an end-of-routine marker - confirm */
|
||||
52
src/crypto/cn/asm/cn_main_loop.asm
Normal file
52
src/crypto/cn/asm/cn_main_loop.asm
Normal file
@@ -0,0 +1,52 @@
|
||||
; MASM build of the CryptoNight v2 main-loop entry points.
; Everything lives in one page-aligned, readable and executable segment.
; Each PROC consists only of the body pulled in with INCLUDE, a "ret 0", and
; a trailing "mov eax, 3735929054" that is never executed.
; 3735929054 == 0DEADC0DEh. NOTE(review): presumably an end-of-routine marker
; for code that scans or copies these routines - confirm.
; Unlike the GAS wrappers there is no register shuffling here, so the included
; bodies run with the registers exactly as the caller left them.
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
|
||||
PUBLIC cnv2_mainloop_ivybridge_asm
|
||||
PUBLIC cnv2_mainloop_ryzen_asm
|
||||
PUBLIC cnv2_mainloop_bulldozer_asm
|
||||
PUBLIC cnv2_double_mainloop_sandybridge_asm
|
||||
PUBLIC cnv2_rwz_mainloop_asm
|
||||
PUBLIC cnv2_rwz_double_mainloop_asm
|
||||
|
||||
; Single-hash main loop, body from cn2/cnv2_main_loop_ivybridge.inc.
ALIGN(64)
|
||||
cnv2_mainloop_ivybridge_asm PROC
|
||||
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
|
||||
ret 0
|
||||
mov eax, 3735929054 ; unreachable marker (0DEADC0DEh)
|
||||
cnv2_mainloop_ivybridge_asm ENDP
|
||||
|
||||
; Single-hash main loop, body from cn2/cnv2_main_loop_ryzen.inc.
ALIGN(64)
|
||||
cnv2_mainloop_ryzen_asm PROC
|
||||
INCLUDE cn2/cnv2_main_loop_ryzen.inc
|
||||
ret 0
|
||||
mov eax, 3735929054 ; unreachable marker (0DEADC0DEh)
|
||||
cnv2_mainloop_ryzen_asm ENDP
|
||||
|
||||
; Single-hash main loop, body from cn2/cnv2_main_loop_bulldozer.inc.
ALIGN(64)
|
||||
cnv2_mainloop_bulldozer_asm PROC
|
||||
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
|
||||
ret 0
|
||||
mov eax, 3735929054 ; unreachable marker (0DEADC0DEh)
|
||||
cnv2_mainloop_bulldozer_asm ENDP
|
||||
|
||||
; Double-hash main loop, body from cn2/cnv2_double_main_loop_sandybridge.inc.
ALIGN(64)
|
||||
cnv2_double_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
|
||||
ret 0
|
||||
mov eax, 3735929054 ; unreachable marker (0DEADC0DEh)
|
||||
cnv2_double_mainloop_sandybridge_asm ENDP
|
||||
|
||||
; Single-hash "rwz" main loop, body from cn2/cnv2_rwz_main_loop.inc.
ALIGN(64)
|
||||
cnv2_rwz_mainloop_asm PROC
|
||||
INCLUDE cn2/cnv2_rwz_main_loop.inc
|
||||
ret 0
|
||||
mov eax, 3735929054 ; unreachable marker (0DEADC0DEh)
|
||||
cnv2_rwz_mainloop_asm ENDP
|
||||
|
||||
; Double-hash "rwz" main loop, body from cn2/cnv2_rwz_double_main_loop.inc.
ALIGN(64)
|
||||
cnv2_rwz_double_mainloop_asm PROC
|
||||
INCLUDE cn2/cnv2_rwz_double_main_loop.inc
|
||||
ret 0
|
||||
mov eax, 3735929054 ; unreachable marker (0DEADC0DEh)
|
||||
cnv2_rwz_double_mainloop_asm ENDP
|
||||
|
||||
_TEXT_CNV2_MAINLOOP ENDS
|
||||
END
|
||||
281
src/crypto/cn/asm/win64/CryptonightR_soft_aes_template_win.inc
Normal file
281
src/crypto/cn/asm/win64/CryptonightR_soft_aes_template_win.inc
Normal file
@@ -0,0 +1,281 @@
|
||||
PUBLIC CryptonightR_soft_aes_template_part1
|
||||
PUBLIC CryptonightR_soft_aes_template_mainloop
|
||||
PUBLIC CryptonightR_soft_aes_template_part2
|
||||
PUBLIC CryptonightR_soft_aes_template_part3
|
||||
PUBLIC CryptonightR_soft_aes_template_end
|
||||
|
||||
; CryptonightR_soft_aes_template_part1 - prologue of the template.
; On entry rcx points to an array of pointers; its first element ("ctx"
; below) addresses the state block this code reads. The prologue saves the
; non-volatile GPRs and xmm6-xmm13, loads the working registers from ctx and
; sets the iteration count, then falls through into the main loop.
;
; Stack layout after the 8 pushes (64 bytes) and "sub rsp, 232":
;   [rsp+16 .. rsp+143]  saved xmm6-xmm13
;   [rsp+144 .. rsp+159] working copy of the four dwords at ctx+96..ctx+111
;   [rsp+160 .. rsp+191] spill slots used by the main loop
;   [rsp+304]            ctx (stored at entry [rsp+8])
;   [rsp+320], [rsp+328] scratch slots at entry [rsp+24] / [rsp+32]
;
; State on fall-through:
;   r8    = [ctx+32] xor [ctx+0]      r9 = [ctx+40] xor [ctx+8]
;   xmm4  = { [ctx+48] xor [ctx+16], [ctx+56] xor [ctx+24] }
;   xmm5  = { [ctx+80] xor [ctx+64], [ctx+88] xor [ctx+72] }
;   r10   = ctx, r11 = xmm12 = pointer stored at ctx+224 (memory base)
;   rax   = [rsp+328] = r8 and 2097136, [rsp+320] = r9, r12d = 524288
ALIGN(64)
|
||||
CryptonightR_soft_aes_template_part1:
|
||||
mov rcx, [rcx] ; rcx = ctx (first pointer of the array)
|
||||
|
||||
mov QWORD PTR [rsp+8], rcx ; keep ctx in the slot just above the return address
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 232
|
||||
|
||||
mov eax, [rcx+96] ; copy the four dwords at ctx+96..ctx+111 ...
|
||||
mov ebx, [rcx+100]
|
||||
mov esi, [rcx+104]
|
||||
mov edx, [rcx+108]
|
||||
mov [rsp+144], eax ; ... into the stack slots the loop works on
|
||||
mov [rsp+148], ebx
|
||||
mov [rsp+152], esi
|
||||
mov [rsp+156], edx
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r10, rcx ; r10 = ctx (rcx is reused below)
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+40]
|
||||
xor r9, QWORD PTR [rcx+8]
|
||||
movd xmm4, rax
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r11, QWORD PTR [rcx+224] ; r11 = base pointer used for every [offset+r11] access
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r10+72]
|
||||
mov rax, QWORD PTR [r10+80]
|
||||
movd xmm0, rdx
|
||||
xor rax, QWORD PTR [r10+64]
|
||||
|
||||
movaps XMMWORD PTR [rsp+16], xmm6 ; save xmm6-xmm13; restored in part3
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+48], xmm8
|
||||
movaps XMMWORD PTR [rsp+64], xmm9
|
||||
movaps XMMWORD PTR [rsp+80], xmm10
|
||||
movaps XMMWORD PTR [rsp+96], xmm11
|
||||
movaps XMMWORD PTR [rsp+112], xmm12
|
||||
movaps XMMWORD PTR [rsp+128], xmm13
|
||||
|
||||
movd xmm5, rax
|
||||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
and eax, 2097136 ; 2097136 = 1FFFF0h: 16-byte aligned offset below 2 MiB
|
||||
movd xmm10, QWORD PTR [r10+96]
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
xorps xmm9, xmm9
|
||||
mov QWORD PTR [rsp+328], rax ; current offset, re-read at the top of the loop
|
||||
movd xmm12, r11 ; spare copy of the base pointer (r11d is used as a temporary in the loop)
|
||||
mov QWORD PTR [rsp+320], r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
movd xmm13, rcx
|
||||
mov r12d, 524288 ; iteration count (80000h)
|
||||
|
||||
; CryptonightR_soft_aes_template_mainloop - first half of one iteration.
; Inputs: rax = current offset (also in [rsp+328]), r11 = memory base,
; r8/r9 = running 64-bit pair, r10 = ctx, r12d = remaining iterations,
; xmm4/xmm5 = the two 128-bit values carried between iterations.
;
; Steps:
;  1. Transform the 16-byte line at r11+offset through dword table lookups
;     (tables reached through the pointer at ctx+272) into xmm6.
;     NOTE(review): this has the shape of a table-driven AES round, matching
;     the "soft_aes" file name - confirm.
;  2. XOR xmm6 with {r8, r9} and with the three lines at offset^16, ^32, ^48,
;     and write those three lines back rotated and incremented.
;  3. Store xmm6 xor xmm4 to the current line, derive the second offset from
;     the low bits of xmm6 and load the data found there.
;  4. Spill rbx/rdi/rbp/r10, save rsp in r10 and load ebx/esi/edi/ebp from the
;     four stack dwords, with esp, r15d, eax, edx and r9d set to dwords taken
;     from xmm7, xmm4 and xmm5.
; NOTE(review): rsp holds data, not a stack address, from "movd esp, xmm7"
; until part2 restores it; presumably generated code is placed between this
; block and part2 - confirm. As assembled here control falls straight through.
ALIGN(64)
|
||||
CryptonightR_soft_aes_template_mainloop:
|
||||
movd xmm11, r12d ; park the loop counter; r12 becomes the table pointer
|
||||
mov r12, QWORD PTR [r10+272] ; r12 = table pointer stored at ctx+272
|
||||
lea r13, QWORD PTR [rax+r11] ; r13 = address of the current 16-byte line
|
||||
mov esi, DWORD PTR [r13] ; dword 0 of the line
|
||||
movd xmm0, r9
|
||||
mov r10d, DWORD PTR [r13+4] ; dword 1
|
||||
movd xmm7, r8
|
||||
mov ebp, DWORD PTR [r13+12] ; dword 3
|
||||
mov r14d, DWORD PTR [r13+8] ; dword 2
|
||||
mov rdx, QWORD PTR [rsp+328] ; rdx = current offset
|
||||
movzx ecx, sil ; lowest byte of each dword indexes the table at r12 ...
|
||||
shr esi, 8
|
||||
punpcklqdq xmm7, xmm0 ; xmm7 = { r8, r9 }
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b ; ... second byte indexes the table at r12+1024 ...
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256 ; +256 entries selects the table 1024 bytes further on
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048 ; ... third and fourth bytes use the tables at +2048 and +3072
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4] ; r11d is a temporary here; r11 is reloaded from xmm12 below
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
movd xmm1, r11d
|
||||
add ebp, 256
|
||||
movd r11, xmm12 ; r11 = memory base again
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
mov rcx, rdx
|
||||
xor eax, r15d
|
||||
punpckldq xmm2, xmm1 ; xmm2 = { r9d, r11d }
|
||||
xor rcx, 16 ; rcx = offset ^ 16
|
||||
movd xmm6, eax
|
||||
mov rax, rdx
|
||||
punpckldq xmm6, xmm0 ; xmm6 = { eax, r10d }
|
||||
xor rax, 32 ; rax = offset ^ 32
|
||||
punpckldq xmm6, xmm2 ; xmm6 = { eax, r9d, r10d, r11d }
|
||||
xor rdx, 48 ; rdx = offset ^ 48
|
||||
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm2
|
||||
pxor xmm6, xmm7 ; mix in { r8, r9 }
|
||||
paddq xmm2, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||
pxor xmm6, xmm1
|
||||
pxor xmm6, xmm0
|
||||
paddq xmm0, xmm5
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm0 ; [offset^16] = old [offset^48] + xmm5
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2 ; [offset^32] = old [offset^16] + xmm4
|
||||
movd rcx, xmm13
|
||||
paddq xmm1, xmm7
|
||||
movdqu XMMWORD PTR [rdx+r11], xmm1 ; [offset^48] = old [offset^32] + xmm7
|
||||
movd rdi, xmm6 ; rdi = low qword of xmm6
|
||||
mov r10, rdi
|
||||
and r10d, 2097136 ; r10 = second offset (1FFFF0h mask)
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [r13], xmm0 ; current line = xmm6 xor xmm4
|
||||
|
||||
mov ebx, [rsp+144] ; rbx = (d0 + d1) | ((d2 + d3) << 32), 32-bit sums of the four stack dwords
|
||||
mov ebp, [rsp+152]
|
||||
add ebx, [rsp+148]
|
||||
add ebp, [rsp+156]
|
||||
shl rbp, 32
|
||||
or rbx, rbp
|
||||
|
||||
xor rbx, QWORD PTR [r10+r11]
|
||||
lea r14, QWORD PTR [r10+r11] ; r14 = address of the second line; part2 stores to it
|
||||
mov rbp, QWORD PTR [r14+8]
|
||||
|
||||
mov [rsp+160], rbx ; spill the values part2 reloads
|
||||
mov [rsp+168], rdi
|
||||
mov [rsp+176], rbp
|
||||
mov [rsp+184], r10
|
||||
mov r10, rsp ; remember the real stack pointer; rsp is repurposed below
|
||||
|
||||
mov ebx, [rsp+144] ; ebx, esi, edi, ebp = the four stack dwords
|
||||
mov esi, [rsp+148]
|
||||
mov edi, [rsp+152]
|
||||
mov ebp, [rsp+156]
|
||||
|
||||
movd esp, xmm7 ; esp = dword 0 of xmm7; the stack is unusable from here until part2
|
||||
movaps xmm0, xmm7
|
||||
psrldq xmm0, 8
|
||||
movd r15d, xmm0 ; r15d = dword 2 of xmm7
|
||||
movd eax, xmm4 ; eax = dword 0 of xmm4
|
||||
movd edx, xmm5 ; edx = dword 0 of xmm5
|
||||
movaps xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movd r9d, xmm0 ; r9d = dword 2 of xmm5
|
||||
|
||||
; CryptonightR_soft_aes_template_part2 - second half of one iteration.
; Expects r10 = real stack pointer and ebx/esi/edi/ebp = the four working
; dwords. It restores rsp, writes the dwords back to [rsp+144..159], folds
; them into r8 and the saved r9, performs the 64x64 multiply, updates the
; three neighbouring lines of the second offset, stores the new pair to that
; line, computes the next offset and loops until the counter reaches zero.
CryptonightR_soft_aes_template_part2:
|
||||
mov rsp, r10 ; stack pointer is valid again
|
||||
mov [rsp+144], ebx ; write the four working dwords back
|
||||
mov [rsp+148], esi
|
||||
mov [rsp+152], edi
|
||||
mov [rsp+156], ebp
|
||||
|
||||
mov edi, edi ; zero-extend edi into rdi
|
||||
shl rbp, 32
|
||||
or rbp, rdi
|
||||
xor r8, rbp ; r8 ^= edi | (ebp << 32)
|
||||
|
||||
mov ebx, ebx ; zero-extend ebx into rbx
|
||||
shl rsi, 32
|
||||
or rsi, rbx
|
||||
xor QWORD PTR [rsp+320], rsi ; saved r9 ^= ebx | (esi << 32)
|
||||
|
||||
mov rbx, [rsp+160] ; reload the values spilled before rsp was repurposed
|
||||
mov rdi, [rsp+168]
|
||||
mov rbp, [rsp+176]
|
||||
mov r10, [rsp+184] ; r10 = second offset
|
||||
|
||||
mov r9, r10
|
||||
xor r9, 16 ; r9 = offset ^ 16
|
||||
mov rcx, r10
|
||||
xor rcx, 32 ; rcx = offset ^ 32
|
||||
xor r10, 48 ; r10 = offset ^ 48
|
||||
mov rax, rbx
|
||||
mul rdi ; rdx:rax = rbx * rdi (unsigned 128-bit product)
|
||||
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm2
|
||||
pxor xmm6, xmm1
|
||||
paddq xmm1, xmm7
|
||||
add r8, rdx ; r8 += high half of the product
|
||||
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm6, xmm0
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0 ; [offset^16] = old [offset^48] + xmm5
|
||||
movdqa xmm5, xmm4 ; shift the carried values: xmm5 <- xmm4 ...
|
||||
mov r9, QWORD PTR [rsp+320]
|
||||
movdqa xmm4, xmm6 ; ... xmm4 <- xmm6
|
||||
add r9, rax ; r9 += low half of the product
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm2 ; [offset^32] = old [offset^16] + old xmm4
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1 ; [offset^48] = old [offset^32] + xmm7
|
||||
mov r10, QWORD PTR [rsp+304] ; r10 = ctx, saved at entry [rsp+8] by part1
|
||||
movd r12d, xmm11 ; restore the loop counter
|
||||
mov QWORD PTR [r14], r8 ; store the pair to the second line
|
||||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
and eax, 2097136 ; next offset (1FFFF0h mask)
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+320], r9
|
||||
mov QWORD PTR [rsp+328], rax
|
||||
sub r12d, 1
|
||||
jne CryptonightR_soft_aes_template_mainloop
|
||||
|
||||
; CryptonightR_soft_aes_template_part3 - epilogue.
; Restores xmm6-xmm13 from the slots part1 saved them to, releases the
; 232-byte frame, pops the GPRs in the reverse of part1's push order and
; returns. CryptonightR_soft_aes_template_end marks the first byte past the
; template.
CryptonightR_soft_aes_template_part3:
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||
|
||||
add rsp, 232
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
ret
|
||||
CryptonightR_soft_aes_template_end:
|
||||
1585
src/crypto/cn/asm/win64/CryptonightR_template.asm
Normal file
1585
src/crypto/cn/asm/win64/CryptonightR_template.asm
Normal file
File diff suppressed because it is too large
Load Diff
536
src/crypto/cn/asm/win64/CryptonightR_template_win.inc
Normal file
536
src/crypto/cn/asm/win64/CryptonightR_template_win.inc
Normal file
@@ -0,0 +1,536 @@
|
||||
PUBLIC CryptonightR_template_part1
|
||||
PUBLIC CryptonightR_template_mainloop
|
||||
PUBLIC CryptonightR_template_part2
|
||||
PUBLIC CryptonightR_template_part3
|
||||
PUBLIC CryptonightR_template_end
|
||||
PUBLIC CryptonightR_template_double_part1
|
||||
PUBLIC CryptonightR_template_double_mainloop
|
||||
PUBLIC CryptonightR_template_double_part2
|
||||
PUBLIC CryptonightR_template_double_part3
|
||||
PUBLIC CryptonightR_template_double_part4
|
||||
PUBLIC CryptonightR_template_double_end
|
||||
|
||||
; CryptonightR_template_part1 - prologue of the single-hash template.
; On entry rcx points to an array of pointers; its first element ("ctx"
; below) addresses the state block this code reads.
;
; rbx, rbp and rsi are saved in the slots at entry [rsp+16], [rsp+24] and
; [rsp+32]; after the 7 pushes (56 bytes) and "sub rsp, 64" those slots are
; [rsp+136], [rsp+144] and [rsp+152], which is where part3 reloads them from.
; xmm6-xmm9 are saved at [rsp+48], [rsp+32], [rsp+16] and [rsp].
;
; State on fall-through into the main loop:
;   rsp  = [ctx+32] xor [ctx+0]  (data; the real stack pointer is in xmm9)
;   r15  = [ctx+40] xor [ctx+8]
;   xmm6 = { [ctx+48] xor [ctx+16], [ctx+56] xor [ctx+24] }
;   xmm7 = { [ctx+80] xor [ctx+64], [ctx+88] xor [ctx+72] }
;   r11  = pointer stored at ctx+224 (memory base)
;   r9d  = r10d = first offset, r8d = 524288 iterations
;   ebx, esi, edi, ebp = the four dwords at ctx+96..ctx+111
; The stack must not be used (push/pop/call) until part3 restores rsp.
ALIGN(64)
|
||||
CryptonightR_template_part1:
|
||||
mov rcx, [rcx] ; rcx = ctx (first pointer of the array)
|
||||
|
||||
mov QWORD PTR [rsp+16], rbx ; save rbx/rbp/rsi above the return address
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rdi
|
||||
sub rsp, 64
|
||||
mov r12, rcx
|
||||
mov r8, QWORD PTR [r12+32]
|
||||
mov rdx, r12 ; rdx = ctx for the remaining loads
|
||||
xor r8, QWORD PTR [r12]
|
||||
mov r15, QWORD PTR [r12+40]
|
||||
mov r9, r8
|
||||
xor r15, QWORD PTR [r12+8]
|
||||
mov r11, QWORD PTR [r12+224] ; r11 = base pointer used for every [offset+r11] access
|
||||
mov r12, QWORD PTR [r12+56]
|
||||
xor r12, QWORD PTR [rdx+24]
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movd xmm0, r12
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
movaps XMMWORD PTR [rsp], xmm9 ; saved before xmm9 is reused to hold rsp below
|
||||
mov r12, QWORD PTR [rdx+88]
|
||||
xor r12, QWORD PTR [rdx+72]
|
||||
movd xmm6, rax
|
||||
mov rax, QWORD PTR [rdx+80]
|
||||
xor rax, QWORD PTR [rdx+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
and r9d, 2097136 ; 2097136 = 1FFFF0h: 16-byte aligned offset below 2 MiB
|
||||
movd xmm0, r12
|
||||
movd xmm7, rax
|
||||
punpcklqdq xmm7, xmm0
|
||||
mov r10d, r9d
|
||||
movd xmm9, rsp ; park the real stack pointer in xmm9
|
||||
mov rsp, r8 ; rsp now carries data for the whole loop
|
||||
mov r8d, 524288 ; iteration count (80000h)
|
||||
|
||||
mov ebx, [rdx+96] ; the four working dwords
|
||||
mov esi, [rdx+100]
|
||||
mov edi, [rdx+104]
|
||||
mov ebp, [rdx+108]
|
||||
|
||||
; CryptonightR_template_mainloop - first half of one iteration.
; Inputs: r9d = current offset, r11 = memory base, rsp/r15 = running 64-bit
; pair (rsp holds data here), xmm6/xmm7 = 128-bit values carried between
; iterations, ebx/esi/edi/ebp = the four working dwords, r8d = counter.
;
; It runs one AESENC round on the current line with {rsp, r15} as the round
; key, XORs in the lines at offset^16/^32/^48 and writes those lines back
; rotated and incremented, stores xmm5 xor xmm6 to the current line, derives
; the second offset (r10d) from the low bits of xmm5 and loads the data at
; that offset into r13/r14.
; On exit eax, edx and r9d hold dwords taken from xmm6 and xmm7.
; NOTE(review): presumably generated code is placed between this block and
; part2 and consumes those registers - confirm. As assembled here control
; falls straight through, and part2 overwrites eax, edx and r9d.
ALIGN(64)
|
||||
CryptonightR_template_mainloop:
|
||||
movdqa xmm5, XMMWORD PTR [r9+r11] ; current line (aligned load)
|
||||
movd xmm0, r15
|
||||
movd xmm4, rsp
|
||||
punpcklqdq xmm4, xmm0 ; xmm4 = { rsp, r15 }
|
||||
lea rdx, QWORD PTR [r9+r11] ; rdx = address of the current line
|
||||
|
||||
aesenc xmm5, xmm4
|
||||
|
||||
mov r13d, r9d
|
||||
mov eax, r9d
|
||||
xor r9d, 48 ; r9d = offset ^ 48
|
||||
xor r13d, 16 ; r13d = offset ^ 16
|
||||
xor eax, 32 ; eax = offset ^ 32
|
||||
movdqu xmm0, XMMWORD PTR [r9+r11]
|
||||
movaps xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [r13+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
pxor xmm0, xmm2
|
||||
pxor xmm5, xmm1
|
||||
pxor xmm5, xmm0 ; xmm5 ^= [^16] xor [^32] xor [^48]
|
||||
|
||||
movd r12, xmm5 ; r12 = low qword of xmm5 (multiplier in part2)
|
||||
movd r10d, xmm5
|
||||
and r10d, 2097136 ; r10d = second offset (1FFFF0h mask)
|
||||
|
||||
paddq xmm3, xmm7
|
||||
paddq xmm2, xmm6
|
||||
paddq xmm1, xmm4
|
||||
movdqu XMMWORD PTR [r13+r11], xmm3 ; [offset^16] = old [offset^48] + xmm7
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2 ; [offset^32] = old [offset^16] + xmm6
|
||||
movdqu XMMWORD PTR [r9+r11], xmm1 ; [offset^48] = old [offset^32] + xmm4
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [rdx], xmm0 ; current line = xmm5 xor xmm6
|
||||
|
||||
lea r13d, [ebx+esi] ; r13 = (ebx + esi) | ((edi + ebp) << 32), 32-bit sums
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or r13, rdx
|
||||
|
||||
movd eax, xmm6 ; eax = dword 0 of xmm6
|
||||
movd edx, xmm7 ; edx = dword 0 of xmm7
|
||||
pextrd r9d, xmm7, 2 ; r9d = dword 2 of xmm7
|
||||
|
||||
xor r13, QWORD PTR [r10+r11]
|
||||
mov r14, QWORD PTR [r10+r11+8]
|
||||
|
||||
; CryptonightR_template_part2 - second half of one iteration.
; Folds the four working dwords into the running pair (rsp, r15), performs
; the 64x64 multiply r13 * r12, updates the three neighbouring lines of the
; second offset, stores the new pair to that line, computes the next offset
; in r9d and loops until r8d reaches zero.
; rsp still carries data throughout this block.
CryptonightR_template_part2:
|
||||
lea rcx, [r10+r11] ; rcx = address of the second line
|
||||
|
||||
mov eax, edi
|
||||
mov edx, ebp
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor rsp, rax ; rsp ^= edi | (ebp << 32)
|
||||
|
||||
mov eax, ebx
|
||||
mov edx, esi
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r15, rax ; r15 ^= ebx | (esi << 32)
|
||||
|
||||
mov rax, r13
|
||||
mul r12 ; rdx:rax = r13 * r12 (unsigned 128-bit product)
|
||||
add r15, rax ; r15 += low half
|
||||
add rsp, rdx ; rsp += high half
|
||||
|
||||
mov r9d, r10d
|
||||
mov r12d, r10d
|
||||
xor r9d, 16 ; r9d = offset ^ 16
|
||||
xor r12d, 32 ; r12d = offset ^ 32
|
||||
xor r10d, 48 ; r10d = offset ^ 48
|
||||
movdqa xmm1, XMMWORD PTR [r12+r11]
|
||||
movaps xmm3, xmm1
|
||||
movdqa xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqa xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm5, xmm0
|
||||
pxor xmm5, xmm1 ; xmm5 ^= [^16] xor [^32] xor [^48]
|
||||
paddq xmm3, xmm4
|
||||
paddq xmm2, xmm6
|
||||
paddq xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0 ; [offset^16] = old [offset^48] + xmm7
|
||||
movdqu XMMWORD PTR [r12+r11], xmm2 ; [offset^32] = old [offset^16] + xmm6
|
||||
movdqu XMMWORD PTR [r10+r11], xmm3 ; [offset^48] = old [offset^32] + xmm4
|
||||
|
||||
movdqa xmm7, xmm6 ; shift the carried values: xmm7 <- xmm6 ...
|
||||
mov QWORD PTR [rcx], rsp ; store the pair to the second line
|
||||
xor rsp, r13
|
||||
mov r9d, esp
|
||||
mov QWORD PTR [rcx+8], r15
|
||||
and r9d, 2097136 ; next offset (1FFFF0h mask)
|
||||
xor r15, r14
|
||||
movdqa xmm6, xmm5 ; ... xmm6 <- xmm5
|
||||
dec r8d
|
||||
jnz CryptonightR_template_mainloop
|
||||
|
||||
; CryptonightR_template_part3 - epilogue of the single-hash template.
; Restores the real stack pointer from xmm9, reloads rbx/rbp/rsi from the
; slots part1 stored them in (entry [rsp+16..32], now [rsp+136..152]),
; restores xmm6-xmm9, releases the 64-byte frame, pops the GPRs in the
; reverse of part1's push order and returns.
; CryptonightR_template_end marks the first byte past the template.
CryptonightR_template_part3:
|
||||
movd rsp, xmm9 ; stack pointer is valid again
|
||||
|
||||
mov rbx, QWORD PTR [rsp+136]
|
||||
mov rbp, QWORD PTR [rsp+144]
|
||||
mov rsi, QWORD PTR [rsp+152]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+16]
|
||||
movaps xmm9, XMMWORD PTR [rsp] ; restored last: xmm9 held rsp until the first instruction above
|
||||
add rsp, 64
|
||||
pop rdi
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
ret 0
|
||||
CryptonightR_template_end:
|
||||
|
||||
; CryptonightR_template_double_part1 - prologue of the two-hash template.
; On entry rcx points to an array of pointers; element 0 ("ctx0") and
; element 1 ("ctx1") address the two state blocks processed in lock step.
;
; rbx is saved at entry [rsp+24]; after the 7 pushes (56 bytes) and
; "sub rsp, 320" that slot is [rsp+400], where part4 reloads it from.
; Frame layout:
;   [rsp+0  .. rsp+15]   copy of the 16 bytes at ctx1+96
;   [rsp+16 .. rsp+31]   copy of the 16 bytes at ctx0+96
;   [rsp+104], [rsp+112] spill slots used inside the loop
;   [rsp+128], [rsp+136] memory base pointers of ctx0 / ctx1
;   [rsp+160 .. rsp+319] saved xmm6-xmm15
;
; State on fall-through into the main loop:
;   hash 0: r14/r12 = running pair, xmm7/xmm9 = carried 128-bit values,
;           rsi = xmm14 = base pointer (ctx0+224), ebx = first offset
;   hash 1: r15/r13 = running pair, xmm8/xmm10 = carried 128-bit values,
;           rdi = xmm15 = base pointer (ctx1+224), r8d = first offset
;   r11d = 524288 iterations
ALIGN(64)
|
||||
CryptonightR_template_double_part1:
|
||||
mov rdx, [rcx+8] ; rdx = ctx1 (second pointer of the array)
|
||||
mov rcx, [rcx] ; rcx = ctx0 (first pointer of the array)
|
||||
|
||||
mov QWORD PTR [rsp+24], rbx ; save rbx above the return address
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 320
|
||||
mov r14, QWORD PTR [rcx+32]
|
||||
mov r8, rcx ; r8 = ctx0 for the remaining loads
|
||||
xor r14, QWORD PTR [rcx] ; r14 = [ctx0+32] xor [ctx0+0]
|
||||
mov r12, QWORD PTR [rcx+40]
|
||||
mov ebx, r14d
|
||||
mov rsi, QWORD PTR [rcx+224] ; rsi = memory base of hash 0
|
||||
and ebx, 2097136 ; 2097136 = 1FFFF0h: 16-byte aligned offset below 2 MiB
|
||||
xor r12, QWORD PTR [rcx+8] ; r12 = [ctx0+40] xor [ctx0+8]
|
||||
mov rcx, QWORD PTR [rcx+56]
|
||||
xor rcx, QWORD PTR [r8+24]
|
||||
mov rax, QWORD PTR [r8+48]
|
||||
xor rax, QWORD PTR [r8+16]
|
||||
mov r15, QWORD PTR [rdx+32]
|
||||
xor r15, QWORD PTR [rdx] ; r15 = [ctx1+32] xor [ctx1+0]
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [r8+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
mov r13, QWORD PTR [rdx+40]
|
||||
mov rdi, QWORD PTR [rdx+224] ; rdi = memory base of hash 1
|
||||
xor r13, QWORD PTR [rdx+8] ; r13 = [ctx1+40] xor [ctx1+8]
|
||||
movaps XMMWORD PTR [rsp+160], xmm6 ; save xmm6-xmm15; restored in part4
|
||||
movaps XMMWORD PTR [rsp+176], xmm7
|
||||
movaps XMMWORD PTR [rsp+192], xmm8
|
||||
movaps XMMWORD PTR [rsp+208], xmm9
|
||||
movaps XMMWORD PTR [rsp+224], xmm10
|
||||
movaps XMMWORD PTR [rsp+240], xmm11
|
||||
movaps XMMWORD PTR [rsp+256], xmm12
|
||||
movaps XMMWORD PTR [rsp+272], xmm13
|
||||
movaps XMMWORD PTR [rsp+288], xmm14
|
||||
movaps XMMWORD PTR [rsp+304], xmm15
|
||||
movd xmm7, rax
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
|
||||
movaps xmm1, XMMWORD PTR [rdx+96] ; aligned loads of the 16 bytes at ctx1+96 / ctx0+96
|
||||
movaps xmm2, XMMWORD PTR [r8+96]
|
||||
movaps XMMWORD PTR [rsp], xmm1 ; working dwords of hash 1
|
||||
movaps XMMWORD PTR [rsp+16], xmm2 ; working dwords of hash 0
|
||||
|
||||
mov r8d, r15d
|
||||
punpcklqdq xmm7, xmm0 ; xmm7 = { [ctx0+48]^[ctx0+16], [ctx0+56]^[ctx0+24] }
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [rdx+56]
|
||||
xor rcx, QWORD PTR [rdx+24]
|
||||
movd xmm9, rax
|
||||
mov QWORD PTR [rsp+128], rsi
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
punpcklqdq xmm9, xmm0 ; xmm9 = { [ctx0+80]^[ctx0+64], [ctx0+88]^[ctx0+72] }
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [rdx+88]
|
||||
xor rcx, QWORD PTR [rdx+72]
|
||||
movd xmm8, rax
|
||||
mov QWORD PTR [rsp+136], rdi
|
||||
mov rax, QWORD PTR [rdx+80]
|
||||
xor rax, QWORD PTR [rdx+64]
|
||||
punpcklqdq xmm8, xmm0 ; xmm8 = { [ctx1+48]^[ctx1+16], [ctx1+56]^[ctx1+24] }
|
||||
and r8d, 2097136 ; first offset of hash 1
|
||||
movd xmm0, rcx
|
||||
mov r11d, 524288 ; iteration count (80000h)
|
||||
movd xmm10, rax
|
||||
punpcklqdq xmm10, xmm0 ; xmm10 = { [ctx1+80]^[ctx1+64], [ctx1+88]^[ctx1+72] }
|
||||
|
||||
movd xmm14, QWORD PTR [rsp+128] ; xmm14 = memory base of hash 0
|
||||
movd xmm15, QWORD PTR [rsp+136] ; xmm15 = memory base of hash 1
|
||||
|
||||
; CryptonightR_template_double_mainloop - first stage of one iteration.
; Performs the AESENC step and the three-line shuffle for hash 0 (base rsi,
; offset ebx, key {r14, r12}, carried xmm7/xmm9) and then for hash 1 (base
; rdi, offset r8d, key {r15, r13}, carried xmm8/xmm10). It then loads the
; data at hash 0's second offset and prepares the register set for the code
; that follows: rsp and several GPRs are parked in xmm registers and stack
; slots, ebx/esi/edi/ebp are loaded with hash 0's four working dwords, and
; esp, r15d, eax, edx, r9d receive dwords from xmm3, xmm7 and xmm9.
; NOTE(review): rsp holds data from "movd esp, xmm3" until part2 restores it;
; presumably generated code is placed between this block and part2 - confirm.
; As assembled here control falls straight through.
ALIGN(64)
|
||||
CryptonightR_template_double_mainloop:
|
||||
movdqu xmm6, XMMWORD PTR [rbx+rsi] ; hash 0: current line
|
||||
movd xmm0, r12
|
||||
mov ecx, ebx ; ecx = hash 0 offset (ebx is modified below)
|
||||
movd xmm3, r14
|
||||
punpcklqdq xmm3, xmm0 ; xmm3 = { r14, r12 }
|
||||
xor ebx, 16 ; ebx = offset ^ 16
|
||||
aesenc xmm6, xmm3
|
||||
movd xmm4, r15
|
||||
movdqu xmm0, XMMWORD PTR [rbx+rsi]
|
||||
pxor xmm6, xmm0
|
||||
xor ebx, 48 ; ebx = offset ^ 32
|
||||
paddq xmm0, xmm7
|
||||
movdqu xmm1, XMMWORD PTR [rbx+rsi]
|
||||
pxor xmm6, xmm1
|
||||
movdqu XMMWORD PTR [rbx+rsi], xmm0 ; [offset^32] = old [offset^16] + xmm7
|
||||
paddq xmm1, xmm3
|
||||
xor ebx, 16 ; ebx = offset ^ 48
|
||||
mov eax, ebx
|
||||
xor rax, 32 ; rax = offset ^ 16
|
||||
movdqu xmm0, XMMWORD PTR [rbx+rsi]
|
||||
pxor xmm6, xmm0
|
||||
movd rdx, xmm6 ; rdx = low qword of xmm6 (multiplier in part2)
|
||||
movdqu XMMWORD PTR [rbx+rsi], xmm1 ; [offset^48] = old [offset^32] + xmm3
|
||||
paddq xmm0, xmm9
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0 ; [offset^16] = old [offset^48] + xmm9
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm0 ; current line = xmm6 xor xmm7
|
||||
mov esi, edx
|
||||
movdqu xmm5, XMMWORD PTR [r8+rdi] ; hash 1: current line
|
||||
and esi, 2097136 ; esi = hash 0 second offset (1FFFF0h mask)
|
||||
mov ecx, r8d ; ecx = hash 1 offset (r8d is modified below)
|
||||
movd xmm0, r13
|
||||
punpcklqdq xmm4, xmm0 ; xmm4 = { r15, r13 }
|
||||
xor r8d, 16 ; r8d = offset ^ 16
|
||||
aesenc xmm5, xmm4
|
||||
movdqu xmm0, XMMWORD PTR [r8+rdi]
|
||||
pxor xmm5, xmm0
|
||||
xor r8d, 48 ; r8d = offset ^ 32
|
||||
paddq xmm0, xmm8
|
||||
movdqu xmm1, XMMWORD PTR [r8+rdi]
|
||||
pxor xmm5, xmm1
|
||||
movdqu XMMWORD PTR [r8+rdi], xmm0 ; [offset^32] = old [offset^16] + xmm8
|
||||
paddq xmm1, xmm4
|
||||
xor r8d, 16 ; r8d = offset ^ 48
|
||||
mov eax, r8d
|
||||
xor rax, 32 ; rax = offset ^ 16
|
||||
movdqu xmm0, XMMWORD PTR [r8+rdi]
|
||||
pxor xmm5, xmm0
|
||||
movdqu XMMWORD PTR [r8+rdi], xmm1 ; [offset^48] = old [offset^32] + xmm4
|
||||
paddq xmm0, xmm10
|
||||
movdqu XMMWORD PTR [rax+rdi], xmm0 ; [offset^16] = old [offset^48] + xmm10
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rcx+rdi], xmm0 ; current line = xmm5 xor xmm8
|
||||
movd rdi, xmm5 ; rdi = low qword of xmm5 (multiplier in part3)
|
||||
movd rcx, xmm14 ; rcx = memory base of hash 0
|
||||
mov ebp, edi
|
||||
mov r8, QWORD PTR [rcx+rsi] ; load both qwords at hash 0's second offset
|
||||
mov r10, QWORD PTR [rcx+rsi+8]
|
||||
lea r9, QWORD PTR [rcx+rsi] ; r9 = address of that line; part2 stores to it
|
||||
xor esi, 16
|
||||
|
||||
movd xmm0, rsp ; park the real stack pointer and live GPRs ...
|
||||
movd xmm1, rsi
|
||||
movd xmm2, rdi
|
||||
movd xmm11, rbp
|
||||
movd xmm12, r15
|
||||
movd xmm13, rdx
|
||||
mov [rsp+104], rcx ; ... and spill rcx / r9
|
||||
mov [rsp+112], r9
|
||||
|
||||
mov ebx, DWORD PTR [rsp+16] ; hash 0's four working dwords
|
||||
mov esi, DWORD PTR [rsp+20]
|
||||
mov edi, DWORD PTR [rsp+24]
|
||||
mov ebp, DWORD PTR [rsp+28]
|
||||
|
||||
lea eax, [ebx+esi] ; r8 ^= (ebx + esi) | ((edi + ebp) << 32), 32-bit sums
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r8, rax
|
||||
|
||||
movd esp, xmm3 ; esp = dword 0 of xmm3; the stack is unusable from here until part2
|
||||
pextrd r15d, xmm3, 2 ; r15d = dword 2 of xmm3
|
||||
movd eax, xmm7 ; eax = dword 0 of xmm7
|
||||
movd edx, xmm9 ; edx = dword 0 of xmm9
|
||||
pextrd r9d, xmm9, 2 ; r9d = dword 2 of xmm9
|
||||
|
||||
; CryptonightR_template_double_part2 - second stage of one iteration.
; Finishes hash 0: folds its four working dwords into (r14, r12), restores
; rsp and the parked registers, writes the dwords back to [rsp+16..31],
; multiplies r8 by the low qword of xmm6, shuffles the three neighbouring
; lines of the second offset, stores the new pair and derives hash 0's next
; offset in ebx.
; Then it prepares hash 1 the same way the main loop prepared hash 0: loads
; the data at hash 1's second offset, parks rsp and live GPRs, loads
; ebx/esi/edi/ebp with hash 1's four working dwords from [rsp..rsp+15], and
; sets esp, r15d, eax, edx, r9d from xmm4, xmm8 and xmm10.
; NOTE(review): rsp holds data again from "movd esp, xmm4" until part3;
; presumably generated code is placed between this block and part3 - confirm.
CryptonightR_template_double_part2:
|
||||
|
||||
mov eax, edi
|
||||
mov edx, ebp
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r14, rax ; r14 ^= edi | (ebp << 32)
|
||||
|
||||
mov eax, ebx
|
||||
mov edx, esi
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r12, rax ; r12 ^= ebx | (esi << 32)
|
||||
|
||||
movd rsp, xmm0 ; stack pointer is valid again
|
||||
mov DWORD PTR [rsp+16], ebx ; write hash 0's working dwords back
|
||||
mov DWORD PTR [rsp+20], esi
|
||||
mov DWORD PTR [rsp+24], edi
|
||||
mov DWORD PTR [rsp+28], ebp
|
||||
|
||||
movd rsi, xmm1 ; restore the registers parked by the main loop
|
||||
movd rdi, xmm2
|
||||
movd rbp, xmm11
|
||||
movd r15, xmm12
|
||||
movd rdx, xmm13
|
||||
mov rcx, [rsp+104]
|
||||
mov r9, [rsp+112]
|
||||
|
||||
mov rbx, r8 ; keep the multiplicand for the xor with r14 below
|
||||
mov rax, r8
|
||||
mul rdx ; rdx:rax = r8 * rdx (unsigned 128-bit product)
|
||||
and ebp, 2097136 ; ebp = hash 1 second offset (1FFFF0h mask)
|
||||
mov r8, rax
|
||||
movdqu xmm1, XMMWORD PTR [rcx+rsi] ; rsi = second offset ^ 16 here
|
||||
pxor xmm6, xmm1
|
||||
xor esi, 48 ; esi = second offset ^ 32
|
||||
paddq xmm1, xmm7
|
||||
movdqu xmm2, XMMWORD PTR [rsi+rcx]
|
||||
pxor xmm6, xmm2
|
||||
paddq xmm2, xmm3
|
||||
movdqu XMMWORD PTR [rsi+rcx], xmm1 ; [offset^32] = old [offset^16] + xmm7
|
||||
xor esi, 16 ; esi = second offset ^ 48
|
||||
mov eax, esi
|
||||
mov rsi, rcx ; rsi = memory base of hash 0 again
|
||||
movdqu xmm0, XMMWORD PTR [rax+rcx]
|
||||
pxor xmm6, xmm0
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm2 ; [offset^48] = old [offset^32] + xmm3
|
||||
paddq xmm0, xmm9
|
||||
add r12, r8 ; r12 += low half of the product
|
||||
xor rax, 32 ; rax = second offset ^ 16
|
||||
add r14, rdx ; r14 += high half of the product
|
||||
movdqa xmm9, xmm7 ; shift the carried values: xmm9 <- xmm7 ...
|
||||
movdqa xmm7, xmm6 ; ... xmm7 <- xmm6
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm0 ; [offset^16] = old [offset^48] + old xmm9
|
||||
mov QWORD PTR [r9+8], r12 ; store the pair to hash 0's second line
|
||||
xor r12, r10
|
||||
mov QWORD PTR [r9], r14
|
||||
movd rcx, xmm15 ; rcx = memory base of hash 1
|
||||
xor r14, rbx
|
||||
mov r10d, ebp ; r10d = hash 1 second offset
|
||||
mov ebx, r14d
|
||||
xor ebp, 16
|
||||
and ebx, 2097136 ; ebx = hash 0 next offset
|
||||
mov r8, QWORD PTR [r10+rcx] ; load both qwords at hash 1's second offset
|
||||
mov r9, QWORD PTR [r10+rcx+8]
|
||||
|
||||
movd xmm0, rsp ; park the real stack pointer and live GPRs ...
|
||||
movd xmm1, rbx
|
||||
movd xmm2, rsi
|
||||
movd xmm11, rdi
|
||||
movd xmm12, rbp
|
||||
movd xmm13, r15
|
||||
mov [rsp+104], rcx ; ... and spill rcx / r9
|
||||
mov [rsp+112], r9
|
||||
|
||||
mov ebx, DWORD PTR [rsp] ; hash 1's four working dwords
|
||||
mov esi, DWORD PTR [rsp+4]
|
||||
mov edi, DWORD PTR [rsp+8]
|
||||
mov ebp, DWORD PTR [rsp+12]
|
||||
|
||||
lea eax, [ebx+esi] ; rax = (ebx + esi) | ((edi + ebp) << 32), 32-bit sums
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
|
||||
xor r8, rax
|
||||
movd xmm3, r8 ; keep the multiplicand for part3
|
||||
|
||||
movd esp, xmm4 ; esp = dword 0 of xmm4; the stack is unusable from here until part3
|
||||
pextrd r15d, xmm4, 2 ; r15d = dword 2 of xmm4
|
||||
movd eax, xmm8 ; eax = dword 0 of xmm8
|
||||
movd edx, xmm10 ; edx = dword 0 of xmm10
|
||||
pextrd r9d, xmm10, 2 ; r9d = dword 2 of xmm10
|
||||
|
||||
; CryptonightR_template_double_part3 - third stage of one iteration.
; Finishes hash 1: folds its four working dwords into (r15, r13), restores
; rsp and the parked registers, writes the dwords back to [rsp..rsp+15],
; multiplies r8 by the low qword of xmm5, shuffles the three neighbouring
; lines of the second offset, stores the new pair, derives hash 1's next
; offset in r8d and loops until r11d reaches zero.
CryptonightR_template_double_part3:
|
||||
|
||||
movd r15, xmm13 ; r15 was reused as a 32-bit input; restore the running value
|
||||
|
||||
mov eax, edi
|
||||
mov edx, ebp
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r15, rax ; r15 ^= edi | (ebp << 32)
|
||||
|
||||
mov eax, ebx
|
||||
mov edx, esi
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r13, rax ; r13 ^= ebx | (esi << 32)
|
||||
|
||||
movd rsp, xmm0 ; stack pointer is valid again
|
||||
mov DWORD PTR [rsp], ebx ; write hash 1's working dwords back
|
||||
mov DWORD PTR [rsp+4], esi
|
||||
mov DWORD PTR [rsp+8], edi
|
||||
mov DWORD PTR [rsp+12], ebp
|
||||
|
||||
movd rbx, xmm1 ; restore the registers parked by part2
|
||||
movd rsi, xmm2
|
||||
movd rdi, xmm11
|
||||
movd rbp, xmm12
|
||||
mov rcx, [rsp+104]
|
||||
mov r9, [rsp+112]
|
||||
|
||||
mov rax, r8
|
||||
mul rdi ; rdx:rax = r8 * rdi (unsigned 128-bit product)
|
||||
mov rdi, rcx ; rdi = memory base of hash 1 again
|
||||
mov r8, rax
|
||||
movdqu xmm1, XMMWORD PTR [rbp+rcx] ; rbp = second offset ^ 16 here
|
||||
pxor xmm5, xmm1
|
||||
xor ebp, 48 ; ebp = second offset ^ 32
|
||||
paddq xmm1, xmm8
|
||||
add r13, r8 ; r13 += low half of the product
|
||||
movdqu xmm2, XMMWORD PTR [rbp+rcx]
|
||||
pxor xmm5, xmm2
|
||||
add r15, rdx ; r15 += high half of the product
|
||||
movdqu XMMWORD PTR [rbp+rcx], xmm1 ; [offset^32] = old [offset^16] + xmm8
|
||||
paddq xmm2, xmm4
|
||||
xor ebp, 16 ; ebp = second offset ^ 48
|
||||
mov eax, ebp
|
||||
xor rax, 32 ; rax = second offset ^ 16
|
||||
movdqu xmm0, XMMWORD PTR [rbp+rcx]
|
||||
pxor xmm5, xmm0
|
||||
movdqu XMMWORD PTR [rbp+rcx], xmm2 ; [offset^48] = old [offset^32] + xmm4
|
||||
paddq xmm0, xmm10
|
||||
movdqu XMMWORD PTR [rax+rcx], xmm0 ; [offset^16] = old [offset^48] + xmm10
|
||||
movd rax, xmm3 ; rax = multiplicand saved by part2
|
||||
movdqa xmm10, xmm8 ; shift the carried values: xmm10 <- xmm8 ...
|
||||
mov QWORD PTR [r10+rcx], r15 ; store the pair to hash 1's second line
|
||||
movdqa xmm8, xmm5 ; ... xmm8 <- xmm5
|
||||
xor r15, rax
|
||||
mov QWORD PTR [r10+rcx+8], r13
|
||||
mov r8d, r15d
|
||||
xor r13, r9
|
||||
and r8d, 2097136 ; r8d = hash 1 next offset (1FFFF0h mask)
|
||||
dec r11d
|
||||
jnz CryptonightR_template_double_mainloop
|
||||
|
||||
; CryptonightR_template_double_part4 - epilogue of the two-hash template.
; Reloads rbx from the slot part1 stored it in (entry [rsp+24], now
; [rsp+400]), restores xmm6-xmm15, releases the 320-byte frame, pops the GPRs
; in the reverse of part1's push order and returns.
; CryptonightR_template_double_end marks the first byte past the template.
CryptonightR_template_double_part4:
|
||||
|
||||
mov rbx, QWORD PTR [rsp+400]
|
||||
movaps xmm6, XMMWORD PTR [rsp+160]
|
||||
movaps xmm7, XMMWORD PTR [rsp+176]
|
||||
movaps xmm8, XMMWORD PTR [rsp+192]
|
||||
movaps xmm9, XMMWORD PTR [rsp+208]
|
||||
movaps xmm10, XMMWORD PTR [rsp+224]
|
||||
movaps xmm11, XMMWORD PTR [rsp+240]
|
||||
movaps xmm12, XMMWORD PTR [rsp+256]
|
||||
movaps xmm13, XMMWORD PTR [rsp+272]
|
||||
movaps xmm14, XMMWORD PTR [rsp+288]
|
||||
movaps xmm15, XMMWORD PTR [rsp+304]
|
||||
add rsp, 320
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
ret 0
|
||||
CryptonightR_template_double_end:
|
||||
268
src/crypto/cn/asm/win64/CryptonightWOW_soft_aes_template_win.inc
Normal file
268
src/crypto/cn/asm/win64/CryptonightWOW_soft_aes_template_win.inc
Normal file
@@ -0,0 +1,268 @@
|
||||
; Export the labels that mark the boundaries of each segment of the soft-AES
; CryptonightWOW template (part1, mainloop, part2, part3, end).
PUBLIC CryptonightWOW_soft_aes_template_part1
|
||||
PUBLIC CryptonightWOW_soft_aes_template_mainloop
|
||||
PUBLIC CryptonightWOW_soft_aes_template_part2
|
||||
PUBLIC CryptonightWOW_soft_aes_template_part3
|
||||
PUBLIC CryptonightWOW_soft_aes_template_end
|
||||
|
||||
; Prologue of the soft-AES CryptonightWOW template.
; In:  rcx = pointer to an array whose first element is the context pointer.
; Saves the non-volatile GPRs and xmm6-xmm13, copies the four dwords at
; context+96..108 to [rsp+144..156], and builds the initial loop state from
; XORed pairs of context qwords.
; Frame: 8 pushes (64 bytes) + 232 bytes of locals, so the return address is at
; [rsp+296] and the caller-provided home (shadow) space of the Win64 calling
; convention starts at [rsp+304].
ALIGN(64)
|
||||
CryptonightWOW_soft_aes_template_part1:
|
||||
; rcx = first pointer of the argument array (the context)
mov rcx, [rcx]
|
||||
|
||||
; Park the context pointer in the first home-space slot; part2 reloads it
; from [rsp+304] once the frame is set up.
mov QWORD PTR [rsp+8], rcx
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 232
|
||||
|
||||
; Copy the four 32-bit values at context+96..108 to stack slots 144..156.
mov eax, [rcx+96]
|
||||
mov ebx, [rcx+100]
|
||||
mov esi, [rcx+104]
|
||||
mov edx, [rcx+108]
|
||||
mov [rsp+144], eax
|
||||
mov [rsp+148], ebx
|
||||
mov [rsp+152], esi
|
||||
mov [rsp+156], edx
|
||||
|
||||
; Initial state: each value is context[hi] XOR context[lo].
;   rax = [48]^[16], r8 = [32]^[0], r9 = [40]^[8], rdx = [56]^[24]
mov rax, QWORD PTR [rcx+48]
|
||||
; r10 keeps the context pointer because rcx is overwritten below
mov r10, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+40]
|
||||
xor r9, QWORD PTR [rcx+8]
|
||||
movd xmm4, rax
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
; r11 = pointer stored at context+224; it is the base of every indexed memory
; access in the loop (presumably the scratchpad -- confirm against the caller)
mov r11, QWORD PTR [rcx+224]
|
||||
; rcx = [88]^[72], rax = [80]^[64]
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r10+72]
|
||||
mov rax, QWORD PTR [r10+80]
|
||||
movd xmm0, rdx
|
||||
xor rax, QWORD PTR [r10+64]
|
||||
|
||||
; Save the non-volatile XMM registers used by the loop.
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+48], xmm8
|
||||
movaps XMMWORD PTR [rsp+64], xmm9
|
||||
movaps XMMWORD PTR [rsp+80], xmm10
|
||||
movaps XMMWORD PTR [rsp+96], xmm11
|
||||
movaps XMMWORD PTR [rsp+112], xmm12
|
||||
movaps XMMWORD PTR [rsp+128], xmm13
|
||||
|
||||
movd xmm5, rax
|
||||
|
||||
mov rax, r8
|
||||
; xmm4 = { [48]^[16], [56]^[24] }
punpcklqdq xmm4, xmm0
|
||||
; rax = low 32 bits of r8 masked with 0x1FFFF0: a 16-byte-aligned offset
; below 2 MiB (the 32-bit write zero-extends into rax)
and eax, 2097136
|
||||
movd xmm10, QWORD PTR [r10+96]
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
xorps xmm9, xmm9
|
||||
; [rsp+328] holds the current block offset, [rsp+320] the running r9 value
; (both slots lie in the caller's home space)
mov QWORD PTR [rsp+328], rax
|
||||
; xmm12 keeps a copy of the base pointer so r11 can be reused as a temporary
movd xmm12, r11
|
||||
mov QWORD PTR [rsp+320], r9
|
||||
; xmm5 = { [80]^[64], [88]^[72] }
punpcklqdq xmm5, xmm0
|
||||
movd xmm13, rcx
|
||||
; loop counter: 524288 (0x80000) iterations
mov r12d, 524288
|
||||
|
||||
; First half of one loop iteration (soft-AES variant).
; On entry: rax = current block offset, r11 = base pointer, r8/r9 = the two
; running 64-bit values, r10 = context pointer, r12d = loop counter.
; The 16-byte block at base+offset is transformed with 32-bit table lookups
; instead of an AES instruction: each output dword is the XOR of one entry
; from each of four 256-entry dword tables located at r12, r12+1024, r12+2048
; and r12+3072 (T-table style round; the non-soft template uses aesenc at the
; same step).  The result is XORed with {r8, r9}.
; The tail of this segment loads a fixed set of registers and temporarily puts
; data into esp; the real stack pointer is held in r10 and restored by the
; first instruction of part2.  Presumably externally generated code is placed
; between this segment and part2 -- confirm against the code that consumes
; these templates.
ALIGN(64)
|
||||
CryptonightWOW_soft_aes_template_mainloop:
|
||||
; stash the loop counter; r12 is reused as the table pointer
movd xmm11, r12d
|
||||
; r12 = lookup-table pointer stored at context+272
mov r12, QWORD PTR [r10+272]
|
||||
; r13 = address of the current 16-byte block
lea r13, QWORD PTR [rax+r11]
|
||||
; load the block as four dwords: esi, r10d, r14d, ebp = dwords 0, 1, 2, 3
mov esi, DWORD PTR [r13]
|
||||
movd xmm0, r9
|
||||
mov r10d, DWORD PTR [r13+4]
|
||||
movd xmm7, r8
|
||||
mov ebp, DWORD PTR [r13+12]
|
||||
mov r14d, DWORD PTR [r13+8]
|
||||
; rdx = current block offset (used for the ^16/^32/^48 neighbours below)
mov rdx, QWORD PTR [rsp+328]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
; xmm7 = { r8, r9 }
punpcklqdq xmm7, xmm0
|
||||
; first table (offset 0): indexed by byte 0 of each dword
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
; second table (offset 1024): indexed by byte 1 of each dword
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
; eax = 256 + byte 3 of dword 2: index into the fourth table relative to the
; third one
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
; advance r12 to the third table; the fourth is reached with index+256
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
movd xmm1, r11d
|
||||
add ebp, 256
|
||||
; r11 was used as a temporary above; restore the base pointer from xmm12
movd r11, xmm12
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
; rcx/rax/rdx = block offset ^16 / ^32 / ^48
mov rcx, rdx
|
||||
xor eax, r15d
|
||||
punpckldq xmm2, xmm1
|
||||
xor rcx, 16
|
||||
movd xmm6, eax
|
||||
mov rax, rdx
|
||||
punpckldq xmm6, xmm0
|
||||
xor rax, 32
|
||||
; xmm6 = the four result dwords packed into one 128-bit value
punpckldq xmm6, xmm2
|
||||
xor rdx, 48
|
||||
; Rotate the three neighbouring blocks while adding state:
;   [^16] <- [^48] + xmm5,  [^32] <- [^16] + xmm4,  [^48] <- [^32] + xmm7
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||
; XOR the table-lookup result with { r8, r9 }
pxor xmm6, xmm7
|
||||
paddq xmm2, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||
paddq xmm0, xmm5
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm0
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movd rcx, xmm13
|
||||
paddq xmm1, xmm7
|
||||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
; rdi = low qword of the result; r10 = next block offset (masked 0x1FFFF0)
movd rdi, xmm6
|
||||
mov r10, rdi
|
||||
and r10d, 2097136
|
||||
; write result ^ xmm4 back to the block that was read at the top
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [r13], xmm0
|
||||
|
||||
; rbx = (d0 + d1) | ((d2 + d3) << 32), built from the four dwords kept at
; [rsp+144..156]
mov ebx, [rsp+144]
|
||||
mov ebp, [rsp+152]
|
||||
add ebx, [rsp+148]
|
||||
add ebp, [rsp+156]
|
||||
shl rbp, 32
|
||||
or rbx, rbp
|
||||
|
||||
; rbx ^= first qword of the second block; r14 = its address; rbp = its second
; qword
xor rbx, QWORD PTR [r10+r11]
|
||||
lea r14, QWORD PTR [r10+r11]
|
||||
mov rbp, QWORD PTR [r14+8]
|
||||
|
||||
; Spill the values part2 needs, because the registers are reloaded below.
mov [rsp+160], rbx
|
||||
mov [rsp+168], rdi
|
||||
mov [rsp+176], rbp
|
||||
mov [rsp+184], r10
|
||||
; keep the real stack pointer in r10 while esp is used as a data register
mov r10, rsp
|
||||
|
||||
; Load the four dwords into ebx/esi/edi/ebp (part2 stores them back).
mov ebx, [rsp+144]
|
||||
mov esi, [rsp+148]
|
||||
mov edi, [rsp+152]
|
||||
mov ebp, [rsp+156]
|
||||
|
||||
; WARNING: from here until part2 rsp is NOT a valid stack pointer.
; esp = low dword of xmm7, r15d = dword 2 of xmm7, eax/edx = low dwords of
; xmm4/xmm5.
movd esp, xmm7
|
||||
movaps xmm0, xmm7
|
||||
psrldq xmm0, 8
|
||||
movd r15d, xmm0
|
||||
movd eax, xmm4
|
||||
movd edx, xmm5
|
||||
|
||||
; Second half of one loop iteration (soft-AES variant).
; Restores the stack pointer, stores ebx/esi/edi/ebp back to [rsp+144..156],
; performs the 64x64->128 multiply and the second block shuffle, updates the
; running state and loops back to the mainloop label.
CryptonightWOW_soft_aes_template_part2:
|
||||
; restore the real stack pointer saved in r10 by the mainloop segment
mov rsp, r10
|
||||
mov [rsp+144], ebx
|
||||
mov [rsp+148], esi
|
||||
mov [rsp+152], edi
|
||||
mov [rsp+156], ebp
|
||||
|
||||
; Reload the values spilled by the mainloop segment.
mov rbx, [rsp+160]
|
||||
mov rdi, [rsp+168]
|
||||
mov rbp, [rsp+176]
|
||||
mov r10, [rsp+184]
|
||||
|
||||
; r9/rcx/r10 = second block offset ^16 / ^32 / ^48
mov r9, r10
|
||||
xor r9, 16
|
||||
mov rcx, r10
|
||||
xor rcx, 32
|
||||
xor r10, 48
|
||||
; rdx:rax = rbx * rdi (unsigned 128-bit product)
mov rax, rbx
|
||||
mul rdi
|
||||
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||
paddq xmm1, xmm7
|
||||
movd xmm0, rax
|
||||
movd xmm3, rdx
|
||||
; fold the ^32 block into the product: low half ^= [^32 + 8], high ^= [^32]
xor rax, QWORD PTR [r11+rcx+8]
|
||||
xor rdx, QWORD PTR [rcx+r11]
|
||||
; xmm3 = { high, low } of the unmodified product
punpcklqdq xmm3, xmm0
|
||||
add r8, rdx
|
||||
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm2, xmm3
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
; [^16] <- [^48] + xmm5
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
; rotate the state for the next iteration: xmm5 <- xmm4, xmm4 <- xmm6
movdqa xmm5, xmm4
|
||||
mov r9, QWORD PTR [rsp+320]
|
||||
movdqa xmm4, xmm6
|
||||
add r9, rax
|
||||
; [^32] <- ([^16] ^ product) + xmm4,  [^48] <- [^32] + xmm7
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
; reload the context pointer parked in the home space by part1
mov r10, QWORD PTR [rsp+304]
|
||||
; restore the loop counter stashed in xmm11
movd r12d, xmm11
|
||||
; write the updated pair back to the second block, then XOR with its old
; contents to form the next iteration's r8/r9
mov QWORD PTR [r14], r8
|
||||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
; rax = next block offset (low 32 bits of r8 masked with 0x1FFFF0)
and eax, 2097136
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+320], r9
|
||||
mov QWORD PTR [rsp+328], rax
|
||||
sub r12d, 1
|
||||
jne CryptonightWOW_soft_aes_template_mainloop
|
||||
|
||||
; Epilogue of the soft-AES template: restore xmm6-xmm13 and the pushed GPRs in
; the reverse order of part1, then return.
CryptonightWOW_soft_aes_template_part3:
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||
|
||||
; release the 232 bytes reserved in part1
add rsp, 232
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
ret
|
||||
; marks the end of the template
CryptonightWOW_soft_aes_template_end:
|
||||
491
src/crypto/cn/asm/win64/CryptonightWOW_template_win.inc
Normal file
491
src/crypto/cn/asm/win64/CryptonightWOW_template_win.inc
Normal file
@@ -0,0 +1,491 @@
|
||||
; Export the labels that mark the boundaries of each segment of the
; single-hash and double-hash CryptonightWOW templates.
PUBLIC CryptonightWOW_template_part1
|
||||
PUBLIC CryptonightWOW_template_mainloop
|
||||
PUBLIC CryptonightWOW_template_part2
|
||||
PUBLIC CryptonightWOW_template_part3
|
||||
PUBLIC CryptonightWOW_template_end
|
||||
PUBLIC CryptonightWOW_template_double_part1
|
||||
PUBLIC CryptonightWOW_template_double_mainloop
|
||||
PUBLIC CryptonightWOW_template_double_part2
|
||||
PUBLIC CryptonightWOW_template_double_part3
|
||||
PUBLIC CryptonightWOW_template_double_part4
|
||||
PUBLIC CryptonightWOW_template_double_end
|
||||
|
||||
; Prologue of the single-hash CryptonightWOW template (hardware AES).
; In:  rcx = pointer to an array whose first element is the context pointer.
; Saves rbx/rbp/rsi in the caller's home space, pushes seven more registers,
; saves xmm6-xmm9, and builds the initial state from XORed context qwords.
; Frame: 7 pushes (56 bytes) + 64 bytes, so the home-space slots written at
; entry as [rsp+16/24/32] are later found at [rsp+136/144/152].
; NOTE: this segment ends with rsp holding DATA; the real stack pointer is kept
; in xmm9 until part3.
ALIGN(64)
|
||||
CryptonightWOW_template_part1:
|
||||
; rcx = first pointer of the argument array (the context)
mov rcx, [rcx]
|
||||
|
||||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rdi
|
||||
sub rsp, 64
|
||||
; Initial state: r8 = [32]^[0], r15 = [40]^[8]; rdx keeps the context pointer
mov r12, rcx
|
||||
mov r8, QWORD PTR [r12+32]
|
||||
mov rdx, r12
|
||||
xor r8, QWORD PTR [r12]
|
||||
mov r15, QWORD PTR [r12+40]
|
||||
mov r9, r8
|
||||
xor r15, QWORD PTR [r12+8]
|
||||
; r11 = pointer stored at context+224, the base of all indexed accesses in
; the loop (presumably the scratchpad -- confirm against the caller)
mov r11, QWORD PTR [r12+224]
|
||||
; r12 = [56]^[24], rax = [48]^[16]
mov r12, QWORD PTR [r12+56]
|
||||
xor r12, QWORD PTR [rdx+24]
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
; save the non-volatile XMM registers used by the loop (interleaved with the
; state setup)
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movd xmm0, r12
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
movaps XMMWORD PTR [rsp], xmm9
|
||||
; r12 = [88]^[72], rax = [80]^[64]
mov r12, QWORD PTR [rdx+88]
|
||||
xor r12, QWORD PTR [rdx+72]
|
||||
movd xmm6, rax
|
||||
mov rax, QWORD PTR [rdx+80]
|
||||
xor rax, QWORD PTR [rdx+64]
|
||||
; xmm6 = { [48]^[16], [56]^[24] }
punpcklqdq xmm6, xmm0
|
||||
; r9 = first block offset: low 32 bits of r8 masked with 0x1FFFF0
and r9d, 2097136
|
||||
movd xmm0, r12
|
||||
movd xmm7, rax
|
||||
; xmm7 = { [80]^[64], [88]^[72] }
punpcklqdq xmm7, xmm0
|
||||
mov r10d, r9d
|
||||
; WARNING: rsp is repurposed as a data register from here on; the real stack
; pointer lives in xmm9 until part3 restores it.
movd xmm9, rsp
|
||||
mov rsp, r8
|
||||
; loop counter: 524288 (0x80000) iterations
mov r8d, 524288
|
||||
|
||||
; Load the four dwords at context+96..108 into ebx/esi/edi/ebp.
mov ebx, [rdx+96]
|
||||
mov esi, [rdx+100]
|
||||
mov edi, [rdx+104]
|
||||
mov ebp, [rdx+108]
|
||||
|
||||
; First half of one loop iteration (single hash, hardware AES).
; On entry: r9 = current block offset, r11 = base pointer, rsp/r15 = the two
; running 64-bit values (rsp is used as data here), r8d = loop counter,
; ebx/esi/edi/ebp = the four 32-bit values loaded in part1.
; The tail loads eax, edx and r9d from xmm6/xmm7; part2 overwrites rax and r9d
; before reading them, so these loads appear to serve code placed between this
; segment and part2 -- confirm against the code that consumes these templates.
ALIGN(64)
|
||||
CryptonightWOW_template_mainloop:
|
||||
movdqa xmm5, XMMWORD PTR [r9+r11]
|
||||
movd xmm0, r15
|
||||
movd xmm4, rsp
|
||||
; xmm4 = { rsp, r15 }, used as the round key
punpcklqdq xmm4, xmm0
|
||||
; rdx = address of the current block, for the write-back below
lea rdx, QWORD PTR [r9+r11]
|
||||
|
||||
; one AES round on the block; r10 = next offset taken from the result
aesenc xmm5, xmm4
|
||||
movd r10d, xmm5
|
||||
and r10d, 2097136
|
||||
|
||||
; r12/rax/r9 = block offset ^16 / ^32 / ^48
mov r12d, r9d
|
||||
mov eax, r9d
|
||||
xor r9d, 48
|
||||
xor r12d, 16
|
||||
xor eax, 32
|
||||
; Rotate the three neighbouring blocks while adding state:
;   [^16] <- [^48] + xmm7,  [^32] <- [^16] + xmm6,  [^48] <- [^32] + xmm4
movdqu xmm0, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm2, XMMWORD PTR [r12+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm2, xmm6
|
||||
paddq xmm1, xmm4
|
||||
movdqu XMMWORD PTR [r12+r11], xmm0
|
||||
; r12 = low qword of the AES result (multiplier in part2)
movd r12, xmm5
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movdqu XMMWORD PTR [r9+r11], xmm1
|
||||
|
||||
; write AES result ^ xmm6 back to the block that was read
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
|
||||
; r13 = (ebx + esi) | ((edi + ebp) << 32); the 32-bit lea results are
; zero-extended
lea r13d, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or r13, rdx
|
||||
|
||||
; r13 ^= first qword of the second block; r14 = its second qword
xor r13, QWORD PTR [r10+r11]
|
||||
mov r14, QWORD PTR [r10+r11+8]
|
||||
|
||||
; eax/edx = low dwords of xmm6/xmm7, r9d = dword 2 of xmm7
movd eax, xmm6
|
||||
movd edx, xmm7
|
||||
pextrd r9d, xmm7, 2
|
||||
|
||||
; Second half of one loop iteration (single hash).
; Multiplies r13 by r12, shuffles the blocks around the second offset (r10),
; updates the running pair (rsp, r15) and loops.  rsp is still a data register
; throughout this segment.
CryptonightWOW_template_part2:
|
||||
; rdx:rax = r13 * r12 (unsigned 128-bit product)
mov rax, r13
|
||||
mul r12
|
||||
movd xmm0, rax
|
||||
movd xmm3, rdx
|
||||
; xmm3 = { high, low } of the product
punpcklqdq xmm3, xmm0
|
||||
|
||||
; r9/r12/r10 = second block offset ^16 / ^32 / ^48
mov r9d, r10d
|
||||
mov r12d, r10d
|
||||
xor r9d, 16
|
||||
xor r12d, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [r12+r11]
|
||||
; fold the ^32 block into the product: high ^= [^32], low ^= [^32 + 8]
xor rdx, QWORD PTR [r12+r11]
|
||||
xor rax, QWORD PTR [r11+r12+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+r11]
|
||||
pxor xmm3, xmm2
|
||||
paddq xmm7, XMMWORD PTR [r10+r11]
|
||||
paddq xmm1, xmm4
|
||||
paddq xmm3, xmm6
|
||||
; [^16] <- [^48] + xmm7,  [^32] <- ([^16] ^ product) + xmm6,
; [^48] <- [^32] + xmm4
movdqu XMMWORD PTR [r9+r11], xmm7
|
||||
movdqu XMMWORD PTR [r12+r11], xmm3
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
|
||||
; rotate state: xmm7 <- xmm6 (xmm6 <- xmm5 follows below)
movdqa xmm7, xmm6
|
||||
add r15, rax
|
||||
add rsp, rdx
|
||||
; undo the ^48 so r10 is the second block offset again
xor r10, 48
|
||||
; write the updated pair to the second block, then XOR with r13/r14 to form
; the next iteration's values
mov QWORD PTR [r10+r11], rsp
|
||||
xor rsp, r13
|
||||
mov r9d, esp
|
||||
mov QWORD PTR [r10+r11+8], r15
|
||||
; r9 = next block offset (masked with 0x1FFFF0)
and r9d, 2097136
|
||||
xor r15, r14
|
||||
movdqa xmm6, xmm5
|
||||
dec r8d
|
||||
jnz CryptonightWOW_template_mainloop
|
||||
|
||||
; Epilogue of the single-hash template: recover the real stack pointer from
; xmm9, restore rbx/rbp/rsi from the caller's home space, restore xmm6-xmm9
; and the pushed registers, then return.
CryptonightWOW_template_part3:
|
||||
; rsp has been a data register since part1; restore it first
movd rsp, xmm9
|
||||
|
||||
; 136/144/152 = 120 bytes of frame + the 16/24/32 offsets used at entry
mov rbx, QWORD PTR [rsp+136]
|
||||
mov rbp, QWORD PTR [rsp+144]
|
||||
mov rsi, QWORD PTR [rsp+152]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+16]
|
||||
; xmm9 is restored last because it carried the stack pointer
movaps xmm9, XMMWORD PTR [rsp]
|
||||
add rsp, 64
|
||||
pop rdi
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
ret 0
|
||||
; marks the end of the single-hash template
CryptonightWOW_template_end:
|
||||
|
||||
; Prologue of the double-hash CryptonightWOW template (two contexts processed
; in one loop).
; In:  rcx = pointer to an array of two context pointers.
; After the loads below rcx/r8 refer to the first context and rdx to the
; second.  State held in registers afterwards:
;   first context:  r14/r12 running pair, rsi base pointer, ebx block offset,
;                   xmm7/xmm9 state vectors
;   second context: r15/r13 running pair, rdi base pointer, r8d block offset,
;                   xmm8/xmm10 state vectors
; Frame: 7 pushes (56 bytes) + 320 bytes, so the rbx slot written at entry as
; [rsp+24] is later found at [rsp+400].
ALIGN(64)
|
||||
CryptonightWOW_template_double_part1:
|
||||
; rdx = second context pointer, rcx = first context pointer
mov rdx, [rcx+8]
|
||||
mov rcx, [rcx]
|
||||
|
||||
; rbx is saved in the caller's home space instead of being pushed
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 320
|
||||
; first context: r14 = [32]^[0], r12 = [40]^[8]
mov r14, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r14, QWORD PTR [rcx]
|
||||
mov r12, QWORD PTR [rcx+40]
|
||||
mov ebx, r14d
|
||||
; rsi = base pointer of the first context (stored at context+224)
mov rsi, QWORD PTR [rcx+224]
|
||||
; ebx = first block offset of the first context (masked with 0x1FFFF0)
and ebx, 2097136
|
||||
xor r12, QWORD PTR [rcx+8]
|
||||
; rcx = [56]^[24], rax = [48]^[16] of the first context
mov rcx, QWORD PTR [rcx+56]
|
||||
xor rcx, QWORD PTR [r8+24]
|
||||
mov rax, QWORD PTR [r8+48]
|
||||
xor rax, QWORD PTR [r8+16]
|
||||
; second context: r15 = [32]^[0]
mov r15, QWORD PTR [rdx+32]
|
||||
xor r15, QWORD PTR [rdx]
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [r8+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
; second context: r13 = [40]^[8], rdi = base pointer (context+224)
mov r13, QWORD PTR [rdx+40]
|
||||
mov rdi, QWORD PTR [rdx+224]
|
||||
xor r13, QWORD PTR [rdx+8]
|
||||
; Save the non-volatile XMM registers xmm6-xmm15.
movaps XMMWORD PTR [rsp+160], xmm6
|
||||
movaps XMMWORD PTR [rsp+176], xmm7
|
||||
movaps XMMWORD PTR [rsp+192], xmm8
|
||||
movaps XMMWORD PTR [rsp+208], xmm9
|
||||
movaps XMMWORD PTR [rsp+224], xmm10
|
||||
movaps XMMWORD PTR [rsp+240], xmm11
|
||||
movaps XMMWORD PTR [rsp+256], xmm12
|
||||
movaps XMMWORD PTR [rsp+272], xmm13
|
||||
movaps XMMWORD PTR [rsp+288], xmm14
|
||||
movaps XMMWORD PTR [rsp+304], xmm15
|
||||
movd xmm7, rax
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
|
||||
; Copy the 16 bytes at context+96 of each context to the stack:
;   [rsp+0..15]  <- second context,  [rsp+16..31] <- first context
movaps xmm1, XMMWORD PTR [rdx+96]
|
||||
movaps xmm2, XMMWORD PTR [r8+96]
|
||||
movaps XMMWORD PTR [rsp], xmm1
|
||||
movaps XMMWORD PTR [rsp+16], xmm2
|
||||
|
||||
; r8 is reused from here on: block offset of the second context
mov r8d, r15d
|
||||
; xmm7 = first context { [48]^[16], [56]^[24] }
punpcklqdq xmm7, xmm0
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [rdx+56]
|
||||
xor rcx, QWORD PTR [rdx+24]
|
||||
movd xmm9, rax
|
||||
mov QWORD PTR [rsp+128], rsi
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
; xmm9 = first context { [80]^[64], [88]^[72] }
punpcklqdq xmm9, xmm0
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [rdx+88]
|
||||
xor rcx, QWORD PTR [rdx+72]
|
||||
movd xmm8, rax
|
||||
mov QWORD PTR [rsp+136], rdi
|
||||
mov rax, QWORD PTR [rdx+80]
|
||||
xor rax, QWORD PTR [rdx+64]
|
||||
; xmm8 = second context { [48]^[16], [56]^[24] }
punpcklqdq xmm8, xmm0
|
||||
and r8d, 2097136
|
||||
movd xmm0, rcx
|
||||
; loop counter: 524288 (0x80000) iterations
mov r11d, 524288
|
||||
movd xmm10, rax
|
||||
; xmm10 = second context { [80]^[64], [88]^[72] }
punpcklqdq xmm10, xmm0
|
||||
|
||||
; xmm14/xmm15 keep copies of the two base pointers so rsi/rdi can be reused
movd xmm14, QWORD PTR [rsp+128]
|
||||
movd xmm15, QWORD PTR [rsp+136]
|
||||
|
||||
; First segment of one double-hash iteration: the AES round and block shuffle
; for BOTH contexts, followed by the register set-up that precedes part2.
; On entry: ebx/rsi = block offset/base of the first context, r8d/rdi = block
; offset/base of the second, r14/r12 and r15/r13 = the running pairs.
; The tail saves live registers in xmm0-xmm2/xmm11-xmm13 and on the stack,
; loads the first context's four dwords from [rsp+16..28] and puts data into
; esp; part2 restores rsp from xmm0 as its first instruction.  Presumably
; externally generated code is placed between this segment and part2 --
; confirm against the code that consumes these templates.
ALIGN(64)
|
||||
CryptonightWOW_template_double_mainloop:
|
||||
; ---- first context ----
movdqu xmm6, XMMWORD PTR [rbx+rsi]
|
||||
movd xmm0, r12
|
||||
; ecx remembers the unmodified block offset for the write-back
mov ecx, ebx
|
||||
movd xmm3, r14
|
||||
; xmm3 = { r14, r12 }, used as the round key
punpcklqdq xmm3, xmm0
|
||||
; ebx walks through offset ^16, ^32 (16^48), ^48 (32^16) below
xor ebx, 16
|
||||
aesenc xmm6, xmm3
|
||||
; rdx = low qword of the AES result
movd rdx, xmm6
|
||||
movd xmm4, r15
|
||||
movdqu xmm0, XMMWORD PTR [rbx+rsi]
|
||||
xor ebx, 48
|
||||
paddq xmm0, xmm7
|
||||
movdqu xmm1, XMMWORD PTR [rbx+rsi]
|
||||
; [^32] <- [^16] + xmm7
movdqu XMMWORD PTR [rbx+rsi], xmm0
|
||||
paddq xmm1, xmm3
|
||||
xor ebx, 16
|
||||
; rax = offset ^16 again (48 ^ 32)
mov eax, ebx
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [rbx+rsi]
|
||||
; [^48] <- [^32] + xmm3
movdqu XMMWORD PTR [rbx+rsi], xmm1
|
||||
paddq xmm0, xmm9
|
||||
; [^16] <- [^48] + xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
; write AES result ^ xmm7 back to the block that was read
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm0
|
||||
; esi = second block offset of the first context (rsi stops being the base;
; the base is re-read from xmm14 below)
mov esi, edx
|
||||
; ---- second context ----
movdqu xmm5, XMMWORD PTR [r8+rdi]
|
||||
and esi, 2097136
|
||||
mov ecx, r8d
|
||||
movd xmm0, r13
|
||||
; xmm4 = { r15, r13 }, used as the round key
punpcklqdq xmm4, xmm0
|
||||
xor r8d, 16
|
||||
aesenc xmm5, xmm4
|
||||
movdqu xmm0, XMMWORD PTR [r8+rdi]
|
||||
xor r8d, 48
|
||||
paddq xmm0, xmm8
|
||||
movdqu xmm1, XMMWORD PTR [r8+rdi]
|
||||
; [^32] <- [^16] + xmm8
movdqu XMMWORD PTR [r8+rdi], xmm0
|
||||
paddq xmm1, xmm4
|
||||
xor r8d, 16
|
||||
mov eax, r8d
|
||||
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [r8+rdi]
|
||||
; [^48] <- [^32] + xmm4
movdqu XMMWORD PTR [r8+rdi], xmm1
|
||||
paddq xmm0, xmm10
|
||||
; [^16] <- [^48] + xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
|
||||
; write AES result ^ xmm8 back to the block that was read
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rcx+rdi], xmm0
|
||||
; rdi = low qword of the second AES result, ebp = its low dword
movd rdi, xmm5
|
||||
; rcx = base pointer of the first context
movd rcx, xmm14
|
||||
mov ebp, edi
|
||||
; r8/r10 = the two qwords of the first context's second block, r9 = its
; address
mov r8, QWORD PTR [rcx+rsi]
|
||||
mov r10, QWORD PTR [rcx+rsi+8]
|
||||
lea r9, QWORD PTR [rcx+rsi]
|
||||
xor esi, 16
|
||||
|
||||
; Save everything part2 needs, because the registers are reloaded below.
; xmm0 carries the real stack pointer.
movd xmm0, rsp
|
||||
movd xmm1, rsi
|
||||
movd xmm2, rdi
|
||||
movd xmm11, rbp
|
||||
movd xmm12, r15
|
||||
movd xmm13, rdx
|
||||
mov [rsp+104], rcx
|
||||
mov [rsp+112], r9
|
||||
|
||||
; Load the first context's four dwords.
mov ebx, DWORD PTR [rsp+16]
|
||||
mov esi, DWORD PTR [rsp+20]
|
||||
mov edi, DWORD PTR [rsp+24]
|
||||
mov ebp, DWORD PTR [rsp+28]
|
||||
|
||||
; r8 ^= (ebx + esi) | ((edi + ebp) << 32)
lea eax, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
xor r8, rax
|
||||
|
||||
; WARNING: from here until part2 rsp is NOT a valid stack pointer.
; esp/r15d = dwords 0/2 of xmm3, eax = low dword of xmm7, edx/r9d = dwords 0/2
; of xmm9.
movd esp, xmm3
|
||||
pextrd r15d, xmm3, 2
|
||||
movd eax, xmm7
|
||||
movd edx, xmm9
|
||||
pextrd r9d, xmm9, 2
|
||||
|
||||
; Second segment of one double-hash iteration: finishes the first context
; (multiply, second block shuffle, state update) and prepares the second
; context the same way the mainloop tail prepared the first.
; Ends with esp used as a data register again; part3 restores rsp from xmm0.
CryptonightWOW_template_double_part2:
|
||||
|
||||
; restore the real stack pointer, then store the first context's four dwords
; back
movd rsp, xmm0
|
||||
mov DWORD PTR [rsp+16], ebx
|
||||
mov DWORD PTR [rsp+20], esi
|
||||
mov DWORD PTR [rsp+24], edi
|
||||
mov DWORD PTR [rsp+28], ebp
|
||||
|
||||
; Reload the registers saved by the mainloop tail.
movd rsi, xmm1
|
||||
movd rdi, xmm2
|
||||
movd rbp, xmm11
|
||||
movd r15, xmm12
|
||||
movd rdx, xmm13
|
||||
mov rcx, [rsp+104]
|
||||
mov r9, [rsp+112]
|
||||
|
||||
; rbx keeps the multiplicand for the XOR at the end of this context
mov rbx, r8
|
||||
; rdx:rax = r8 * rdx (unsigned 128-bit product)
mov rax, r8
|
||||
mul rdx
|
||||
; ebp = second block offset of the second context (masked with 0x1FFFF0)
and ebp, 2097136
|
||||
mov r8, rax
|
||||
movd xmm1, rdx
|
||||
movd xmm0, r8
|
||||
; xmm1 = { high, low } of the product
punpcklqdq xmm1, xmm0
|
||||
; rsi currently holds offset ^16; it walks to ^32 and ^48 below
pxor xmm1, XMMWORD PTR [rcx+rsi]
|
||||
xor esi, 48
|
||||
paddq xmm1, xmm7
|
||||
movdqu xmm2, XMMWORD PTR [rsi+rcx]
|
||||
; fold the ^32 block into the product: high ^= [^32], low ^= [^32 + 8]
xor rdx, QWORD PTR [rsi+rcx]
|
||||
paddq xmm2, xmm3
|
||||
xor r8, QWORD PTR [rsi+rcx+8]
|
||||
; [^32] <- ([^16] ^ product) + xmm7
movdqu XMMWORD PTR [rsi+rcx], xmm1
|
||||
xor esi, 16
|
||||
mov eax, esi
|
||||
; rsi becomes the first context's base pointer again
mov rsi, rcx
|
||||
movdqu xmm0, XMMWORD PTR [rax+rcx]
|
||||
; [^48] <- [^32] + xmm3
movdqu XMMWORD PTR [rax+rcx], xmm2
|
||||
paddq xmm0, xmm9
|
||||
add r12, r8
|
||||
; rax = offset ^16 (48 ^ 32)
xor rax, 32
|
||||
add r14, rdx
|
||||
; rotate the first context's state: xmm9 <- xmm7, xmm7 <- xmm6
movdqa xmm9, xmm7
|
||||
movdqa xmm7, xmm6
|
||||
; [^16] <- [^48] + xmm9 (value of xmm9 before the rotation)
movdqu XMMWORD PTR [rax+rcx], xmm0
|
||||
; write the updated pair back to the second block (r9 = its address), then
; XOR with the old contents to form the next iteration's r14/r12
mov QWORD PTR [r9+8], r12
|
||||
xor r12, r10
|
||||
mov QWORD PTR [r9], r14
|
||||
; rcx = base pointer of the second context
movd rcx, xmm15
|
||||
xor r14, rbx
|
||||
mov r10d, ebp
|
||||
; ebx = next block offset of the first context
mov ebx, r14d
|
||||
xor ebp, 16
|
||||
and ebx, 2097136
|
||||
; r8/r9 = the two qwords of the second context's second block
mov r8, QWORD PTR [r10+rcx]
|
||||
mov r9, QWORD PTR [r10+rcx+8]
|
||||
|
||||
; Save everything part3 needs; xmm0 carries the real stack pointer.
movd xmm0, rsp
|
||||
movd xmm1, rbx
|
||||
movd xmm2, rsi
|
||||
movd xmm11, rdi
|
||||
movd xmm12, rbp
|
||||
movd xmm13, r15
|
||||
mov [rsp+104], rcx
|
||||
mov [rsp+112], r9
|
||||
|
||||
; Load the second context's four dwords.
mov ebx, DWORD PTR [rsp]
|
||||
mov esi, DWORD PTR [rsp+4]
|
||||
mov edi, DWORD PTR [rsp+8]
|
||||
mov ebp, DWORD PTR [rsp+12]
|
||||
|
||||
; r8 ^= (ebx + esi) | ((edi + ebp) << 32)
lea eax, [ebx+esi]
|
||||
lea edx, [edi+ebp]
|
||||
shl rdx, 32
|
||||
or rax, rdx
|
||||
|
||||
xor r8, rax
|
||||
; xmm3 keeps this multiplicand for the XOR at the end of part3
movd xmm3, r8
|
||||
|
||||
; WARNING: from here until part3 rsp is NOT a valid stack pointer.
; esp/r15d = dwords 0/2 of xmm4, eax = low dword of xmm8, edx/r9d = dwords 0/2
; of xmm10.
movd esp, xmm4
|
||||
pextrd r15d, xmm4, 2
|
||||
movd eax, xmm8
|
||||
movd edx, xmm10
|
||||
pextrd r9d, xmm10, 2
|
||||
|
||||
; Third segment of one double-hash iteration: finishes the second context
; (multiply, second block shuffle, state update), then decrements the loop
; counter and branches back to the double mainloop.
CryptonightWOW_template_double_part3:
|
||||
|
||||
; restore the real stack pointer, then store the second context's four dwords
; back
movd rsp, xmm0
|
||||
mov DWORD PTR [rsp], ebx
|
||||
mov DWORD PTR [rsp+4], esi
|
||||
mov DWORD PTR [rsp+8], edi
|
||||
mov DWORD PTR [rsp+12], ebp
|
||||
|
||||
; Reload the registers saved at the end of part2.
movd rbx, xmm1
|
||||
movd rsi, xmm2
|
||||
movd rdi, xmm11
|
||||
movd rbp, xmm12
|
||||
movd r15, xmm13
|
||||
mov rcx, [rsp+104]
|
||||
mov r9, [rsp+112]
|
||||
|
||||
; rdx:rax = r8 * rdi (unsigned 128-bit product)
mov rax, r8
|
||||
mul rdi
|
||||
movd xmm1, rdx
|
||||
movd xmm0, rax
|
||||
; xmm1 = { high, low } of the product
punpcklqdq xmm1, xmm0
|
||||
; rdi becomes the second context's base pointer again
mov rdi, rcx
|
||||
mov r8, rax
|
||||
; rbp currently holds offset ^16; it walks to ^32 and ^48 below
pxor xmm1, XMMWORD PTR [rbp+rcx]
|
||||
xor ebp, 48
|
||||
paddq xmm1, xmm8
|
||||
; fold the ^32 block into the product: low ^= [^32 + 8], high ^= [^32]
xor r8, QWORD PTR [rbp+rcx+8]
|
||||
xor rdx, QWORD PTR [rbp+rcx]
|
||||
add r13, r8
|
||||
movdqu xmm2, XMMWORD PTR [rbp+rcx]
|
||||
add r15, rdx
|
||||
; [^32] <- ([^16] ^ product) + xmm8
movdqu XMMWORD PTR [rbp+rcx], xmm1
|
||||
paddq xmm2, xmm4
|
||||
xor ebp, 16
|
||||
mov eax, ebp
|
||||
; rax = offset ^16 (48 ^ 32)
xor rax, 32
|
||||
movdqu xmm0, XMMWORD PTR [rbp+rcx]
|
||||
; [^48] <- [^32] + xmm4
movdqu XMMWORD PTR [rbp+rcx], xmm2
|
||||
paddq xmm0, xmm10
|
||||
; [^16] <- [^48] + xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
|
||||
; rax = multiplicand saved in xmm3 by part2
movd rax, xmm3
|
||||
; rotate the second context's state: xmm10 <- xmm8, xmm8 <- xmm5
movdqa xmm10, xmm8
|
||||
; write the updated pair back to the second block (offset r10), then XOR to
; form the next iteration's r15/r13
mov QWORD PTR [r10+rcx], r15
|
||||
movdqa xmm8, xmm5
|
||||
xor r15, rax
|
||||
mov QWORD PTR [r10+rcx+8], r13
|
||||
; r8d = next block offset of the second context (masked with 0x1FFFF0)
mov r8d, r15d
|
||||
xor r13, r9
|
||||
and r8d, 2097136
|
||||
dec r11d
|
||||
jnz CryptonightWOW_template_double_mainloop
|
||||
|
||||
; Epilogue of the double-hash template: restore rbx from the caller's home
; space, restore xmm6-xmm15 and the pushed registers, then return.
CryptonightWOW_template_double_part4:
|
||||
|
||||
; 400 = 376 bytes of frame (7 pushes + 320) + the offset 24 used at entry
mov rbx, QWORD PTR [rsp+400]
|
||||
movaps xmm6, XMMWORD PTR [rsp+160]
|
||||
movaps xmm7, XMMWORD PTR [rsp+176]
|
||||
movaps xmm8, XMMWORD PTR [rsp+192]
|
||||
movaps xmm9, XMMWORD PTR [rsp+208]
|
||||
movaps xmm10, XMMWORD PTR [rsp+224]
|
||||
movaps xmm11, XMMWORD PTR [rsp+240]
|
||||
movaps xmm12, XMMWORD PTR [rsp+256]
|
||||
movaps xmm13, XMMWORD PTR [rsp+272]
|
||||
movaps xmm14, XMMWORD PTR [rsp+288]
|
||||
movaps xmm15, XMMWORD PTR [rsp+304]
|
||||
add rsp, 320
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
ret 0
|
||||
; marks the end of the double-hash template
CryptonightWOW_template_double_end:
|
||||
@@ -0,0 +1,413 @@
|
||||
; Prologue and state setup of the double-hash main loop (the labels in this
; file carry the suffix "sandybridge").
; In:  rcx = pointer to an array of two context pointers.
; Frame: 8 pushes (64 bytes) + 184 bytes, so the return address is at
; [rsp+248] and the caller's home (shadow) space covers [rsp+256..287]; it is
; used for the saved MXCSR and the first context's running pair.
; State after setup:
;   first context:  r10/r11 running pair (copies at [rsp+256]/[rsp+264]),
;                   r13 base pointer (copy at [rsp]), edx block offset,
;                   xmm7/xmm3 state vectors, xmm15 current block
;   second context: rdi/rbp running pair, rsi base pointer, ecx block offset,
;                   xmm6/xmm8 state vectors, xmm11 current block
;   xmm4/xmm5 = qwords at context+96 / context+104 of {first, second} context
;   xmm13 = 0, xmm14 = {1<<52, 1<<52}, xmm12 = {1023<<52, 1023<<52}
; rdx = second context pointer, rcx = first context pointer
mov rdx, [rcx+8]
|
||||
mov rcx, [rcx]
|
||||
|
||||
; rax = stack pointer at entry; used below to address three XMM save slots
mov rax, rsp
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 184
|
||||
|
||||
; Save the caller's MXCSR and load 24448 (0x5F80): exception-mask bits 7-12
; set and the rounding-control field (bits 13-14) = 10b, which the Intel SDM
; defines as round toward +infinity -- verify against the SDM before relying
; on the exact mode.  The epilogue reloads the saved value.
stmxcsr DWORD PTR [rsp+272]
|
||||
mov DWORD PTR [rsp+276], 24448
|
||||
ldmxcsr DWORD PTR [rsp+276]
|
||||
|
||||
; r13 = base pointer of the first context (stored at context+224)
mov r13, QWORD PTR [rcx+224]
|
||||
; r9 = second context pointer, r8 = first context pointer
mov r9, rdx
|
||||
mov r10, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
; first context: r10 = [32]^[0], r11 = [40]^[8]
xor r10, QWORD PTR [rcx]
|
||||
; loop counter: 524288 (0x80000) iterations
mov r14d, 524288
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
; rsi = base pointer of the second context
mov rsi, QWORD PTR [rdx+224]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
; second context: rdi = [32]^[0], rbp = [40]^[8]
mov rdi, QWORD PTR [r9+32]
|
||||
xor rdi, QWORD PTR [r9]
|
||||
mov rbp, QWORD PTR [r9+40]
|
||||
xor rbp, QWORD PTR [r9+8]
|
||||
movd xmm0, rdx
|
||||
; Save xmm6-xmm15.  [rax-88], [rax-104], [rax-120] are [rsp+160], [rsp+144],
; [rsp+128] because rax = rsp + 248.
movaps XMMWORD PTR [rax-88], xmm6
|
||||
movaps XMMWORD PTR [rax-104], xmm7
|
||||
movaps XMMWORD PTR [rax-120], xmm8
|
||||
movaps XMMWORD PTR [rsp+112], xmm9
|
||||
movaps XMMWORD PTR [rsp+96], xmm10
|
||||
movaps XMMWORD PTR [rsp+80], xmm11
|
||||
movaps XMMWORD PTR [rsp+64], xmm12
|
||||
movaps XMMWORD PTR [rsp+48], xmm13
|
||||
movaps XMMWORD PTR [rsp+32], xmm14
|
||||
movaps XMMWORD PTR [rsp+16], xmm15
|
||||
mov rdx, r10
|
||||
movd xmm4, QWORD PTR [r8+96]
|
||||
; edx = first block offset of the first context (masked with 0x1FFFF0)
and edx, 2097136
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
; xmm13 = 0; copied into registers before each cvtsi2sd in the loop
xorps xmm13, xmm13
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
movd xmm5, QWORD PTR [r8+104]
|
||||
movd xmm7, rax
|
||||
|
||||
; xmm14 = { 1<<52, 1<<52 }
mov eax, 1
|
||||
shl rax, 52
|
||||
movd xmm14, rax
|
||||
punpcklqdq xmm14, xmm14
|
||||
|
||||
; xmm12 = { 1023<<52, 1023<<52 } = 0x3FF0000000000000, the bit pattern of the
; IEEE-754 double 1.0, in both lanes
mov eax, 1023
|
||||
shl rax, 52
|
||||
movd xmm12, rax
|
||||
punpcklqdq xmm12, xmm12
|
||||
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
; xmm7 = first context { [48]^[16], [56]^[24] }
punpcklqdq xmm7, xmm0
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [r9+56]
|
||||
xor rcx, QWORD PTR [r9+24]
|
||||
movd xmm3, rax
|
||||
mov rax, QWORD PTR [r9+48]
|
||||
xor rax, QWORD PTR [r9+16]
|
||||
; xmm3 = first context { [80]^[64], [88]^[72] }
punpcklqdq xmm3, xmm0
|
||||
movd xmm0, rcx
|
||||
; keep a copy of the first base pointer; r13 is temporarily repointed inside
; the loop
mov QWORD PTR [rsp], r13
|
||||
mov rcx, QWORD PTR [r9+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movd xmm6, rax
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
; xmm6 = second context { [48]^[16], [56]^[24] }
punpcklqdq xmm6, xmm0
|
||||
movd xmm0, rcx
|
||||
mov QWORD PTR [rsp+256], r10
|
||||
mov rcx, rdi
|
||||
mov QWORD PTR [rsp+264], r11
|
||||
movd xmm8, rax
|
||||
; ecx = first block offset of the second context
and ecx, 2097136
|
||||
; xmm8 = second context { [80]^[64], [88]^[72] }
punpcklqdq xmm8, xmm0
|
||||
movd xmm0, QWORD PTR [r9+96]
|
||||
; xmm4 = { first[96], second[96] }
punpcklqdq xmm4, xmm0
|
||||
movd xmm0, QWORD PTR [r9+104]
|
||||
; preload the first 16-byte block of each context
lea r8, QWORD PTR [rcx+rsi]
|
||||
movdqu xmm11, XMMWORD PTR [r8]
|
||||
; xmm5 = { first[104], second[104] }
punpcklqdq xmm5, xmm0
|
||||
lea r9, QWORD PTR [rdx+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r9]
|
||||
|
||||
; Main loop body, first part: AES round and block shuffle for both contexts,
; the 64x64 multiply for the first context, and the start of the packed
; integer-division step (one quotient per context, computed with divpd and
; corrected with integer arithmetic).
; On entry: edx/r13/r9 = block offset, base and block address of the first
; context, ecx/rsi/r8 = the same for the second; xmm15/xmm11 hold the
; preloaded blocks.
ALIGN(64)
|
||||
main_loop_double_sandybridge:
|
||||
; ---- first context: AES round ----
movdqu xmm9, xmm15
|
||||
; rax/rbx/rdx = block offset ^16 / ^32 / ^48
mov eax, edx
|
||||
mov ebx, edx
|
||||
xor eax, 16
|
||||
xor ebx, 32
|
||||
xor edx, 48
|
||||
|
||||
; xmm2 = { r10, r11 }, used as the round key
movd xmm0, r11
|
||||
movd xmm2, r10
|
||||
punpcklqdq xmm2, xmm0
|
||||
aesenc xmm9, xmm2
|
||||
|
||||
; [^32] <- [^16] + xmm7,  [^48] <- [^32] + xmm2,  [^16] <- [^48] + xmm3
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||
|
||||
; r11 = low qword of the AES result; edx = second block offset
movd r11, xmm9
|
||||
mov edx, r11d
|
||||
and edx, 2097136
|
||||
; write AES result ^ xmm7 back to the block that was read
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9], xmm0
|
||||
|
||||
; rbx = address of the second block, r10 = its first qword
lea rbx, QWORD PTR [rdx+r13]
|
||||
mov r10, QWORD PTR [rdx+r13]
|
||||
|
||||
; ---- second context: AES round ----
movdqu xmm10, xmm11
|
||||
movd xmm0, rbp
|
||||
movd xmm11, rdi
|
||||
; xmm11 = { rdi, rbp }, used as the round key
punpcklqdq xmm11, xmm0
|
||||
aesenc xmm10, xmm11
|
||||
|
||||
; rax/r12/rcx = block offset ^16 / ^32 / ^48
mov eax, ecx
|
||||
mov r12d, ecx
|
||||
xor eax, 16
|
||||
xor r12d, 32
|
||||
xor ecx, 48
|
||||
|
||||
; [^32] <- [^16] + xmm6,  [^48] <- [^32] + xmm11,  [^16] <- [^48] + xmm8
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||
paddq xmm0, xmm6
|
||||
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||
paddq xmm1, xmm11
|
||||
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
paddq xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
|
||||
; ecx = second block offset of the second context
movd rcx, xmm10
|
||||
and ecx, 2097136
|
||||
|
||||
; write AES result ^ xmm6 back to the block that was read
movdqa xmm0, xmm10
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [r8], xmm0
|
||||
; r12 = first qword of the second context's second block
mov r12, QWORD PTR [rcx+rsi]
|
||||
|
||||
; r9 = second qword of the first context's second block
mov r9, QWORD PTR [rbx+8]
|
||||
|
||||
; r8d/r15d = first context's second block offset ^16
xor edx, 16
|
||||
mov r8d, edx
|
||||
mov r15d, edx
|
||||
|
||||
; ---- first context: multiply ----
; r10 ^= (low qword of xmm5 << 32) ^ low qword of xmm4
movd rdx, xmm5
|
||||
shl rdx, 32
|
||||
movd rax, xmm4
|
||||
xor rdx, rax
|
||||
xor r10, rdx
|
||||
; rdx:rax = r10 * r11 (unsigned 128-bit product)
mov rax, r10
|
||||
mul r11
|
||||
; r11d = offset ^32 (16 ^ 48)
mov r11d, r8d
|
||||
xor r11d, 48
|
||||
movd xmm0, rdx
|
||||
; fold the ^32 block into the product: high ^= [^32], low ^= [^32 + 8]
xor rdx, [r11+r13]
|
||||
movd xmm1, rax
|
||||
xor rax, [r11+r13+8]
|
||||
; xmm0 = { high, low } of the unmodified product
punpcklqdq xmm0, xmm1
|
||||
|
||||
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||
; r8d = offset ^48 (16 ^ 32)
xor r8d, 32
|
||||
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
; [^32] <- ([^16] ^ product) + xmm7,  [^48] <- [^32] + xmm2,
; [^16] <- [^48] + xmm3
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [r15+r13], xmm0
|
||||
|
||||
; Update the first context's running pair (kept at [rsp+256]/[rsp+264]) and
; write it to the second block.
mov r11, QWORD PTR [rsp+256]
|
||||
add r11, rdx
|
||||
mov rdx, QWORD PTR [rsp+264]
|
||||
add rdx, rax
|
||||
mov QWORD PTR [rbx], r11
|
||||
xor r11, r10
|
||||
mov QWORD PTR [rbx+8], rdx
|
||||
xor rdx, r9
|
||||
mov QWORD PTR [rsp+256], r11
|
||||
; r11 = next block offset of the first context, saved at [rsp+8]
and r11d, 2097136
|
||||
mov QWORD PTR [rsp+264], rdx
|
||||
mov QWORD PTR [rsp+8], r11
|
||||
; r15 = address of the next block; xmm15 = its contents for the next iteration
lea r15, QWORD PTR [r11+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||
; r13 is repointed to the second context's second block until the end of the
; loop body
lea r13, QWORD PTR [rsi+rcx]
|
||||
; ---- division step for both contexts ----
; r10 = (high qword of xmm5 << 32) ^ high qword of xmm4 ^ r12
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
; xmm2/xmm0/xmm1 are zeroed from xmm13 before each cvtsi2sd
movaps xmm2, xmm13
|
||||
movd r10, xmm0
|
||||
psllq xmm5, 1
|
||||
shl r10, 32
|
||||
; r11/r8 = high qwords of the two AES results (the dividends)
movdqa xmm0, xmm9
|
||||
psrldq xmm0, 8
|
||||
movdqa xmm1, xmm10
|
||||
movd r11, xmm0
|
||||
psrldq xmm1, 8
|
||||
movd r8, xmm1
|
||||
psrldq xmm4, 8
|
||||
movaps xmm0, xmm13
|
||||
movd rax, xmm4
|
||||
xor r10, rax
|
||||
movaps xmm1, xmm13
|
||||
xor r10, r12
|
||||
; rax = (r11 + 1) >> 1: halved dividend, fed to the signed conversion below
lea rax, QWORD PTR [r11+1]
|
||||
shr rax, 1
|
||||
; xmm3 = { low qword of xmm9, low qword of xmm10 }
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm3, xmm10
|
||||
; xmm5 = 2*xmm5 + xmm3; its two qwords supply the divisors
paddq xmm5, xmm3
|
||||
movd rdx, xmm5
|
||||
psrldq xmm5, 8
|
||||
cvtsi2sd xmm2, rax
|
||||
; rdx = low 32 bits | 0x80000001 (the 32-bit write zero-extends): divisor of
; the first context, forced odd with bit 31 set
or edx, -2147483647
|
||||
lea rax, QWORD PTR [r8+1]
|
||||
shr rax, 1
|
||||
movd r9, xmm5
|
||||
cvtsi2sd xmm0, rax
|
||||
; r9 = divisor of the second context, formed the same way
or r9d, -2147483647
|
||||
cvtsi2sd xmm1, rdx
|
||||
; xmm2 = { dividend1/2, dividend2/2 } as doubles
unpcklpd xmm2, xmm0
|
||||
movaps xmm0, xmm13
|
||||
cvtsi2sd xmm0, r9
|
||||
; xmm1 = { divisor1, divisor2 } as doubles
unpcklpd xmm1, xmm0
|
||||
divpd xmm2, xmm1
|
||||
; adding 1<<52 to the bit pattern raises the exponent by one, i.e. doubles
; each quotient, compensating for the halved dividends
paddq xmm2, xmm14
|
||||
; rax = truncated quotient of the first context, kept in rbx
cvttsd2si rax, xmm2
|
||||
psrldq xmm2, 8
|
||||
mov rbx, rax
|
||||
; r11 = dividend - quotient * divisor; a negative remainder means the
; quotient is one too large and is corrected in div_fix_1_sandybridge
imul rax, rdx
|
||||
sub r11, rax
|
||||
js div_fix_1_sandybridge
|
||||
; Continuation after the first quotient: computes and checks the second
; quotient, packs both quotient/remainder pairs, then runs the packed
; square-root step and checks both results for the fix-up paths.
div_fix_1_ret_sandybridge:
|
||||
|
||||
; rdx = truncated quotient of the second context; r8 = its remainder
cvttsd2si rdx, xmm2
|
||||
mov rax, rdx
|
||||
imul rax, r9
|
||||
; xmm2 = remainder 1 (low dword), xmm4 = quotient 1 (low dword)
movd xmm2, r11d
|
||||
movd xmm4, ebx
|
||||
sub r8, rax
|
||||
js div_fix_2_sandybridge
|
||||
div_fix_2_ret_sandybridge:
|
||||
|
||||
movd xmm1, r8d
|
||||
movd xmm0, edx
|
||||
punpckldq xmm2, xmm1
|
||||
punpckldq xmm4, xmm0
|
||||
; xmm4 dwords = { quotient1, remainder1, quotient2, remainder2 }
punpckldq xmm4, xmm2
|
||||
; xmm3 = { low qword of xmm9, low qword of xmm10 } + division results
paddq xmm3, xmm4
|
||||
; Build the sqrt inputs: (xmm3 >> 12) + 0x3FF0000000000000 per lane, then take
; the packed double-precision square root.
movdqa xmm0, xmm3
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm12
|
||||
sqrtpd xmm1, xmm0
|
||||
movd r9, xmm1
|
||||
; xmm5 = raw sqrt bits >> 19 per lane: the provisional results
movdqa xmm5, xmm1
|
||||
psrlq xmm5, 19
|
||||
; if the low 19 bits of the raw result are all zero, take the fix-up path for
; lane 0
test r9, 524287
|
||||
je sqrt_fix_1_sandybridge
|
||||
sqrt_fix_1_ret_sandybridge:
|
||||
|
||||
; r9 = low qword of the second AES result (multiplier for the second context)
movd r9, xmm10
|
||||
; same low-19-bit check for lane 1
psrldq xmm1, 8
|
||||
movd r8, xmm1
|
||||
test r8, 524287
|
||||
je sqrt_fix_2_sandybridge
|
||||
; Loop tail: multiply and second block shuffle for the second context, state
; rotation for both contexts, reload of the first context's registers, and
; the loop branch.
sqrt_fix_2_ret_sandybridge:
|
||||
|
||||
; r12/r8/rcx = second context's second block offset ^16 / ^32 / ^48
mov r12d, ecx
|
||||
mov r8d, ecx
|
||||
xor r12d, 16
|
||||
xor r8d, 32
|
||||
xor ecx, 48
|
||||
; rdx:rax = r10 * r9 (unsigned 128-bit product)
mov rax, r10
|
||||
mul r9
|
||||
movd xmm0, rax
|
||||
movd xmm3, rdx
|
||||
; xmm3 = { high, low } of the product
punpcklqdq xmm3, xmm0
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||
pxor xmm0, xmm3
|
||||
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||
; fold the ^32 block into the product: high ^= [^32], low ^= [^32 + 8]
xor rdx, [r8+rsi]
|
||||
xor rax, [r8+rsi+8]
|
||||
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||
paddq xmm0, xmm6
|
||||
paddq xmm1, xmm11
|
||||
paddq xmm3, xmm8
|
||||
; [^32] <- ([^16] ^ product) + xmm6,  [^48] <- [^32] + xmm11,
; [^16] <- [^48] + xmm8
movdqu XMMWORD PTR [r8+rsi], xmm0
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm3
|
||||
|
||||
; Update the second context's running pair (rdi, rbp) and write it to the
; second block, whose address is in r13.
add rdi, rdx
|
||||
mov QWORD PTR [r13], rdi
|
||||
xor rdi, r10
|
||||
; ecx = next block offset of the second context; r8 = its address
mov ecx, edi
|
||||
and ecx, 2097136
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
|
||||
; rdx = old second qword of the block, read before it is overwritten
mov rdx, QWORD PTR [r13+8]
|
||||
add rbp, rax
|
||||
mov QWORD PTR [r13+8], rbp
|
||||
; preload the second context's next block
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||
xor rbp, rdx
|
||||
; Restore the first context's registers for the next iteration and rotate the
; state vectors: xmm3 <- xmm7, xmm8 <- xmm6, xmm7 <- xmm9, xmm6 <- xmm10.
mov r13, QWORD PTR [rsp]
|
||||
movdqa xmm3, xmm7
|
||||
mov rdx, QWORD PTR [rsp+8]
|
||||
movdqa xmm8, xmm6
|
||||
mov r10, QWORD PTR [rsp+256]
|
||||
movdqa xmm7, xmm9
|
||||
mov r11, QWORD PTR [rsp+264]
|
||||
movdqa xmm6, xmm10
|
||||
; r9 = address of the first context's next block (computed earlier as r15)
mov r9, r15
|
||||
dec r14d
|
||||
jne main_loop_double_sandybridge
|
||||
|
||||
; Epilogue: restore the caller's MXCSR, xmm6-xmm15 and the pushed registers.
; There is no ret here; control jumps to the endp label at the end of this
; file, so whatever follows the include is expected to return -- confirm
; against the file that includes this one.
ldmxcsr DWORD PTR [rsp+272]
|
||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||
; r11 = rsp + 184: the stack pointer value after the eight pushes
lea r11, QWORD PTR [rsp+184]
|
||||
movaps xmm6, XMMWORD PTR [r11-24]
|
||||
movaps xmm7, XMMWORD PTR [r11-40]
|
||||
movaps xmm8, XMMWORD PTR [r11-56]
|
||||
movaps xmm9, XMMWORD PTR [r11-72]
|
||||
movaps xmm10, XMMWORD PTR [r11-88]
|
||||
movaps xmm11, XMMWORD PTR [r11-104]
|
||||
movaps xmm12, XMMWORD PTR [r11-120]
|
||||
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||
; release the 184 bytes of locals
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_double_mainloop_asm_sandybridge_endp
|
||||
|
||||
; Out-of-line corrections for the division step, taken when the computed
; remainder is negative: decrement the quotient and add the divisor back to
; the remainder.
; First context: quotient in rbx, remainder in r11, divisor in rdx.
div_fix_1_sandybridge:
|
||||
dec rbx
|
||||
add r11, rdx
|
||||
jmp div_fix_1_ret_sandybridge
|
||||
|
||||
; Second context: quotient in rdx, remainder in r8, divisor in r9.
div_fix_2_sandybridge:
|
||||
dec rdx
|
||||
add r8, r9
|
||||
jmp div_fix_2_ret_sandybridge
|
||||
|
||||
; Out-of-line correction for lane 0 of the square-root step, taken when the
; low 19 bits of the raw sqrtpd result are zero.
; In:  r9 = raw result bits of lane 0, xmm3 = sqrt inputs before scaling,
;      xmm5 = provisional results (raw >> 19).
; Out: low qword of xmm5 replaced by the corrected value; high qword kept.
; Clobbers rax, rdx, r8, r9, r11, xmm0.
sqrt_fix_1_sandybridge:
|
||||
; r8 = lane-0 input value
movd r8, xmm3
|
||||
; xmm0 (low qword) = lane-1 provisional result, re-attached at the end
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
; step the raw result down by one before re-deriving the candidate
dec r9
|
||||
; r11 = -1022 << 32 (the 32-bit mov zero-extends, the shift then moves the
; pattern into the upper half)
mov r11d, -1022
|
||||
shl r11, 32
|
||||
; r9 = candidate (raw >> 19), rax = candidate >> 1 (raw >> 20)
mov rax, r9
|
||||
shr r9, 19
|
||||
shr rax, 20
|
||||
; rdx = (candidate - rax + r11 + 1) * (rax + r11)
mov rdx, r9
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+r11+1]
|
||||
add rax, r11
|
||||
imul rdx, rax
|
||||
; compare with the input: the borrow from the subtraction is added to the
; candidate, i.e. candidate += 1 when rdx < r8 (unsigned)
sub rdx, r8
|
||||
adc r9, 0
|
||||
; xmm5 = { corrected lane 0, lane 1 }
movd xmm5, r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_1_ret_sandybridge
|
||||
|
||||
; Out-of-line correction for lane 1 of the square-root step, taken when the
; low 19 bits of the raw sqrtpd result are zero.  Same arithmetic as the
; lane-0 fix-up, using r8/rbx/r11 instead of r9/r11/r8.
; In:  r8 = raw result bits of lane 1, xmm3 = sqrt inputs before scaling,
;      xmm5 = provisional results.
; Out: high qword of xmm5 replaced by the corrected value; low qword kept.
; Clobbers rax, rbx, rdx, r8, r11, xmm0, xmm3.
sqrt_fix_2_sandybridge:
|
||||
; r11 = lane-1 input value
psrldq xmm3, 8
|
||||
movd r11, xmm3
|
||||
dec r8
|
||||
; rbx = -1022 << 32
mov ebx, -1022
|
||||
shl rbx, 32
|
||||
; r8 = candidate (raw >> 19), rax = candidate >> 1 (raw >> 20)
mov rax, r8
|
||||
shr r8, 19
|
||||
shr rax, 20
|
||||
; rdx = (candidate - rax + rbx + 1) * (rax + rbx)
mov rdx, r8
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+rbx+1]
|
||||
add rax, rbx
|
||||
imul rdx, rax
|
||||
; candidate += 1 when rdx < r11 (unsigned borrow)
sub rdx, r11
|
||||
adc r8, 0
|
||||
; xmm5 = { lane 0 (unchanged), corrected lane 1 }
movd xmm0, r8
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_2_ret_sandybridge
|
||||
|
||||
; End marker of this include; the epilogue jumps here.
cnv2_double_mainloop_asm_sandybridge_endp:
|
||||
182
src/crypto/cn/asm/win64/cn2/cnv2_main_loop_bulldozer.inc
Normal file
182
src/crypto/cn/asm/win64/cn2/cnv2_main_loop_bulldozer.inc
Normal file
@@ -0,0 +1,182 @@
|
||||
; Entry sequence of the single-hash cnv2 main loop, Bulldozer variant.
; This fragment is INCLUDEd inside a PROC by cn_main_loop.asm, which supplies
; the final ret. rcx arrives pointing at a slot that holds the context
; pointer; the first instruction dereferences it.
mov rcx, [rcx]
|
||||
|
||||
; Spill rbx/rbp/rsi into the stack slots just above the return address
; (the caller-allocated home area of the Win64 calling convention), then
; push the remaining callee-saved GPRs and reserve 64 bytes of locals.
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
; Save the caller MXCSR at [rsp] and load 24448 (0x5F80): the default
; 0x1F80 mask bits plus 0x4000, i.e. rounding control = round up.
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
; Build the loop state from the context (base kept in r9 because rcx is
; reused below):
;   xmm3 = { [48]^[16], [56]^[24] }      xmm4 = { [80]^[64], [88]^[72] }
;   r8   = [32]^[0]     r11 = [40]^[8]   r15 = [96]   rdi = [104]
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
; ebp = 524288 (0x80000): iteration counter, decremented at the loop tail.
mov ebp, 524288
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movd xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
; rbx = pointer stored at offset 224; every [x+rbx] access in the loop uses
; it as the base. NOTE(review): presumably the scratchpad - confirm against
; the context struct definition.
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movd xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
; r10d = first offset: low bits of r8 masked with 2097136 (0x1FFFF0), i.e.
; a 16-byte-aligned offset below 2 MiB.
and r10d, 2097136
|
||||
; Preserve xmm6-xmm8 in the local area; they are restored in the epilogue.
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movd xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
; xmm8 = 0; the loop copies it into xmm1 ahead of sqrtsd.
xorps xmm8, xmm8
|
||||
; rax = 1023 << 52. Only the low 12 bits of rax survive the shift, so
; writing just ax is enough. The loop in this file rebuilds the same
; constant in rdi and never reads xmm7.
mov ax, 1023
|
||||
shl rax, 52
|
||||
movd xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movd xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
; Main loop, one iteration per pass, ebp counts down from 524288.
; Loop-carried state: r8/r11 (combined into the AES round key xmm6),
; r10d (current offset), xmm3/xmm4 (two previous 16-byte values),
; r15 (previous division result), rdi (previous square-root result).
ALIGN(64)
|
||||
cnv2_main_loop_bulldozer:
|
||||
; xmm5 = 16 bytes at the current offset; xmm6 = { r8, r11 }.
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movd xmm6, r8
|
||||
pinsrq xmm6, r11, 1
|
||||
; rdx keeps the address of the current chunk for the write-back below.
lea rdx, QWORD PTR [r10+rbx]
|
||||
; r9 = 2 * previous sqrt result (start of the divisor); rdi = sqrt << 32.
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
; ecx/eax/r10d = offset ^ 16 / ^ 32 / ^ 48: the other three 16-byte chunks
; of the same 64-byte group.
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
; One AES encryption round on the loaded chunk with xmm6 as round key.
aesenc xmm5, xmm6
|
||||
; Load the three sibling chunks, add xmm3 / xmm6 / xmm4 (64-bit lanes) and
; store them back rotated: [^16] <- [^48]+xmm4, [^32] <- [^16]+xmm3,
; [^48] <- [^32]+xmm6.
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
; xmm1 = 0 (xmm8 was zeroed in the prologue) before it becomes the sqrtsd
; destination. rsi = r15 ^ (previous sqrt << 32).
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
|
||||
; rdi = 1023 << 52, added to the sqrt input further down.
mov edi, 1023
|
||||
shl rdi, 52
|
||||
|
||||
; r14 = low qword of the AES result, rax = high qword (the dividend).
movd r14, xmm5
|
||||
pextrq rax, xmm5, 1
|
||||
|
||||
; Write (AES result ^ xmm3) back to the chunk that was read, then derive
; the next offset from r14 and fold the qword found there into rsi.
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
; r12 = address of the second chunk, r13 = its upper qword (pre-update).
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
; Divisor: (2*sqrt + low32(r14)) | 0x80000001, a 32-bit value (the 32-bit
; writes zero-extend r9), so it is odd, non-zero and has bit 31 set.
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
; Unsigned 64-bit divide of rax (rdx = 0) by r9.
div r9
|
||||
; r15 = (remainder << 32) + low 32 bits of the quotient.
mov eax, eax
|
||||
shl rdx, 32
|
||||
lea r15, [rax+rdx]
|
||||
; sqrt input = r14 + r15; (input >> 12) + (1023 << 52) is reinterpreted as
; a double and fed to sqrtsd.
lea rax, [r14+r15]
|
||||
shr rax, 12
|
||||
add rax, rdi
|
||||
movd xmm0, rax
|
||||
sqrtsd xmm1, xmm0
|
||||
movd rdi, xmm1
|
||||
; If the low 19 bits of the result bit pattern are all zero, take the
; out-of-line correction path; otherwise the result is simply bits >> 19.
test rdi, 524287
|
||||
je sqrt_fixup_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_bulldozer_ret:
|
||||
; 64x64 -> 128-bit unsigned multiply: rdx:rax = rsi * r14.
; xmm0 = { high half, low half }.
mov rax, rsi
|
||||
mul r14
|
||||
movd xmm1, rax
|
||||
movd xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
; Second shuffle, on the 64-byte group of the second offset:
; r9d/ecx/r10d = offset ^ 16 / ^ 32 / ^ 48.
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
; The product halves are XORed with the [^32] chunk before being added to
; r8/r11; the [^16] chunk is XORed with the product before the shuffle.
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
; Rotate the history (xmm4 <- xmm3, xmm3 <- xmm5 below), accumulate the
; product into r8/r11, store them at the second chunk, then XOR them with
; the values read from that chunk earlier (rsi, r13) to form the next key
; and the next offset.
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_bulldozer
|
||||
|
||||
; Epilogue: undo the prologue. Restore the MXCSR saved at [rsp], the three
; saved XMM registers and the GPRs, then jump over the out-of-line fixup
; code to the end label (the including file provides the ret).
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
; r11 = rsp once the 64-byte local area is released. The five pushed
; registers occupy [r11..r11+39], the return address is at r11+40, so
; r11+56/+64/+72 are the entry-time [rsp+16]/[rsp+24]/[rsp+32] slots.
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
; [r11-48] is the same location as [rsp+16], where xmm8 was saved.
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_bulldozer_endp
|
||||
|
||||
; Out-of-line correction of the square-root result, entered from the main
; loop when the low 19 bits of the sqrtsd bit pattern (in rdi) are zero.
; Leaves the corrected result in rdi and jumps back into the loop.
; Clobbers rax, rcx, rdx, r9.
sqrt_fixup_bulldozer:
|
||||
; r9 = low qword of the AES result + r15, i.e. the sqrt input again
; (same value as r14 + r15 computed in the loop).
movd r9, xmm5
|
||||
add r9, r15
|
||||
dec rdi
|
||||
; edx = 0xFFFFFC02 (zero-extended); after the shift rdx = -(1022 << 32).
; NOTE(review): this appears to cancel the exponent bits still present in
; the shifted bit patterns - confirm against the reference integer sqrt.
mov edx, -1022
|
||||
shl rdx, 32
|
||||
; rdi = (bits - 1) >> 19, rax = (bits - 1) >> 20.
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
; rcx = (rdi - rax + rdx + 1) * (rax + rdx)
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
; The subtraction sets CF when rcx < r9 (unsigned); adc adds that carry
; to rdi, so the result is bumped by one exactly in that case.
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_bulldozer_endp:
|
||||
188
src/crypto/cn/asm/win64/cn2/cnv2_main_loop_ivybridge.inc
Normal file
188
src/crypto/cn/asm/win64/cn2/cnv2_main_loop_ivybridge.inc
Normal file
@@ -0,0 +1,188 @@
|
||||
; Entry sequence of the single-hash cnv2 main loop, Ivy Bridge variant.
; This fragment is INCLUDEd inside a PROC by cn_main_loop.asm, which supplies
; the final ret. rcx arrives pointing at a slot that holds the context
; pointer; the first instruction dereferences it.
mov rcx, [rcx]
|
||||
|
||||
; rbx goes into the stack slot at entry rsp+24 (caller-allocated home area
; under the Win64 convention); the other callee-saved GPRs are pushed and
; 80 bytes of locals are reserved.
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 80
|
||||
|
||||
; Save the caller MXCSR at [rsp] and load 24448 (0x5F80): the default
; 0x1F80 mask bits plus 0x4000, i.e. rounding control = round up.
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
; Build the loop state from the context (base kept in r9 because rcx is
; reused below):
;   xmm4 = { [48]^[16], [56]^[24] }      xmm5 = { [80]^[64], [88]^[72] }
;   r8   = [32]^[0]     r11 = [40]^[8]   r15 = [96]   xmm3 = [104]
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
; esi = 524288 (0x80000): iteration counter (dec rsi at the loop tail).
mov esi, 524288
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
; r13d = 0x80000001, kept in a register for the divisor OR in the loop.
mov r13d, -2147483647
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movd xmm4, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
; rbx = pointer stored at offset 224; every [x+rbx] access in the loop uses
; it as the base. NOTE(review): presumably the scratchpad - confirm against
; the context struct definition.
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movd xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movd xmm3, QWORD PTR [r9+104]
|
||||
; Preserve xmm6-xmm8 in the local area; they are restored in the epilogue.
movaps XMMWORD PTR [rsp+64], xmm6
|
||||
movaps XMMWORD PTR [rsp+48], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
; r10d = first offset: low bits of r8 masked with 2097136 (0x1FFFF0), i.e.
; a 16-byte-aligned offset below 2 MiB.
and r10d, 2097136
|
||||
movd xmm5, rax
|
||||
|
||||
; Zero the qword at [rsp+16]; the loop uses [rsp+16] as the psubq memory
; operand after sqrtsd. Only this low qword is initialised.
xor eax, eax
|
||||
mov QWORD PTR [rsp+16], rax
|
||||
|
||||
; xmm8 = 1023 << 52. rax is zero here, and only the low 12 bits survive
; the shift anyway.
mov ax, 1023
|
||||
shl rax, 52
|
||||
movd xmm8, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movd xmm0, rcx
|
||||
punpcklqdq xmm5, xmm0
|
||||
; Preload the first chunk; later iterations load xmm6 at the loop tail.
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||
|
||||
; Main loop, one iteration per pass, rsi counts down from 524288.
; Loop-carried state: r8/r11 (combined into the AES round key xmm7),
; r10d (current offset), xmm6 (chunk already loaded from that offset),
; xmm4/xmm5 (two previous 16-byte values), r15 (previous division result),
; xmm3 low qword (previous square-root result).
ALIGN(64)
|
||||
main_loop_ivybridge:
|
||||
; rdx keeps the address of the current chunk for the write-back below.
lea rdx, QWORD PTR [r10+rbx]
|
||||
; ecx/eax/r10d = offset ^ 16 / ^ 32 / ^ 48: the other three 16-byte chunks
; of the same 64-byte group.
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
mov rdi, r15
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
; xmm7 = { r8, r11 }; one AES encryption round on xmm6 with that key.
movd xmm0, r11
|
||||
movd xmm7, r8
|
||||
punpcklqdq xmm7, xmm0
|
||||
aesenc xmm6, xmm7
|
||||
; rbp = low qword of the AES result; r9d = second offset derived from it.
movd rbp, xmm6
|
||||
mov r9, rbp
|
||||
and r9d, 2097136
|
||||
; Load the three sibling chunks, add xmm4 / xmm7 / xmm5 (64-bit lanes) and
; store them back rotated: [^16] <- [^48]+xmm5, [^32] <- [^16]+xmm4,
; [^48] <- [^32]+xmm7.
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm1, xmm7
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
; r10d = second offset ^ 32, reused by the second shuffle below.
mov r10, r9
|
||||
xor r10d, 32
|
||||
; rcx = previous sqrt result; rdi = r15 ^ (sqrt << 32).
movd rcx, xmm3
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
xor rdi, rax
|
||||
; Write (AES result ^ xmm4) back to the chunk that was read, then fold the
; qword at the second offset into rdi.
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
xor rdi, QWORD PTR [r9+rbx]
|
||||
; r14 = address of the second chunk, r12 = its upper qword (pre-update).
lea r14, QWORD PTR [r9+rbx]
|
||||
mov r12, QWORD PTR [r14+8]
|
||||
xor edx, edx
|
||||
; Divisor: (2*sqrt + low32(rbp)) | 0x80000001 (r13d), a 32-bit value that
; is odd, non-zero and has bit 31 set.
lea r9d, DWORD PTR [ecx+ecx]
|
||||
add r9d, ebp
|
||||
; rax = high qword of the AES result (the dividend); rdx is zero.
movdqa xmm0, xmm6
|
||||
psrldq xmm0, 8
|
||||
or r9d, r13d
|
||||
movd rax, xmm0
|
||||
div r9
|
||||
xorps xmm3, xmm3
|
||||
; rdx = (remainder << 32) + low 32 bits of the quotient; also kept in r15.
mov eax, eax
|
||||
shl rdx, 32
|
||||
add rdx, rax
|
||||
; r9 = sqrt input = division result + rbp. (input >> 12) + (1023 << 52)
; is reinterpreted as a double and fed to sqrtsd.
lea r9, QWORD PTR [rdx+rbp]
|
||||
mov r15, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movd xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm3, xmm0
|
||||
; The low qword at [rsp+16] was zeroed in the prologue, so this leaves the
; low qword of xmm3 unchanged; only that low qword is consumed afterwards.
psubq xmm3, XMMWORD PTR [rsp+16]
|
||||
movd rdx, xmm3
|
||||
; If the low 19 bits of the result bit pattern are all zero, take the
; out-of-line correction path; otherwise the result is simply bits >> 19.
test edx, 524287
|
||||
je sqrt_fixup_ivybridge
|
||||
psrlq xmm3, 19
|
||||
sqrt_fixup_ivybridge_ret:
|
||||
|
||||
; ecx = second offset ^ 32. 64x64 -> 128-bit multiply: rdx:rax = rdi * rbp.
mov ecx, r10d
|
||||
mov rax, rdi
|
||||
mul rbp
|
||||
; xmm2 collects { high half, low half } of the raw product, while the
; halves XORed with the [^32] chunk are accumulated into r8 / r11 and
; stored at the second chunk (r14).
movd xmm2, rdx
|
||||
xor rdx, [rcx+rbx]
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r14], r8
|
||||
; Next key low half and next offset (edi).
xor r8, rdi
|
||||
mov edi, r8d
|
||||
and edi, 2097136
|
||||
movd xmm0, rax
|
||||
xor rax, [rcx+rbx+8]
|
||||
add r11, rax
|
||||
mov QWORD PTR [r14+8], r11
|
||||
punpcklqdq xmm2, xmm0
|
||||
|
||||
; r10d holds offset ^ 32, so r9d = offset ^ 16 and r10d = offset ^ 48 after
; these XORs; ecx is still offset ^ 32.
mov r9d, r10d
|
||||
xor r9d, 48
|
||||
xor r10d, 16
|
||||
; Second shuffle: [^16] chunk is XORed with the product first, then
; [^16] <- [^48]+xmm5, [^32] <- ([^16]^product)+xmm4, [^48] <- [^32]+xmm7.
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm0, xmm5
|
||||
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||
paddq xmm2, xmm4
|
||||
paddq xmm1, xmm7
|
||||
; Rotate the history: xmm5 <- xmm4, xmm4 <- AES result.
movdqa xmm5, xmm4
|
||||
movdqu XMMWORD PTR [r9+rbx], xmm0
|
||||
movdqa xmm4, xmm6
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
; Load the chunk for the next iteration and finish the key update by
; XORing r11 with the upper qword read from the second chunk earlier.
movdqu xmm6, [rdi+rbx]
|
||||
mov r10d, edi
|
||||
xor r11, r12
|
||||
dec rsi
|
||||
jne main_loop_ivybridge
|
||||
|
||||
; Epilogue: undo the prologue. Restore the MXCSR saved at [rsp], then the
; saved registers, and jump over the out-of-line fixup code to the end
; label (the including file provides the ret).
ldmxcsr DWORD PTR [rsp]
|
||||
; 160 = 80 bytes of locals + 7 pushes * 8 + 24: the entry-time [rsp+24]
; slot where rbx was stored.
mov rbx, QWORD PTR [rsp+160]
|
||||
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
add rsp, 80
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
jmp cnv2_main_loop_ivybridge_endp
|
||||
|
||||
; Out-of-line correction of the square-root result, entered from the main
; loop when the low 19 bits of the sqrtsd bit pattern (in rdx) are zero.
; On entry r9 holds the sqrt input. Leaves the corrected result in the low
; qword of xmm3 and jumps back into the loop. Clobbers rax, rcx, rdx and
; uses r13 as scratch (its loop constant is reloaded below).
sqrt_fixup_ivybridge:
|
||||
dec rdx
|
||||
; r13d = 0xFFFFFC02 (zero-extended); after the shift r13 = -(1022 << 32).
; NOTE(review): this appears to cancel the exponent bits still present in
; the shifted bit patterns - confirm against the reference integer sqrt.
mov r13d, -1022
|
||||
shl r13, 32
|
||||
; rdx = (bits - 1) >> 19, rax = (bits - 1) >> 20.
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
add rax, r13
|
||||
; not r13 = (1022 << 32) - 1, so the subtraction below computes
; rcx = rcx - (1022 << 32) + 1.
not r13
|
||||
sub rcx, r13
|
||||
; Restore the 0x80000001 constant the main loop expects in r13d.
mov r13d, -2147483647
|
||||
imul rcx, rax
|
||||
; The subtraction sets CF when rcx < r9 (unsigned); adc adds that carry
; to rdx, so the result is bumped by one exactly in that case.
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movd xmm3, rdx
|
||||
jmp sqrt_fixup_ivybridge_ret
|
||||
|
||||
cnv2_main_loop_ivybridge_endp:
|
||||
181
src/crypto/cn/asm/win64/cn2/cnv2_main_loop_ryzen.inc
Normal file
181
src/crypto/cn/asm/win64/cn2/cnv2_main_loop_ryzen.inc
Normal file
@@ -0,0 +1,181 @@
|
||||
; Entry sequence of the single-hash cnv2 main loop, Ryzen variant.
; This fragment is INCLUDEd inside a PROC by cn_main_loop.asm, which supplies
; the final ret. rcx arrives pointing at a slot that holds the context
; pointer; the first instruction dereferences it.
mov rcx, [rcx]
|
||||
|
||||
; Spill rbx/rbp/rsi into the stack slots just above the return address
; (the caller-allocated home area of the Win64 calling convention), then
; push the remaining callee-saved GPRs and reserve 64 bytes of locals.
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
; Save the caller MXCSR at [rsp] and load 24448 (0x5F80): the default
; 0x1F80 mask bits plus 0x4000, i.e. rounding control = round up.
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
; Build the loop state from the context (base kept in r9 because rcx is
; reused below):
;   xmm3 = { [48]^[16], [56]^[24] }      xmm4 = { [80]^[64], [88]^[72] }
;   r8   = [32]^[0]     r11 = [40]^[8]   r15 = [96]   rdi = [104]
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
; ebp = 524288 (0x80000): iteration counter, decremented at the loop tail.
mov ebp, 524288
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movd xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
; rbx = pointer stored at offset 224; every [x+rbx] access in the loop uses
; it as the base. NOTE(review): presumably the scratchpad - confirm against
; the context struct definition.
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movd xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
; r10d = first offset: low bits of r8 masked with 2097136 (0x1FFFF0), i.e.
; a 16-byte-aligned offset below 2 MiB.
and r10d, 2097136
|
||||
; Preserve xmm6-xmm8 in the local area; they are restored in the epilogue.
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movd xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
; xmm8 = 0; the loop copies it into xmm1.
xorps xmm8, xmm8
|
||||
; xmm7 = 1023 << 52, added to the sqrt input in the loop. Only the low
; 12 bits of rax survive the shift, so writing just ax is enough.
mov ax, 1023
|
||||
shl rax, 52
|
||||
movd xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movd xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
; Main loop, one iteration per pass, ebp counts down from 524288.
; Loop-carried state: r8/r11 (combined into the AES round key xmm6),
; r10d (current offset), xmm3/xmm4 (two previous 16-byte values),
; r15 (previous division result), rdi (previous square-root result).
ALIGN(64)
|
||||
main_loop_ryzen:
|
||||
; xmm5 = 16 bytes at the current offset; xmm6 = { r8, r11 }.
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movd xmm0, r11
|
||||
movd xmm6, r8
|
||||
punpcklqdq xmm6, xmm0
|
||||
; rdx keeps the address of the current chunk for the write-back below.
lea rdx, QWORD PTR [r10+rbx]
|
||||
; r9 = 2 * previous sqrt result (start of the divisor); rdi = sqrt << 32.
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
; ecx/eax/r10d = offset ^ 16 / ^ 32 / ^ 48: the other three 16-byte chunks
; of the same 64-byte group.
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
; One AES encryption round on the loaded chunk with xmm6 as round key.
aesenc xmm5, xmm6
|
||||
; Load the three sibling chunks, add xmm3 / xmm6 / xmm4 (64-bit lanes) and
; store them back rotated: [^16] <- [^48]+xmm4, [^32] <- [^16]+xmm3,
; [^48] <- [^32]+xmm6.
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
; rsi = r15 ^ (previous sqrt << 32); r14 = low qword of the AES result.
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
movd r14, xmm5
|
||||
; Write (AES result ^ xmm3) back to the chunk that was read, then derive
; the next offset from r14 and fold the qword found there into rsi.
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
; r12 = address of the second chunk, r13 = its upper qword (pre-update).
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
; Divisor: (2*sqrt + low32(r14)) | 0x80000001, a 32-bit value (the 32-bit
; writes zero-extend r9), so it is odd, non-zero and has bit 31 set.
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
; rax = high qword of the AES result (the dividend).
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movd rax, xmm0
|
||||
|
||||
div r9
|
||||
; Interleave the low dwords: low qword of xmm0 becomes
; (low32(remainder) << 32) | low32(quotient); that is the new r15.
movd xmm0, rax
|
||||
movd xmm1, rdx
|
||||
punpckldq xmm0, xmm1
|
||||
movd r15, xmm0
|
||||
; sqrt input = division result + AES result (low qword is what matters);
; a copy is kept in xmm2 for the fixup path.
paddq xmm0, xmm5
|
||||
movdqa xmm2, xmm0
|
||||
; (input >> 12) + (1023 << 52) is reinterpreted as a double for sqrtsd.
psrlq xmm0, 12
|
||||
paddq xmm0, xmm7
|
||||
sqrtsd xmm1, xmm0
|
||||
movd rdi, xmm1
|
||||
; If the low 19 bits of the result bit pattern are all zero, take the
; out-of-line correction path; otherwise the result is simply bits >> 19.
test rdi, 524287
|
||||
je sqrt_fixup_ryzen
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_ryzen_ret:
|
||||
; 64x64 -> 128-bit unsigned multiply: rdx:rax = rsi * r14.
; xmm0 = { high half, low half }.
mov rax, rsi
|
||||
mul r14
|
||||
movd xmm1, rax
|
||||
movd xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
; Second shuffle, on the 64-byte group of the second offset:
; r9d/ecx/r10d = offset ^ 16 / ^ 32 / ^ 48.
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
; The product halves are XORed with the [^32] chunk before being added to
; r8/r11; the [^16] chunk is XORed with the product before the shuffle.
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
; Rotate the history (xmm4 <- xmm3, xmm3 <- xmm5 below), accumulate the
; product into r8/r11, store them at the second chunk, then XOR them with
; the values read from that chunk earlier (rsi, r13) to form the next key
; and the next offset.
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne main_loop_ryzen
|
||||
|
||||
; Epilogue: undo the prologue. Restore the MXCSR saved at [rsp], the three
; saved XMM registers and the GPRs, then jump over the out-of-line fixup
; code to the end label (the including file provides the ret).
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
; r11 = rsp once the 64-byte local area is released. The five pushed
; registers occupy [r11..r11+39], the return address is at r11+40, so
; r11+56/+64/+72 are the entry-time [rsp+16]/[rsp+24]/[rsp+32] slots.
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
; [r11-48] is the same location as [rsp+16], where xmm8 was saved.
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_ryzen_endp
|
||||
|
||||
; Out-of-line correction of the square-root result, entered from the main
; loop when the low 19 bits of the sqrtsd bit pattern (in rdi) are zero.
; Leaves the corrected result in rdi and jumps back into the loop.
; Clobbers rax, rcx, rdx, r9.
sqrt_fixup_ryzen:
|
||||
; r9 = sqrt input, saved in xmm2 by the main loop before the shift.
movd r9, xmm2
|
||||
dec rdi
|
||||
; edx = 0xFFFFFC02 (zero-extended); after the shift rdx = -(1022 << 32).
; NOTE(review): this appears to cancel the exponent bits still present in
; the shifted bit patterns - confirm against the reference integer sqrt.
mov edx, -1022
|
||||
shl rdx, 32
|
||||
; rdi = (bits - 1) >> 19, rax = (bits - 1) >> 20.
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
; rcx = (rdi - rax + rdx + 1) * (rax + rdx)
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
; The subtraction sets CF when rcx < r9 (unsigned); adc adds that carry
; to rdi, so the result is bumped by one exactly in that case.
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_ryzen_ret
|
||||
|
||||
cnv2_main_loop_ryzen_endp:
|
||||
413
src/crypto/cn/asm/win64/cn2/cnv2_rwz_double_main_loop.inc
Normal file
413
src/crypto/cn/asm/win64/cn2/cnv2_rwz_double_main_loop.inc
Normal file
@@ -0,0 +1,413 @@
|
||||
mov rdx, [rcx+8]
|
||||
mov rcx, [rcx]
|
||||
|
||||
mov rax, rsp
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 184
|
||||
|
||||
stmxcsr DWORD PTR [rsp+272]
|
||||
mov DWORD PTR [rsp+276], 24448
|
||||
ldmxcsr DWORD PTR [rsp+276]
|
||||
|
||||
mov r13, QWORD PTR [rcx+224]
|
||||
mov r9, rdx
|
||||
mov r10, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r10, QWORD PTR [rcx]
|
||||
mov r14d, 393216
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rsi, QWORD PTR [rdx+224]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov rdi, QWORD PTR [r9+32]
|
||||
xor rdi, QWORD PTR [r9]
|
||||
mov rbp, QWORD PTR [r9+40]
|
||||
xor rbp, QWORD PTR [r9+8]
|
||||
movd xmm0, rdx
|
||||
movaps XMMWORD PTR [rax-88], xmm6
|
||||
movaps XMMWORD PTR [rax-104], xmm7
|
||||
movaps XMMWORD PTR [rax-120], xmm8
|
||||
movaps XMMWORD PTR [rsp+112], xmm9
|
||||
movaps XMMWORD PTR [rsp+96], xmm10
|
||||
movaps XMMWORD PTR [rsp+80], xmm11
|
||||
movaps XMMWORD PTR [rsp+64], xmm12
|
||||
movaps XMMWORD PTR [rsp+48], xmm13
|
||||
movaps XMMWORD PTR [rsp+32], xmm14
|
||||
movaps XMMWORD PTR [rsp+16], xmm15
|
||||
mov rdx, r10
|
||||
movd xmm4, QWORD PTR [r8+96]
|
||||
and edx, 2097136
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xorps xmm13, xmm13
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
movd xmm5, QWORD PTR [r8+104]
|
||||
movd xmm7, rax
|
||||
|
||||
mov eax, 1
|
||||
shl rax, 52
|
||||
movd xmm14, rax
|
||||
punpcklqdq xmm14, xmm14
|
||||
|
||||
mov eax, 1023
|
||||
shl rax, 52
|
||||
movd xmm12, rax
|
||||
punpcklqdq xmm12, xmm12
|
||||
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
punpcklqdq xmm7, xmm0
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [r9+56]
|
||||
xor rcx, QWORD PTR [r9+24]
|
||||
movd xmm3, rax
|
||||
mov rax, QWORD PTR [r9+48]
|
||||
xor rax, QWORD PTR [r9+16]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movd xmm0, rcx
|
||||
mov QWORD PTR [rsp], r13
|
||||
mov rcx, QWORD PTR [r9+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movd xmm6, rax
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
movd xmm0, rcx
|
||||
mov QWORD PTR [rsp+256], r10
|
||||
mov rcx, rdi
|
||||
mov QWORD PTR [rsp+264], r11
|
||||
movd xmm8, rax
|
||||
and ecx, 2097136
|
||||
punpcklqdq xmm8, xmm0
|
||||
movd xmm0, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movd xmm0, QWORD PTR [r9+104]
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
movdqu xmm11, XMMWORD PTR [r8]
|
||||
punpcklqdq xmm5, xmm0
|
||||
lea r9, QWORD PTR [rdx+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r9]
|
||||
|
||||
ALIGN(64)
|
||||
rwz_main_loop_double:
|
||||
movdqu xmm9, xmm15
|
||||
mov eax, edx
|
||||
mov ebx, edx
|
||||
xor eax, 16
|
||||
xor ebx, 32
|
||||
xor edx, 48
|
||||
|
||||
movd xmm0, r11
|
||||
movd xmm2, r10
|
||||
punpcklqdq xmm2, xmm0
|
||||
aesenc xmm9, xmm2
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||
|
||||
movd r11, xmm9
|
||||
mov edx, r11d
|
||||
and edx, 2097136
|
||||
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9], xmm0
|
||||
|
||||
lea rbx, QWORD PTR [rdx+r13]
|
||||
mov r10, QWORD PTR [rdx+r13]
|
||||
|
||||
movdqu xmm10, xmm11
|
||||
movd xmm0, rbp
|
||||
movd xmm11, rdi
|
||||
punpcklqdq xmm11, xmm0
|
||||
aesenc xmm10, xmm11
|
||||
|
||||
mov eax, ecx
|
||||
mov r12d, ecx
|
||||
xor eax, 16
|
||||
xor r12d, 32
|
||||
xor ecx, 48
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||
paddq xmm0, xmm6
|
||||
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||
paddq xmm1, xmm11
|
||||
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
paddq xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
|
||||
movd rcx, xmm10
|
||||
and ecx, 2097136
|
||||
|
||||
movdqa xmm0, xmm10
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [r8], xmm0
|
||||
mov r12, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov r9, QWORD PTR [rbx+8]
|
||||
|
||||
xor edx, 16
|
||||
mov r8d, edx
|
||||
mov r15d, edx
|
||||
|
||||
movd rdx, xmm5
|
||||
shl rdx, 32
|
||||
movd rax, xmm4
|
||||
xor rdx, rax
|
||||
xor r10, rdx
|
||||
mov rax, r10
|
||||
mul r11
|
||||
mov r11d, r8d
|
||||
xor r11d, 48
|
||||
movd xmm0, rdx
|
||||
xor rdx, [r11+r13]
|
||||
movd xmm1, rax
|
||||
xor rax, [r11+r13+8]
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||
paddq xmm0, xmm3
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [r8+r13], xmm0
|
||||
xor r8d, 32
|
||||
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||
paddq xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||
|
||||
mov r11, QWORD PTR [rsp+256]
|
||||
add r11, rdx
|
||||
mov rdx, QWORD PTR [rsp+264]
|
||||
add rdx, rax
|
||||
mov QWORD PTR [rbx], r11
|
||||
xor r11, r10
|
||||
mov QWORD PTR [rbx+8], rdx
|
||||
xor rdx, r9
|
||||
mov QWORD PTR [rsp+256], r11
|
||||
and r11d, 2097136
|
||||
mov QWORD PTR [rsp+264], rdx
|
||||
mov QWORD PTR [rsp+8], r11
|
||||
lea r15, QWORD PTR [r11+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||
lea r13, QWORD PTR [rsi+rcx]
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movaps xmm2, xmm13
|
||||
movd r10, xmm0
|
||||
psllq xmm5, 1
|
||||
shl r10, 32
|
||||
movdqa xmm0, xmm9
|
||||
psrldq xmm0, 8
|
||||
movdqa xmm1, xmm10
|
||||
movd r11, xmm0
|
||||
psrldq xmm1, 8
|
||||
movd r8, xmm1
|
||||
psrldq xmm4, 8
|
||||
movaps xmm0, xmm13
|
||||
movd rax, xmm4
|
||||
xor r10, rax
|
||||
movaps xmm1, xmm13
|
||||
xor r10, r12
|
||||
lea rax, QWORD PTR [r11+1]
|
||||
shr rax, 1
|
||||
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm3, xmm10
|
||||
paddq xmm5, xmm3
|
||||
movd rdx, xmm5
|
||||
psrldq xmm5, 8
|
||||
cvtsi2sd xmm2, rax
|
||||
or edx, -2147483647
|
||||
lea rax, QWORD PTR [r8+1]
|
||||
shr rax, 1
|
||||
movd r9, xmm5
|
||||
cvtsi2sd xmm0, rax
|
||||
or r9d, -2147483647
|
||||
cvtsi2sd xmm1, rdx
|
||||
unpcklpd xmm2, xmm0
|
||||
movaps xmm0, xmm13
|
||||
cvtsi2sd xmm0, r9
|
||||
unpcklpd xmm1, xmm0
|
||||
divpd xmm2, xmm1
|
||||
paddq xmm2, xmm14
|
||||
cvttsd2si rax, xmm2
|
||||
psrldq xmm2, 8
|
||||
mov rbx, rax
|
||||
imul rax, rdx
|
||||
sub r11, rax
|
||||
js rwz_div_fix_1
|
||||
rwz_div_fix_1_ret:
|
||||
|
||||
cvttsd2si rdx, xmm2
|
||||
mov rax, rdx
|
||||
imul rax, r9
|
||||
movd xmm2, r11d
|
||||
movd xmm4, ebx
|
||||
sub r8, rax
|
||||
js rwz_div_fix_2
|
||||
rwz_div_fix_2_ret:
|
||||
|
||||
movd xmm1, r8d
|
||||
movd xmm0, edx
|
||||
punpckldq xmm2, xmm1
|
||||
punpckldq xmm4, xmm0
|
||||
punpckldq xmm4, xmm2
|
||||
paddq xmm3, xmm4
|
||||
movdqa xmm0, xmm3
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm12
|
||||
sqrtpd xmm1, xmm0
|
||||
movd r9, xmm1
|
||||
movdqa xmm5, xmm1
|
||||
psrlq xmm5, 19
|
||||
test r9, 524287
|
||||
je rwz_sqrt_fix_1
|
||||
rwz_sqrt_fix_1_ret:
|
||||
|
||||
movd r9, xmm10
|
||||
psrldq xmm1, 8
|
||||
movd r8, xmm1
|
||||
test r8, 524287
|
||||
je rwz_sqrt_fix_2
|
||||
rwz_sqrt_fix_2_ret:
|
||||
|
||||
mov r12d, ecx
|
||||
mov r8d, ecx
|
||||
xor r12d, 16
|
||||
xor r8d, 32
|
||||
xor ecx, 48
|
||||
mov rax, r10
|
||||
mul r9
|
||||
movd xmm0, rax
|
||||
movd xmm3, rdx
|
||||
punpcklqdq xmm3, xmm0
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||
pxor xmm0, xmm3
|
||||
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||
xor rdx, [r8+rsi]
|
||||
xor rax, [r8+rsi+8]
|
||||
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||
paddq xmm3, xmm6
|
||||
paddq xmm1, xmm11
|
||||
paddq xmm0, xmm8
|
||||
movdqu XMMWORD PTR [r8+rsi], xmm3
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||
|
||||
add rdi, rdx
|
||||
mov QWORD PTR [r13], rdi
|
||||
xor rdi, r10
|
||||
mov ecx, edi
|
||||
and ecx, 2097136
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov rdx, QWORD PTR [r13+8]
|
||||
add rbp, rax
|
||||
mov QWORD PTR [r13+8], rbp
|
||||
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||
xor rbp, rdx
|
||||
mov r13, QWORD PTR [rsp]
|
||||
movdqa xmm3, xmm7
|
||||
mov rdx, QWORD PTR [rsp+8]
|
||||
movdqa xmm8, xmm6
|
||||
mov r10, QWORD PTR [rsp+256]
|
||||
movdqa xmm7, xmm9
|
||||
mov r11, QWORD PTR [rsp+264]
|
||||
movdqa xmm6, xmm10
|
||||
mov r9, r15
|
||||
dec r14d
|
||||
jne rwz_main_loop_double
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+272]
|
||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+184]
|
||||
movaps xmm6, XMMWORD PTR [r11-24]
|
||||
movaps xmm7, XMMWORD PTR [r11-40]
|
||||
movaps xmm8, XMMWORD PTR [r11-56]
|
||||
movaps xmm9, XMMWORD PTR [r11-72]
|
||||
movaps xmm10, XMMWORD PTR [r11-88]
|
||||
movaps xmm11, XMMWORD PTR [r11-104]
|
||||
movaps xmm12, XMMWORD PTR [r11-120]
|
||||
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp rwz_cnv2_double_mainloop_asm_endp
|
||||
|
||||
rwz_div_fix_1:
|
||||
dec rbx
|
||||
add r11, rdx
|
||||
jmp rwz_div_fix_1_ret
|
||||
|
||||
rwz_div_fix_2:
|
||||
dec rdx
|
||||
add r8, r9
|
||||
jmp rwz_div_fix_2_ret
|
||||
|
||||
rwz_sqrt_fix_1:
|
||||
movd r8, xmm3
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
dec r9
|
||||
mov r11d, -1022
|
||||
shl r11, 32
|
||||
mov rax, r9
|
||||
shr r9, 19
|
||||
shr rax, 20
|
||||
mov rdx, r9
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+r11+1]
|
||||
add rax, r11
|
||||
imul rdx, rax
|
||||
sub rdx, r8
|
||||
adc r9, 0
|
||||
movd xmm5, r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp rwz_sqrt_fix_1_ret
|
||||
|
||||
rwz_sqrt_fix_2:
|
||||
psrldq xmm3, 8
|
||||
movd r11, xmm3
|
||||
dec r8
|
||||
mov ebx, -1022
|
||||
shl rbx, 32
|
||||
mov rax, r8
|
||||
shr r8, 19
|
||||
shr rax, 20
|
||||
mov rdx, r8
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+rbx+1]
|
||||
add rax, rbx
|
||||
imul rdx, rax
|
||||
sub rdx, r11
|
||||
adc r8, 0
|
||||
movd xmm0, r8
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp rwz_sqrt_fix_2_ret
|
||||
|
||||
rwz_cnv2_double_mainloop_asm_endp:
|
||||
188
src/crypto/cn/asm/win64/cn2/cnv2_rwz_main_loop.inc
Normal file
188
src/crypto/cn/asm/win64/cn2/cnv2_rwz_main_loop.inc
Normal file
@@ -0,0 +1,188 @@
|
||||
mov rcx, [rcx]
|
||||
|
||||
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 80
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov esi, 393216
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
mov r13d, -2147483647
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movd xmm4, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movd xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movd xmm3, QWORD PTR [r9+104]
|
||||
movaps XMMWORD PTR [rsp+64], xmm6
|
||||
movaps XMMWORD PTR [rsp+48], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
and r10d, 2097136
|
||||
movd xmm5, rax
|
||||
|
||||
xor eax, eax
|
||||
mov QWORD PTR [rsp+16], rax
|
||||
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movd xmm8, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movd xmm0, rcx
|
||||
punpcklqdq xmm5, xmm0
|
||||
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||
|
||||
ALIGN(64)
|
||||
rwz_main_loop:
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
mov rdi, r15
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
movd xmm0, r11
|
||||
movd xmm7, r8
|
||||
punpcklqdq xmm7, xmm0
|
||||
aesenc xmm6, xmm7
|
||||
movd rbp, xmm6
|
||||
mov r9, rbp
|
||||
and r9d, 2097136
|
||||
movdqu xmm0, XMMWORD PTR [rcx+rbx]
|
||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqu xmm2, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm1, xmm7
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
mov r10, r9
|
||||
xor r10d, 32
|
||||
movd rcx, xmm3
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
xor rdi, rax
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
xor rdi, QWORD PTR [r9+rbx]
|
||||
lea r14, QWORD PTR [r9+rbx]
|
||||
mov r12, QWORD PTR [r14+8]
|
||||
xor edx, edx
|
||||
lea r9d, DWORD PTR [ecx+ecx]
|
||||
add r9d, ebp
|
||||
movdqa xmm0, xmm6
|
||||
psrldq xmm0, 8
|
||||
or r9d, r13d
|
||||
movd rax, xmm0
|
||||
div r9
|
||||
xorps xmm3, xmm3
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rbp]
|
||||
mov r15, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movd xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm3, xmm0
|
||||
psubq xmm3, XMMWORD PTR [rsp+16]
|
||||
movd rdx, xmm3
|
||||
test edx, 524287
|
||||
je rwz_sqrt_fixup
|
||||
psrlq xmm3, 19
|
||||
rwz_sqrt_fixup_ret:
|
||||
|
||||
mov ecx, r10d
|
||||
mov rax, rdi
|
||||
mul rbp
|
||||
movd xmm2, rdx
|
||||
xor rdx, [rcx+rbx]
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rdi
|
||||
mov edi, r8d
|
||||
and edi, 2097136
|
||||
movd xmm0, rax
|
||||
xor rax, [rcx+rbx+8]
|
||||
add r11, rax
|
||||
mov QWORD PTR [r14+8], r11
|
||||
punpcklqdq xmm2, xmm0
|
||||
|
||||
mov r9d, r10d
|
||||
xor r9d, 48
|
||||
xor r10d, 16
|
||||
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm0, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||
paddq xmm2, xmm5
|
||||
paddq xmm1, xmm7
|
||||
movdqa xmm5, xmm4
|
||||
movdqu XMMWORD PTR [r9+rbx], xmm2
|
||||
movdqa xmm4, xmm6
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
movdqu xmm6, [rdi+rbx]
|
||||
mov r10d, edi
|
||||
xor r11, r12
|
||||
dec rsi
|
||||
jne rwz_main_loop
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
mov rbx, QWORD PTR [rsp+160]
|
||||
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
add rsp, 80
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
jmp cnv2_rwz_main_loop_endp
|
||||
|
||||
rwz_sqrt_fixup:
|
||||
dec rdx
|
||||
mov r13d, -1022
|
||||
shl r13, 32
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
add rax, r13
|
||||
not r13
|
||||
sub rcx, r13
|
||||
mov r13d, -2147483647
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movd xmm3, rdx
|
||||
jmp rwz_sqrt_fixup_ret
|
||||
|
||||
cnv2_rwz_main_loop_endp:
|
||||
45
src/crypto/cn/asm/win64/cn_main_loop.S
Normal file
45
src/crypto/cn/asm/win64/cn_main_loop.S
Normal file
@@ -0,0 +1,45 @@
|
||||
/*
 * GAS (Intel syntax, preprocessed) wrapper that exports the six cnv2
 * main-loop routines. Each routine is: a 64-byte alignment, the public
 * label, the loop body pulled in from an .inc file, and a ret.
 *
 * ALIGN(x) ignores its argument and always expands to .align 64; it exists
 * so the included bodies can write ALIGN(64).
 */
#define ALIGN(x) .align 64
|
||||
.intel_syntax noprefix
|
||||
.section .text
|
||||
.global cnv2_mainloop_ivybridge_asm
|
||||
.global cnv2_mainloop_ryzen_asm
|
||||
.global cnv2_mainloop_bulldozer_asm
|
||||
.global cnv2_double_mainloop_sandybridge_asm
|
||||
.global cnv2_rwz_mainloop_asm
|
||||
.global cnv2_rwz_double_mainloop_asm
|
||||
|
||||
/*
 * The mov after each ret is not reached by fall-through; its immediate
 * 3735929054 is 0xDEADC0DE. NOTE(review): presumably an end-of-routine
 * marker for code that scans or copies these functions - confirm.
 *
 * The first four bodies are included from the parent directory (../cn2/),
 * the two rwz bodies from the cn2/ directory next to this file.
 */
ALIGN(64)
|
||||
cnv2_mainloop_ivybridge_asm:
|
||||
#include "../cn2/cnv2_main_loop_ivybridge.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_ryzen_asm:
|
||||
#include "../cn2/cnv2_main_loop_ryzen.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_bulldozer_asm:
|
||||
#include "../cn2/cnv2_main_loop_bulldozer.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_double_mainloop_sandybridge_asm:
|
||||
#include "../cn2/cnv2_double_main_loop_sandybridge.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_rwz_mainloop_asm:
|
||||
#include "cn2/cnv2_rwz_main_loop.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_rwz_double_mainloop_asm:
|
||||
#include "cn2/cnv2_rwz_double_main_loop.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
52
src/crypto/cn/asm/win64/cn_main_loop.asm
Normal file
52
src/crypto/cn/asm/win64/cn_main_loop.asm
Normal file
@@ -0,0 +1,52 @@
|
||||
; MASM wrapper that exports the six cnv2 main-loop routines from a
; dedicated page-aligned, read/execute segment. Each PROC is a 64-byte
; alignment, the loop body INCLUDEd from cn2/, and a ret.
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
|
||||
PUBLIC cnv2_mainloop_ivybridge_asm
|
||||
PUBLIC cnv2_mainloop_ryzen_asm
|
||||
PUBLIC cnv2_mainloop_bulldozer_asm
|
||||
PUBLIC cnv2_double_mainloop_sandybridge_asm
|
||||
PUBLIC cnv2_rwz_mainloop_asm
|
||||
PUBLIC cnv2_rwz_double_mainloop_asm
|
||||
|
||||
; The mov after each ret is not reached by fall-through; its immediate
; 3735929054 is 0xDEADC0DE. NOTE(review): presumably an end-of-routine
; marker for code that scans or copies these functions - confirm.
ALIGN 64
|
||||
cnv2_mainloop_ivybridge_asm PROC
|
||||
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_mainloop_ivybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_ryzen_asm PROC
|
||||
INCLUDE cn2/cnv2_main_loop_ryzen.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_mainloop_ryzen_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_bulldozer_asm PROC
|
||||
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_mainloop_bulldozer_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_double_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_double_mainloop_sandybridge_asm ENDP
|
||||
|
||||
; The two rwz entries spell the alignment as ALIGN(64) rather than
; ALIGN 64; the operand is the same value, only the spelling differs.
ALIGN(64)
|
||||
cnv2_rwz_mainloop_asm PROC
|
||||
INCLUDE cn2/cnv2_rwz_main_loop.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_rwz_mainloop_asm ENDP
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_rwz_double_mainloop_asm PROC
|
||||
INCLUDE cn2/cnv2_rwz_double_main_loop.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_rwz_double_mainloop_asm ENDP
|
||||
|
||||
_TEXT_CNV2_MAINLOOP ENDS
|
||||
END
|
||||
326
src/crypto/cn/c_blake256.c
Normal file
326
src/crypto/cn/c_blake256.c
Normal file
@@ -0,0 +1,326 @@
|
||||
/*
|
||||
* The blake256_* and blake224_* functions are largely copied from
|
||||
* blake256_light.c and blake224_light.c from the BLAKE website:
|
||||
*
|
||||
* http://131002.net/blake/
|
||||
*
|
||||
* The hmac_* functions implement HMAC-BLAKE-256 and HMAC-BLAKE-224.
|
||||
* HMAC is specified by RFC 2104.
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include "c_blake256.h"
|
||||
|
||||
/* Load a 32-bit word from four bytes at p, most significant byte first. */
#define U8TO32(p) \
    (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
     ((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])      ))

/*
 * Store the 32-bit word v into four bytes at p, most significant byte first.
 *
 * Wrapped in do { ... } while (0) so the macro expands to exactly one
 * statement: it stays correct as the body of an unbraced if/else or loop.
 * Call it with a trailing semicolon, like a function call.
 */
#define U32TO8(p, v) \
    do { \
        (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
        (p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      ); \
    } while (0)
|
||||
|
||||
const uint8_t sigma[][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15},
|
||||
{14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3},
|
||||
{11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4},
|
||||
{ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8},
|
||||
{ 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13},
|
||||
{ 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9},
|
||||
{12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11},
|
||||
{13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10},
|
||||
{ 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5},
|
||||
{10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0},
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15},
|
||||
{14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3},
|
||||
{11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4},
|
||||
{ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8}
|
||||
};
|
||||
|
||||
const uint32_t cst[16] = {
|
||||
0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344,
|
||||
0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89,
|
||||
0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C,
|
||||
0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917
|
||||
};
|
||||
|
||||
static const uint8_t padding[] = {
|
||||
0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
||||
};
|
||||
|
||||
|
||||
/*
 * Compression function shared by the BLAKE-256 and BLAKE-224 entry points.
 *
 * S     - hash state; S->h is updated in place. S->s and S->t are read,
 *         and S->t is only mixed in while S->nullt == 0.
 * block - 64 bytes of input, read as sixteen big-endian 32-bit words.
 *
 * Runs 14 rounds of the G function over a 16-word working state v[], then
 * folds v[] and the salt S->s back into S->h.
 */
void blake256_compress(state *S, const uint8_t *block) {
    uint32_t v[16], m[16], i;

/* Helper macros are local to this function and removed again at its end.
 * ROT rotates a 32-bit value right by n bits. */
#define ROT(x,n) (((x)<<(32-(n)))|((x)>>(n)))
#define G(a,b,c,d,e)                                          \
    v[a] += (m[sigma[i][(e)]] ^ cst[sigma[i][(e)+1]]) + v[b]; \
    v[d] = ROT(v[d] ^ v[a],16);                               \
    v[c] += v[d];                                             \
    v[b] = ROT(v[b] ^ v[c],12);                               \
    v[a] += (m[sigma[i][(e)+1]] ^ cst[sigma[i][(e)]])+v[b];   \
    v[d] = ROT(v[d] ^ v[a], 8);                               \
    v[c] += v[d];                                             \
    v[b] = ROT(v[b] ^ v[c], 7);

    /* message words and the first half of the working state */
    for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4);
    for (i = 0; i < 8;  ++i) v[i] = S->h[i];

    /* second half: salt and constants (same values as cst[0..7]) */
    v[ 8] = S->s[0] ^ 0x243F6A88;
    v[ 9] = S->s[1] ^ 0x85A308D3;
    v[10] = S->s[2] ^ 0x13198A2E;
    v[11] = S->s[3] ^ 0x03707344;
    v[12] = 0xA4093822;
    v[13] = 0x299F31D0;
    v[14] = 0x082EFA98;
    v[15] = 0xEC4E6C89;

    /* the bit counter is left out when nullt is set */
    if (S->nullt == 0) {
        v[12] ^= S->t[0];
        v[13] ^= S->t[0];
        v[14] ^= S->t[1];
        v[15] ^= S->t[1];
    }

    for (i = 0; i < 14; ++i) {
        /* first four G calls */
        G(0, 4,  8, 12,  0);
        G(1, 5,  9, 13,  2);
        G(2, 6, 10, 14,  4);
        G(3, 7, 11, 15,  6);
        /* second four G calls */
        G(3, 4,  9, 14, 14);
        G(2, 7,  8, 13, 12);
        G(0, 5, 10, 15,  8);
        G(1, 6, 11, 12, 10);
    }

    /* feed-forward: h[k] ^= v[k] ^ v[k + 8] ^ s[k % 4] */
    for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i];
    for (i = 0; i < 8;  ++i) S->h[i] ^= S->s[i % 4];

#undef G
#undef ROT
}
|
||||
|
||||
/*
 * Reset S for a new BLAKE-256 computation: load the initial chaining value
 * and clear the salt, the bit counter, the buffered-bit count and nullt.
 * S->buf itself is not touched.
 */
void blake256_init(state *S) {
    static const uint32_t initial_h[8] = {
        0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
        0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
    };
    int k;

    for (k = 0; k < 8; ++k) {
        S->h[k] = initial_h[k];
    }
    for (k = 0; k < 4; ++k) {
        S->s[k] = 0;
    }
    S->t[0] = 0;
    S->t[1] = 0;
    S->buflen = 0;
    S->nullt = 0;
}
|
||||
|
||||
/*
 * Reset S for a new BLAKE-224 computation. Same as blake256_init except
 * for the initial chaining value. S->buf itself is not touched.
 */
void blake224_init(state *S) {
    static const uint32_t initial_h[8] = {
        0xC1059ED8, 0x367CD507, 0x3070DD17, 0xF70E5939,
        0xFFC00B31, 0x68581511, 0x64F98FA7, 0xBEFA4FA4
    };
    int k;

    for (k = 0; k < 8; ++k) {
        S->h[k] = initial_h[k];
    }
    for (k = 0; k < 4; ++k) {
        S->s[k] = 0;
    }
    S->t[0] = 0;
    S->t[1] = 0;
    S->buflen = 0;
    S->nullt = 0;
}
|
||||
|
||||
// datalen = number of bits
|
||||
void blake256_update(state *S, const uint8_t *data, uint64_t datalen) {
|
||||
int left = S->buflen >> 3;
|
||||
int fill = 64 - left;
|
||||
|
||||
if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) {
|
||||
memcpy((void *) (S->buf + left), (void *) data, fill);
|
||||
S->t[0] += 512;
|
||||
if (S->t[0] == 0) S->t[1]++;
|
||||
blake256_compress(S, S->buf);
|
||||
data += fill;
|
||||
datalen -= (fill << 3);
|
||||
left = 0;
|
||||
}
|
||||
|
||||
while (datalen >= 512) {
|
||||
S->t[0] += 512;
|
||||
if (S->t[0] == 0) S->t[1]++;
|
||||
blake256_compress(S, data);
|
||||
data += 64;
|
||||
datalen -= 512;
|
||||
}
|
||||
|
||||
if (datalen > 0) {
|
||||
memcpy((void *) (S->buf + left), (void *) data, datalen >> 3);
|
||||
S->buflen = (left << 3) + (int) datalen;
|
||||
} else {
|
||||
S->buflen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * BLAKE-224 absorb step; forwards unchanged to blake256_update.
 * datalen is the input length in bits.
 */
void blake224_update(state *S, const uint8_t *data, uint64_t datalen) {
    blake256_update(S, data, datalen);
}
|
||||
|
||||
/*
 * Shared finalisation for BLAKE-256 and BLAKE-224.
 *
 * S      - state holding the absorbed message
 * digest - receives 32 bytes: S->h[0..7], each stored big-endian
 * pa     - byte appended when exactly one padding byte is needed
 * pb     - byte appended after the padding in all other cases
 *
 * The padding and length bytes go through blake256_update, so S->t[0] is
 * decremented around each of those calls by the number of bits fed in.
 */
void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) {
    uint8_t length_bytes[8];
    uint32_t bits_lo = S->t[0] + S->buflen;
    uint32_t bits_hi = S->t[1];
    int word;

    /* total message length in bits, as a big-endian 64-bit value */
    if (bits_lo < (unsigned) S->buflen) {
        bits_hi++;
    }
    U32TO8(length_bytes + 0, bits_hi);
    U32TO8(length_bytes + 4, bits_lo);

    if (S->buflen == 440) {
        /* exactly one padding byte fits */
        S->t[0] -= 8;
        blake256_update(S, &pa, 8);
    } else {
        if (S->buflen < 440) {
            /* padding fits in the current block */
            const int pad_bits = 440 - S->buflen;

            if (S->buflen == 0) {
                S->nullt = 1;
            }
            S->t[0] -= pad_bits;
            blake256_update(S, padding, pad_bits);
        } else {
            /* padding spills over: two compressions are needed */
            const int pad_bits = 512 - S->buflen;

            S->t[0] -= pad_bits;
            blake256_update(S, padding, pad_bits);
            S->t[0] -= 440;
            blake256_update(S, padding + 1, 440);
            S->nullt = 1;
        }
        blake256_update(S, &pb, 8);
        S->t[0] -= 8;
    }

    /* append the length field; this completes the final block */
    S->t[0] -= 64;
    blake256_update(S, length_bytes, 64);

    /* serialise the chaining value */
    for (word = 0; word < 8; ++word) {
        U32TO8(digest + 4 * word, S->h[word]);
    }
}
|
||||
|
||||
/*
 * Finish a BLAKE-256 computation and write the 32-byte digest.
 * Calls blake256_final_h with the padding bytes 0x81 / 0x01.
 */
void blake256_final(state *S, uint8_t *digest) {
    blake256_final_h(S, digest, 0x81, 0x01);
}
|
||||
|
||||
/*
 * Finish a BLAKE-224 computation.
 * Calls blake256_final_h with the padding bytes 0x80 / 0x00. Note that
 * blake256_final_h stores all eight state words, so `digest` must have room
 * for 32 bytes even though only the first 28 form the BLAKE-224 result.
 */
void blake224_final(state *S, uint8_t *digest) {
    blake256_final_h(S, digest, 0x80, 0x00);
}
|
||||
|
||||
/*
 * One-shot BLAKE-256.
 *
 * out   - receives the 32-byte digest
 * in    - message
 * inlen - message length in BYTES (converted to bits for blake256_update;
 *         NOTE(review): inlen * 8 wraps for inlen >= 2^61)
 */
void blake256_hash(uint8_t *out, const uint8_t *in, uint64_t inlen) {
    state S;
    blake256_init(&S);
    blake256_update(&S, in, inlen * 8);
    blake256_final(&S, out);
}
|
||||
|
||||
/*
 * One-shot BLAKE-224.
 *
 * out   - output buffer; blake224_final writes 32 bytes into it
 * in    - message
 * inlen - message length in BYTES (converted to bits for blake224_update;
 *         NOTE(review): inlen * 8 wraps for inlen >= 2^61)
 */
void blake224_hash(uint8_t *out, const uint8_t *in, uint64_t inlen) {
    state S;
    blake224_init(&S);
    blake224_update(&S, in, inlen * 8);
    blake224_final(&S, out);
}
|
||||
|
||||
// keylen = number of bytes
|
||||
void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
|
||||
const uint8_t *key = _key;
|
||||
uint8_t keyhash[32];
|
||||
uint8_t pad[64];
|
||||
uint64_t i;
|
||||
|
||||
if (keylen > 64) {
|
||||
blake256_hash(keyhash, key, keylen);
|
||||
key = keyhash;
|
||||
keylen = 32;
|
||||
}
|
||||
|
||||
blake256_init(&S->inner);
|
||||
memset(pad, 0x36, 64);
|
||||
for (i = 0; i < keylen; ++i) {
|
||||
pad[i] ^= key[i];
|
||||
}
|
||||
blake256_update(&S->inner, pad, 512);
|
||||
|
||||
blake256_init(&S->outer);
|
||||
memset(pad, 0x5c, 64);
|
||||
for (i = 0; i < keylen; ++i) {
|
||||
pad[i] ^= key[i];
|
||||
}
|
||||
blake256_update(&S->outer, pad, 512);
|
||||
|
||||
memset(keyhash, 0, 32);
|
||||
}
|
||||
|
||||
// keylen = number of bytes
|
||||
void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
|
||||
const uint8_t *key = _key;
|
||||
uint8_t keyhash[32];
|
||||
uint8_t pad[64];
|
||||
uint64_t i;
|
||||
|
||||
if (keylen > 64) {
|
||||
blake256_hash(keyhash, key, keylen);
|
||||
key = keyhash;
|
||||
keylen = 28;
|
||||
}
|
||||
|
||||
blake224_init(&S->inner);
|
||||
memset(pad, 0x36, 64);
|
||||
for (i = 0; i < keylen; ++i) {
|
||||
pad[i] ^= key[i];
|
||||
}
|
||||
blake224_update(&S->inner, pad, 512);
|
||||
|
||||
blake224_init(&S->outer);
|
||||
memset(pad, 0x5c, 64);
|
||||
for (i = 0; i < keylen; ++i) {
|
||||
pad[i] ^= key[i];
|
||||
}
|
||||
blake224_update(&S->outer, pad, 512);
|
||||
|
||||
memset(keyhash, 0, 32);
|
||||
}
|
||||
|
||||
/*
 * Absorb message data into an HMAC-BLAKE-256 computation.
 * datalen is the length of `data` in BITS. Only the inner hash is updated;
 * the outer hash is not touched by this function.
 */
void hmac_blake256_update(hmac_state *S, const uint8_t *data, uint64_t datalen) {
    // update the inner state
    blake256_update(&S->inner, data, datalen);
}
|
||||
|
||||
/*
 * Absorb message data into an HMAC-BLAKE-224 computation.
 * datalen is the length of `data` in BITS. Only the inner hash is updated;
 * the outer hash is not touched by this function.
 */
void hmac_blake224_update(hmac_state *S, const uint8_t *data, uint64_t datalen) {
    // update the inner state
    blake224_update(&S->inner, data, datalen);
}
|
||||
|
||||
/*
 * Finish an HMAC-BLAKE-256 computation: finalise the inner hash, feed its
 * 256-bit digest to the outer hash, and write the outer digest to `digest`.
 */
void hmac_blake256_final(hmac_state *S, uint8_t *digest) {
    uint8_t inner_digest[32];

    blake256_final(&S->inner, inner_digest);

    blake256_update(&S->outer, inner_digest, 256);
    blake256_final(&S->outer, digest);

    /* clear the intermediate digest */
    memset(inner_digest, 0, sizeof inner_digest);
}
|
||||
|
||||
/*
 * Finish an HMAC-BLAKE-224 computation: finalise the inner hash, feed the
 * first 224 bits of its output to the outer hash, and write the outer
 * digest to `digest`.
 */
void hmac_blake224_final(hmac_state *S, uint8_t *digest) {
    uint8_t inner_digest[32];

    blake224_final(&S->inner, inner_digest);

    blake224_update(&S->outer, inner_digest, 224);
    blake224_final(&S->outer, digest);

    /* clear the intermediate digest */
    memset(inner_digest, 0, sizeof inner_digest);
}
|
||||
|
||||
/*
 * One-shot HMAC-BLAKE-256.
 *
 * out    - receives the digest
 * key    - key bytes, keylen BYTES long
 * in     - message, inlen BYTES long (converted to bits for the update call)
 */
void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint64_t inlen) {
    hmac_state S;
    hmac_blake256_init(&S, key, keylen);
    hmac_blake256_update(&S, in, inlen * 8);
    hmac_blake256_final(&S, out);
}
|
||||
|
||||
/*
 * One-shot HMAC-BLAKE-224.
 *
 * out    - receives the digest
 * key    - key bytes, keylen BYTES long
 * in     - message, inlen BYTES long (converted to bits for the update call)
 */
void hmac_blake224_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint64_t inlen) {
    hmac_state S;
    hmac_blake224_init(&S, key, keylen);
    hmac_blake224_update(&S, in, inlen * 8);
    hmac_blake224_final(&S, out);
}
|
||||
43
src/crypto/cn/c_blake256.h
Normal file
43
src/crypto/cn/c_blake256.h
Normal file
@@ -0,0 +1,43 @@
|
||||
/*
 * BLAKE-256 / BLAKE-224 hashing and the matching HMAC constructions.
 *
 * Length conventions used by the implementation:
 *   *_update : length argument is in BITS
 *   *_hash   : length arguments are in BYTES
 *   hmac_*_init : key length is in BYTES
 */
#ifndef _BLAKE256_H_
#define _BLAKE256_H_

#include <stdint.h>

/* Hash state shared by BLAKE-256 and BLAKE-224. */
typedef struct {
    /* h: chaining value; s: salt (zeroed by the init functions);
     * t: processed-bit counter, t[0] low word and t[1] high word */
    uint32_t h[8], s[4], t[2];
    /* buflen: number of BITS currently held in buf;
     * nullt: when non-zero the compression function leaves the counter out */
    int buflen, nullt;
    /* pending partial input block */
    uint8_t  buf[64];
} state;

/* HMAC state: one hash for the inner pass and one for the outer pass. */
typedef struct {
    state inner;
    state outer;
} hmac_state;

/* Reset a state for a new hash computation. */
void blake256_init(state *);
void blake224_init(state *);

/* Absorb input; the length is given in bits. */
void blake256_update(state *, const uint8_t *, uint64_t);
void blake224_update(state *, const uint8_t *, uint64_t);

/* Finish and write the digest; both variants write 32 bytes to the buffer. */
void blake256_final(state *, uint8_t *);
void blake224_final(state *, uint8_t *);

/* One-shot hashing: (out, in, length in bytes). */
void blake256_hash(uint8_t *, const uint8_t *, uint64_t);
void blake224_hash(uint8_t *, const uint8_t *, uint64_t);

/* HMAC functions: */

/* (state, key, key length in bytes) */
void hmac_blake256_init(hmac_state *, const uint8_t *, uint64_t);
void hmac_blake224_init(hmac_state *, const uint8_t *, uint64_t);

/* (state, data, data length in bits) */
void hmac_blake256_update(hmac_state *, const uint8_t *, uint64_t);
void hmac_blake224_update(hmac_state *, const uint8_t *, uint64_t);

/* (state, digest out) */
void hmac_blake256_final(hmac_state *, uint8_t *);
void hmac_blake224_final(hmac_state *, uint8_t *);

/* One-shot HMAC: (out, key, key length in bytes, in, input length in bytes). */
void hmac_blake256_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint64_t);
void hmac_blake224_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint64_t);

#endif /* _BLAKE256_H_ */
|
||||
360
src/crypto/cn/c_groestl.c
Normal file
360
src/crypto/cn/c_groestl.c
Normal file
@@ -0,0 +1,360 @@
|
||||
/* hash.c April 2012
|
||||
* Groestl ANSI C code optimised for 32-bit machines
|
||||
* Author: Thomas Krinninger
|
||||
*
|
||||
* This work is based on the implementation of
|
||||
* Soeren S. Thomsen and Krystian Matusiewicz
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
#include "c_groestl.h"
|
||||
#include "groestl_tables.h"
|
||||
|
||||
#define P_TYPE 0
|
||||
#define Q_TYPE 1
|
||||
|
||||
const uint8_t shift_Values[2][8] = {{0,1,2,3,4,5,6,7},{1,3,5,7,0,2,4,6}};
|
||||
|
||||
const uint8_t indices_cyclic[15] = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6};
|
||||
|
||||
|
||||
/*
 * Treat (v1, v2) as one 64-bit value with v1 as the upper 32 bits and
 * rotate it left by amount_bytes bytes, using temp_var as scratch.
 * amount_bytes must be 1, 2 or 3: a value of 0 or 4 would produce a
 * 32-bit shift by 32.
 */
#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) {temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \
															v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \
															v1 = temp_var;}


/*
 * Compute one output column (two 32-bit words, y[i] and y[i+1]).
 *
 * x is read as bytes; c0..c7 select 32-bit word positions in x, and byte
 * k % 4 of word ck is used as the index into the lookup table T, which
 * stores two 32-bit words per entry (T[2*b] and T[2*b+1]).
 * NOTE(review): T is not defined in this file; presumably it comes from
 * groestl_tables.h -- confirm.
 *
 * For c0..c3 the two table words go to (tu, tl); for c4..c7 they are
 * swapped and go to (tl, tu). Entries for k = 1..3 are rotated by k bytes
 * with ROTATE_COLUMN_DOWN before being XORed in.
 * tv1, tv2, tu, tl and t are caller-supplied scratch variables.
 */
#define COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t)				\
   tu = T[2*(uint32_t)x[4*c0+0]];			    \
   tl = T[2*(uint32_t)x[4*c0+0]+1];		    \
   tv1 = T[2*(uint32_t)x[4*c1+1]];			\
   tv2 = T[2*(uint32_t)x[4*c1+1]+1];			\
   ROTATE_COLUMN_DOWN(tv1,tv2,1,t)	\
   tu ^= tv1;						\
   tl ^= tv2;						\
   tv1 = T[2*(uint32_t)x[4*c2+2]];			\
   tv2 = T[2*(uint32_t)x[4*c2+2]+1];			\
   ROTATE_COLUMN_DOWN(tv1,tv2,2,t)	\
   tu ^= tv1;						\
   tl ^= tv2;   					\
   tv1 = T[2*(uint32_t)x[4*c3+3]];			\
   tv2 = T[2*(uint32_t)x[4*c3+3]+1];			\
   ROTATE_COLUMN_DOWN(tv1,tv2,3,t)	\
   tu ^= tv1;						\
   tl ^= tv2;						\
   tl ^= T[2*(uint32_t)x[4*c4+0]];			\
   tu ^= T[2*(uint32_t)x[4*c4+0]+1];			\
   tv1 = T[2*(uint32_t)x[4*c5+1]];			\
   tv2 = T[2*(uint32_t)x[4*c5+1]+1];			\
   ROTATE_COLUMN_DOWN(tv1,tv2,1,t)	\
   tl ^= tv1;						\
   tu ^= tv2;						\
   tv1 = T[2*(uint32_t)x[4*c6+2]];			\
   tv2 = T[2*(uint32_t)x[4*c6+2]+1];			\
   ROTATE_COLUMN_DOWN(tv1,tv2,2,t)	\
   tl ^= tv1;						\
   tu ^= tv2;   					\
   tv1 = T[2*(uint32_t)x[4*c7+3]];			\
   tv2 = T[2*(uint32_t)x[4*c7+3]+1];			\
   ROTATE_COLUMN_DOWN(tv1,tv2,3,t)	\
   tl ^= tv1;						\
   tu ^= tv2;						\
   y[i] = tu;						\
   y[i+1] = tl;
|
||||
|
||||
|
||||
/*
 * One round of permutation P (short variants).
 *
 * x - 64-byte input state; modified in place by the round-constant step
 * y - receives the 16-word output state
 * r - round value XORed into every even-indexed 32-bit word of x
 */
static void RND512P(uint8_t *x, uint32_t *y, uint32_t r) {
    uint32_t va, vb, hi, lo, scratch;
    uint32_t *words = (uint32_t*)x;
    uint32_t col;

    /* add round constant: word 2*col gets (col * 0x10) ^ r */
    for (col = 0; col < 8; col++) {
        words[2 * col] ^= (col << 4) ^ r;
    }

    COLUMN(x, y,  0,  0,  2,  4,  6,  9, 11, 13, 15, va, vb, hi, lo, scratch);
    COLUMN(x, y,  2,  2,  4,  6,  8, 11, 13, 15,  1, va, vb, hi, lo, scratch);
    COLUMN(x, y,  4,  4,  6,  8, 10, 13, 15,  1,  3, va, vb, hi, lo, scratch);
    COLUMN(x, y,  6,  6,  8, 10, 12, 15,  1,  3,  5, va, vb, hi, lo, scratch);
    COLUMN(x, y,  8,  8, 10, 12, 14,  1,  3,  5,  7, va, vb, hi, lo, scratch);
    COLUMN(x, y, 10, 10, 12, 14,  0,  3,  5,  7,  9, va, vb, hi, lo, scratch);
    COLUMN(x, y, 12, 12, 14,  0,  2,  5,  7,  9, 11, va, vb, hi, lo, scratch);
    COLUMN(x, y, 14, 14,  0,  2,  4,  7,  9, 11, 13, va, vb, hi, lo, scratch);
}
|
||||
|
||||
/*
 * One round of permutation Q (short variants).
 *
 * x - 64-byte input state; modified in place by the round-constant step
 * y - receives the 16-word output state
 * r - round value XORed into every odd-indexed 32-bit word of x
 */
static void RND512Q(uint8_t *x, uint32_t *y, uint32_t r) {
    uint32_t va, vb, hi, lo, scratch;
    uint32_t *words = (uint32_t*)x;
    uint32_t col;

    /* add round constant: even words are inverted, odd word 2*col+1 gets
     * (0xffffffff with col subtracted from its top nibble) ^ r */
    for (col = 0; col < 8; col++) {
        words[2 * col]      = ~words[2 * col];
        words[2 * col + 1] ^= (0xffffffffu ^ (col << 28)) ^ r;
    }

    COLUMN(x, y,  0,  2,  6, 10, 14,  1,  5,  9, 13, va, vb, hi, lo, scratch);
    COLUMN(x, y,  2,  4,  8, 12,  0,  3,  7, 11, 15, va, vb, hi, lo, scratch);
    COLUMN(x, y,  4,  6, 10, 14,  2,  5,  9, 13,  1, va, vb, hi, lo, scratch);
    COLUMN(x, y,  6,  8, 12,  0,  4,  7, 11, 15,  3, va, vb, hi, lo, scratch);
    COLUMN(x, y,  8, 10, 14,  2,  6,  9, 13,  1,  5, va, vb, hi, lo, scratch);
    COLUMN(x, y, 10, 12,  0,  4,  8, 11, 15,  3,  7, va, vb, hi, lo, scratch);
    COLUMN(x, y, 12, 14,  2,  6, 10, 13,  1,  5,  9, va, vb, hi, lo, scratch);
    COLUMN(x, y, 14,  0,  4,  8, 12, 15,  3,  7, 11, va, vb, hi, lo, scratch);
}
|
||||
|
||||
/* compute compression function (short variants) */
|
||||
static void F512(uint32_t *h, const uint32_t *m) {
|
||||
int i;
|
||||
uint32_t Ptmp[2*COLS512];
|
||||
uint32_t Qtmp[2*COLS512];
|
||||
uint32_t y[2*COLS512];
|
||||
uint32_t z[2*COLS512];
|
||||
|
||||
for (i = 0; i < 2*COLS512; i++) {
|
||||
z[i] = m[i];
|
||||
Ptmp[i] = h[i]^m[i];
|
||||
}
|
||||
|
||||
/* compute Q(m) */
|
||||
RND512Q((uint8_t*)z, y, 0x00000000);
|
||||
RND512Q((uint8_t*)y, z, 0x01000000);
|
||||
RND512Q((uint8_t*)z, y, 0x02000000);
|
||||
RND512Q((uint8_t*)y, z, 0x03000000);
|
||||
RND512Q((uint8_t*)z, y, 0x04000000);
|
||||
RND512Q((uint8_t*)y, z, 0x05000000);
|
||||
RND512Q((uint8_t*)z, y, 0x06000000);
|
||||
RND512Q((uint8_t*)y, z, 0x07000000);
|
||||
RND512Q((uint8_t*)z, y, 0x08000000);
|
||||
RND512Q((uint8_t*)y, Qtmp, 0x09000000);
|
||||
|
||||
/* compute P(h+m) */
|
||||
RND512P((uint8_t*)Ptmp, y, 0x00000000);
|
||||
RND512P((uint8_t*)y, z, 0x00000001);
|
||||
RND512P((uint8_t*)z, y, 0x00000002);
|
||||
RND512P((uint8_t*)y, z, 0x00000003);
|
||||
RND512P((uint8_t*)z, y, 0x00000004);
|
||||
RND512P((uint8_t*)y, z, 0x00000005);
|
||||
RND512P((uint8_t*)z, y, 0x00000006);
|
||||
RND512P((uint8_t*)y, z, 0x00000007);
|
||||
RND512P((uint8_t*)z, y, 0x00000008);
|
||||
RND512P((uint8_t*)y, Ptmp, 0x00000009);
|
||||
|
||||
/* compute P(h+m) + Q(m) + h */
|
||||
for (i = 0; i < 2*COLS512; i++) {
|
||||
h[i] ^= Ptmp[i]^Qtmp[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* digest up to msglen bytes of input (full blocks only) */
|
||||
static void Transform(groestlHashState *ctx,
|
||||
const uint8_t *input,
|
||||
int msglen) {
|
||||
|
||||
/* digest message, one block at a time */
|
||||
for (; msglen >= SIZE512;
|
||||
msglen -= SIZE512, input += SIZE512) {
|
||||
F512(ctx->chaining,(uint32_t*)input);
|
||||
|
||||
/* increment block counter */
|
||||
ctx->block_counter1++;
|
||||
if (ctx->block_counter1 == 0) ctx->block_counter2++;
|
||||
}
|
||||
}
|
||||
|
||||
/* given state h, do h <- P(h)+h */
|
||||
static void OutputTransformation(groestlHashState *ctx) {
|
||||
int j;
|
||||
uint32_t temp[2*COLS512];
|
||||
uint32_t y[2*COLS512];
|
||||
uint32_t z[2*COLS512];
|
||||
|
||||
|
||||
|
||||
for (j = 0; j < 2*COLS512; j++) {
|
||||
temp[j] = ctx->chaining[j];
|
||||
}
|
||||
RND512P((uint8_t*)temp, y, 0x00000000);
|
||||
RND512P((uint8_t*)y, z, 0x00000001);
|
||||
RND512P((uint8_t*)z, y, 0x00000002);
|
||||
RND512P((uint8_t*)y, z, 0x00000003);
|
||||
RND512P((uint8_t*)z, y, 0x00000004);
|
||||
RND512P((uint8_t*)y, z, 0x00000005);
|
||||
RND512P((uint8_t*)z, y, 0x00000006);
|
||||
RND512P((uint8_t*)y, z, 0x00000007);
|
||||
RND512P((uint8_t*)z, y, 0x00000008);
|
||||
RND512P((uint8_t*)y, temp, 0x00000009);
|
||||
for (j = 0; j < 2*COLS512; j++) {
|
||||
ctx->chaining[j] ^= temp[j];
|
||||
}
|
||||
}
|
||||
|
||||
/* initialise context */
|
||||
static void Init(groestlHashState* ctx) {
|
||||
int i = 0;
|
||||
/* allocate memory for state and data buffer */
|
||||
|
||||
for(;i<(SIZE512/sizeof(uint32_t));i++)
|
||||
{
|
||||
ctx->chaining[i] = 0;
|
||||
}
|
||||
|
||||
/* set initial value */
|
||||
ctx->chaining[2*COLS512-1] = u32BIG((uint32_t)HASH_BIT_LEN);
|
||||
|
||||
/* set other variables */
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->block_counter1 = 0;
|
||||
ctx->block_counter2 = 0;
|
||||
ctx->bits_in_last_byte = 0;
|
||||
}
|
||||
|
||||
/* update state with databitlen bits of input */
|
||||
static void Update(groestlHashState* ctx,
|
||||
const BitSequence* input,
|
||||
DataLength databitlen) {
|
||||
int index = 0;
|
||||
int msglen = (int)(databitlen/8);
|
||||
int rem = (int)(databitlen%8);
|
||||
|
||||
/* if the buffer contains data that has not yet been digested, first
|
||||
add data to buffer until full */
|
||||
if (ctx->buf_ptr) {
|
||||
while (ctx->buf_ptr < SIZE512 && index < msglen) {
|
||||
ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
|
||||
}
|
||||
if (ctx->buf_ptr < SIZE512) {
|
||||
/* buffer still not full, return */
|
||||
if (rem) {
|
||||
ctx->bits_in_last_byte = rem;
|
||||
ctx->buffer[(int)ctx->buf_ptr++] = input[index];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* digest buffer */
|
||||
ctx->buf_ptr = 0;
|
||||
Transform(ctx, ctx->buffer, SIZE512);
|
||||
}
|
||||
|
||||
/* digest bulk of message */
|
||||
Transform(ctx, input+index, msglen-index);
|
||||
index += ((msglen-index)/SIZE512)*SIZE512;
|
||||
|
||||
/* store remaining data in buffer */
|
||||
while (index < msglen) {
|
||||
ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
|
||||
}
|
||||
|
||||
/* if non-integral number of bytes have been supplied, store
|
||||
remaining bits in last byte, together with information about
|
||||
number of bits */
|
||||
if (rem) {
|
||||
ctx->bits_in_last_byte = rem;
|
||||
ctx->buffer[(int)ctx->buf_ptr++] = input[index];
|
||||
}
|
||||
}
|
||||
|
||||
#define BILB ctx->bits_in_last_byte
|
||||
|
||||
/* finalise: process remaining data (including padding), perform
|
||||
output transformation, and write hash result to 'output' */
|
||||
static void Final(groestlHashState* ctx,
|
||||
BitSequence* output) {
|
||||
int i, j = 0, hashbytelen = HASH_BIT_LEN/8;
|
||||
uint8_t *s = (BitSequence*)ctx->chaining;
|
||||
|
||||
/* pad with '1'-bit and first few '0'-bits */
|
||||
if (BILB) {
|
||||
ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
|
||||
ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB);
|
||||
BILB = 0;
|
||||
}
|
||||
else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
|
||||
|
||||
/* pad with '0'-bits */
|
||||
if (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) {
|
||||
/* padding requires two blocks */
|
||||
while (ctx->buf_ptr < SIZE512) {
|
||||
ctx->buffer[(int)ctx->buf_ptr++] = 0;
|
||||
}
|
||||
/* digest first padding block */
|
||||
Transform(ctx, ctx->buffer, SIZE512);
|
||||
ctx->buf_ptr = 0;
|
||||
}
|
||||
while (ctx->buf_ptr < SIZE512-LENGTHFIELDLEN) {
|
||||
ctx->buffer[(int)ctx->buf_ptr++] = 0;
|
||||
}
|
||||
|
||||
/* length padding */
|
||||
ctx->block_counter1++;
|
||||
if (ctx->block_counter1 == 0) ctx->block_counter2++;
|
||||
ctx->buf_ptr = SIZE512;
|
||||
|
||||
while (ctx->buf_ptr > SIZE512-(int)sizeof(uint32_t)) {
|
||||
ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1;
|
||||
ctx->block_counter1 >>= 8;
|
||||
}
|
||||
while (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) {
|
||||
ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2;
|
||||
ctx->block_counter2 >>= 8;
|
||||
}
|
||||
/* digest final padding block */
|
||||
Transform(ctx, ctx->buffer, SIZE512);
|
||||
/* perform output transformation */
|
||||
OutputTransformation(ctx);
|
||||
|
||||
/* store hash result in output */
|
||||
for (i = SIZE512-hashbytelen; i < SIZE512; i++,j++) {
|
||||
output[j] = s[i];
|
||||
}
|
||||
|
||||
/* zeroise relevant variables and deallocate memory */
|
||||
for (i = 0; i < COLS512; i++) {
|
||||
ctx->chaining[i] = 0;
|
||||
}
|
||||
for (i = 0; i < SIZE512; i++) {
|
||||
ctx->buffer[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* hash bit sequence */
|
||||
void groestl(const BitSequence* data,
|
||||
DataLength databitlen,
|
||||
BitSequence* hashval) {
|
||||
|
||||
groestlHashState context;
|
||||
|
||||
/* initialise */
|
||||
Init(&context);
|
||||
|
||||
|
||||
/* process message */
|
||||
Update(&context, data, databitlen);
|
||||
|
||||
/* finalise */
|
||||
Final(&context, hashval);
|
||||
}
|
||||
/*
|
||||
static int crypto_hash(unsigned char *out,
|
||||
const unsigned char *in,
|
||||
unsigned long long len)
|
||||
{
|
||||
groestl(in, 8*len, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
*/
|
||||
60
src/crypto/cn/c_groestl.h
Normal file
60
src/crypto/cn/c_groestl.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
 * Groestl (short variant, 256-bit digest) -- sizes, helper macros, state
 * type and the single public entry point.
 *
 * NOTE(review): the include guard name __hash_h is very generic and this
 * header itself includes "hash.h"; verify the two guards cannot collide.
 */
#ifndef __hash_h
#define __hash_h
/*
#include "crypto_uint8.h"
#include "crypto_uint32.h"
#include "crypto_uint64.h"
#include "crypto_hash.h"

typedef crypto_uint8 uint8_t;
typedef crypto_uint32 uint32_t;
typedef crypto_uint64 uint64_t;
*/
#include <stdint.h>

#include "hash.h"

/* some sizes (number of bytes) */
#define ROWS 8
#define LENGTHFIELDLEN ROWS
#define COLS512 8

/* size of one message block / of the state, in bytes (8 * 8 = 64) */
#define SIZE512 (ROWS*COLS512)

#define ROUNDS512 10
#define HASH_BIT_LEN 256

/* rotate a 32-bit value left by n bits; li_32 is defined further down, which
   is fine because macros are only expanded where they are used */
#define ROTL32(v, n) ((((v)<<(n))|((v)>>(32-(n))))&li_32(ffffffff))


/* build an unsigned hexadecimal literal from bare hex digits */
#define li_32(h) 0x##h##u
/* byte n (0 = least significant) of a 32-bit value */
#define EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n)))
/* reverse the byte order of a 32-bit value */
#define u32BIG(a)				\
  ((ROTL32(a,8) & li_32(00FF00FF)) |		\
   (ROTL32(a,24) & li_32(FF00FF00)))


/* NIST API begin */
typedef struct {
  uint32_t chaining[SIZE512/sizeof(uint32_t)];            /* actual state */
  uint32_t block_counter1,
  block_counter2;         /* message block counter(s): low and high word */
  BitSequence buffer[SIZE512];      /* data buffer */
  int buf_ptr;              /* data buffer pointer (next free byte index) */
  int bits_in_last_byte;    /* no. of message bits in last byte of
			       data buffer */
} groestlHashState;

/*void Init(hashState*);
void Update(hashState*, const BitSequence*, DataLength);
void Final(hashState*, BitSequence*); */
/* hash `DataLength` bits of input and write HASH_BIT_LEN/8 bytes of digest */
void groestl(const BitSequence*, DataLength, BitSequence*);
/* NIST API end   */

/*
int crypto_hash(unsigned char *out,
		const unsigned char *in,
		unsigned long long len);
*/

#endif /* __hash_h */
|
||||
367
src/crypto/cn/c_jh.c
Normal file
367
src/crypto/cn/c_jh.c
Normal file
@@ -0,0 +1,367 @@
|
||||
/*This program gives the 64-bit optimized bitslice implementation of JH using ANSI C
|
||||
|
||||
--------------------------------
|
||||
Performance
|
||||
|
||||
Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz)
|
||||
Operating System: 64-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic)
|
||||
Speed for long message:
|
||||
1) 45.8 cycles/byte compiler: Intel C++ Compiler 11.1 compilation option: icc -O2
|
||||
2) 56.8 cycles/byte compiler: gcc 4.4.3 compilation option: gcc -O3
|
||||
|
||||
--------------------------------
|
||||
Last Modified: January 16, 2011
|
||||
*/
|
||||
|
||||
#include "c_jh.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
/*typedef unsigned long long uint64;*/
|
||||
typedef uint64_t uint64;
|
||||
|
||||
/*define data alignment for different C compilers*/
|
||||
#if defined(__GNUC__)
|
||||
#define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
|
||||
#else
|
||||
#define DATA_ALIGN16(x) __declspec(align(16)) x
|
||||
#endif
|
||||
|
||||
|
||||
/* JH hash state. */
typedef struct {
	int hashbitlen;	   	              /* the message digest size, in bits */
	unsigned long long databitlen;    /* the message size in bits */
	unsigned long long datasize_in_buffer;      /* number of message bits still held in the buffer; assumed to be a multiple of 8 bits except for the last partial block at the end of the message */
	DATA_ALIGN16(uint64 x[8][2]);     /* the 1024-bit state, 16-byte aligned; ( x[i][0] || x[i][1] ) is the ith row of the state in the pseudocode */
	unsigned char buffer[64];         /* the 512-bit message block to be hashed */
} hashState;
|
||||
|
||||
|
||||
/*The initial hash value H(0)*/
|
||||
const unsigned char JH224_H0[128]={0x2d,0xfe,0xdd,0x62,0xf9,0x9a,0x98,0xac,0xae,0x7c,0xac,0xd6,0x19,0xd6,0x34,0xe7,0xa4,0x83,0x10,0x5,0xbc,0x30,0x12,0x16,0xb8,0x60,0x38,0xc6,0xc9,0x66,0x14,0x94,0x66,0xd9,0x89,0x9f,0x25,0x80,0x70,0x6f,0xce,0x9e,0xa3,0x1b,0x1d,0x9b,0x1a,0xdc,0x11,0xe8,0x32,0x5f,0x7b,0x36,0x6e,0x10,0xf9,0x94,0x85,0x7f,0x2,0xfa,0x6,0xc1,0x1b,0x4f,0x1b,0x5c,0xd8,0xc8,0x40,0xb3,0x97,0xf6,0xa1,0x7f,0x6e,0x73,0x80,0x99,0xdc,0xdf,0x93,0xa5,0xad,0xea,0xa3,0xd3,0xa4,0x31,0xe8,0xde,0xc9,0x53,0x9a,0x68,0x22,0xb4,0xa9,0x8a,0xec,0x86,0xa1,0xe4,0xd5,0x74,0xac,0x95,0x9c,0xe5,0x6c,0xf0,0x15,0x96,0xd,0xea,0xb5,0xab,0x2b,0xbf,0x96,0x11,0xdc,0xf0,0xdd,0x64,0xea,0x6e};
|
||||
const unsigned char JH256_H0[128]={0xeb,0x98,0xa3,0x41,0x2c,0x20,0xd3,0xeb,0x92,0xcd,0xbe,0x7b,0x9c,0xb2,0x45,0xc1,0x1c,0x93,0x51,0x91,0x60,0xd4,0xc7,0xfa,0x26,0x0,0x82,0xd6,0x7e,0x50,0x8a,0x3,0xa4,0x23,0x9e,0x26,0x77,0x26,0xb9,0x45,0xe0,0xfb,0x1a,0x48,0xd4,0x1a,0x94,0x77,0xcd,0xb5,0xab,0x26,0x2,0x6b,0x17,0x7a,0x56,0xf0,0x24,0x42,0xf,0xff,0x2f,0xa8,0x71,0xa3,0x96,0x89,0x7f,0x2e,0x4d,0x75,0x1d,0x14,0x49,0x8,0xf7,0x7d,0xe2,0x62,0x27,0x76,0x95,0xf7,0x76,0x24,0x8f,0x94,0x87,0xd5,0xb6,0x57,0x47,0x80,0x29,0x6c,0x5c,0x5e,0x27,0x2d,0xac,0x8e,0xd,0x6c,0x51,0x84,0x50,0xc6,0x57,0x5,0x7a,0xf,0x7b,0xe4,0xd3,0x67,0x70,0x24,0x12,0xea,0x89,0xe3,0xab,0x13,0xd3,0x1c,0xd7,0x69};
|
||||
const unsigned char JH384_H0[128]={0x48,0x1e,0x3b,0xc6,0xd8,0x13,0x39,0x8a,0x6d,0x3b,0x5e,0x89,0x4a,0xde,0x87,0x9b,0x63,0xfa,0xea,0x68,0xd4,0x80,0xad,0x2e,0x33,0x2c,0xcb,0x21,0x48,0xf,0x82,0x67,0x98,0xae,0xc8,0x4d,0x90,0x82,0xb9,0x28,0xd4,0x55,0xea,0x30,0x41,0x11,0x42,0x49,0x36,0xf5,0x55,0xb2,0x92,0x48,0x47,0xec,0xc7,0x25,0xa,0x93,0xba,0xf4,0x3c,0xe1,0x56,0x9b,0x7f,0x8a,0x27,0xdb,0x45,0x4c,0x9e,0xfc,0xbd,0x49,0x63,0x97,0xaf,0xe,0x58,0x9f,0xc2,0x7d,0x26,0xaa,0x80,0xcd,0x80,0xc0,0x8b,0x8c,0x9d,0xeb,0x2e,0xda,0x8a,0x79,0x81,0xe8,0xf8,0xd5,0x37,0x3a,0xf4,0x39,0x67,0xad,0xdd,0xd1,0x7a,0x71,0xa9,0xb4,0xd3,0xbd,0xa4,0x75,0xd3,0x94,0x97,0x6c,0x3f,0xba,0x98,0x42,0x73,0x7f};
|
||||
const unsigned char JH512_H0[128]={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b};
|
||||
|
||||
/*42 round constants, each round constant is 32-byte (256-bit)*/
|
||||
const unsigned char E8_bitslice_roundconstant[42][32]={
|
||||
{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40},
|
||||
{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31},
|
||||
{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc},
|
||||
{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3},
|
||||
{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23},
|
||||
{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97},
|
||||
{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14},
|
||||
{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4},
|
||||
{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36},
|
||||
{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f},
|
||||
{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b},
|
||||
{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62},
|
||||
{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5},
|
||||
{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f},
|
||||
{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a},
|
||||
{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf},
|
||||
{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0},
|
||||
{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a},
|
||||
{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6},
|
||||
{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67},
|
||||
{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18},
|
||||
{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e},
|
||||
{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1},
|
||||
{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83},
|
||||
{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef},
|
||||
{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65},
|
||||
{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c},
|
||||
{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71},
|
||||
{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0},
|
||||
{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f},
|
||||
{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad},
|
||||
{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6},
|
||||
{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63},
|
||||
{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f},
|
||||
{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a},
|
||||
{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5},
|
||||
{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48},
|
||||
{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e},
|
||||
{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7},
|
||||
{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde},
|
||||
{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a},
|
||||
{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}};
|
||||
|
||||
|
||||
static void E8(hashState *state); /*The bijective function E8, in bitslice form*/
|
||||
static void F8(hashState *state); /*The compression function F8 */
|
||||
|
||||
/*The API functions*/
|
||||
static HashReturn Init(hashState *state, int hashbitlen);
|
||||
static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen);
|
||||
static HashReturn Final(hashState *state, BitSequence *hashval);
|
||||
HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval);
|
||||
|
||||
/*
 * Bit-group swapping helpers used by the swapping layer of E8.
 *
 * Each macro rewrites its 64-bit lvalue argument in place. The argument is
 * evaluated more than once, so it must not have side effects. Every macro
 * expands to one parenthesised assignment expression with no trailing
 * semicolon, so `SWAPn(x);` is exactly one statement and is safe inside an
 * unbraced if/else.
 */

/* swap bit 2i with bit 2i+1 of 64-bit x */
#define SWAP1(x)  ((x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1)))

/* swap bits 4i||4i+1 with bits 4i+2||4i+3 of 64-bit x */
#define SWAP2(x)  ((x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2)))

/* swap bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of 64-bit x */
#define SWAP4(x)  ((x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4)))

/* swap bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 of 64-bit x */
#define SWAP8(x)  ((x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8)))

/* swap bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 of 64-bit x */
#define SWAP16(x) ((x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16)))

/* swap the low 32 bits with the high 32 bits of 64-bit x */
#define SWAP32(x) ((x) = (((x) << 32) | ((x) >> 32)))
|
||||
|
||||
/*
 * The MDS transform.
 *
 * Mixes the eight lvalue arguments in place with XORs. The statements run in
 * the order listed, and later lines read values updated by earlier ones.
 * The body is wrapped in do { } while (0) so that `L(...);` is a single
 * statement wherever it is used.
 */
#define L(m0,m1,m2,m3,m4,m5,m6,m7)   \
    do {                             \
        (m4) ^= (m1);                \
        (m5) ^= (m2);                \
        (m6) ^= (m0) ^ (m3);         \
        (m7) ^= (m0);                \
        (m0) ^= (m5);                \
        (m1) ^= (m6);                \
        (m2) ^= (m4) ^ (m7);         \
        (m3) ^= (m4);                \
    } while (0)
|
||||
|
||||
/*
 * Two Sboxes computed in parallel; each Sbox implements S0 and S1, selected by
 * a constant bit. Computing two at once is meant to make fuller use of the
 * processor's parallelism.
 *
 *   m0..m3, cc0 : inputs/outputs and selector word of the first Sbox
 *   m4..m7, cc1 : inputs/outputs and selector word of the second Sbox
 *
 * m0..m7 must be lvalues and are updated in place. The macro uses two scratch
 * variables named temp0 and temp1 that must already be declared in the
 * calling scope. The body is wrapped in do { } while (0) so that `SS(...);`
 * is a single statement.
 */
#define SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1)   \
    do {                                      \
        (m3) = ~(m3);                         \
        (m7) = ~(m7);                         \
        (m0) ^= ((~(m2)) & (cc0));            \
        (m4) ^= ((~(m6)) & (cc1));            \
        temp0 = (cc0) ^ ((m0) & (m1));        \
        temp1 = (cc1) ^ ((m4) & (m5));        \
        (m0) ^= ((m2) & (m3));                \
        (m4) ^= ((m6) & (m7));                \
        (m3) ^= ((~(m1)) & (m2));             \
        (m7) ^= ((~(m5)) & (m6));             \
        (m1) ^= ((m0) & (m2));                \
        (m5) ^= ((m4) & (m6));                \
        (m2) ^= ((m0) & (~(m3)));             \
        (m6) ^= ((m4) & (~(m7)));             \
        (m0) ^= ((m1) | (m3));                \
        (m4) ^= ((m5) | (m7));                \
        (m3) ^= ((m1) & (m2));                \
        (m7) ^= ((m5) & (m6));                \
        (m1) ^= (temp0 & (m0));               \
        (m5) ^= (temp1 & (m4));               \
        (m2) ^= temp0;                        \
        (m6) ^= temp1;                        \
    } while (0)
|
||||
|
||||
/*The bijective function E8, in bitslice form*/
|
||||
static void E8(hashState *state)
|
||||
{
|
||||
uint64 i,roundnumber,temp0,temp1;
|
||||
|
||||
for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) {
|
||||
/*round 7*roundnumber+0: Sbox, MDS and Swapping layers*/
|
||||
for (i = 0; i < 2; i++) {
|
||||
SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i+2] );
|
||||
L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
|
||||
SWAP1(state->x[1][i]); SWAP1(state->x[3][i]); SWAP1(state->x[5][i]); SWAP1(state->x[7][i]);
|
||||
}
|
||||
|
||||
/*round 7*roundnumber+1: Sbox, MDS and Swapping layers*/
|
||||
for (i = 0; i < 2; i++) {
|
||||
SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i+2] );
|
||||
L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
|
||||
SWAP2(state->x[1][i]); SWAP2(state->x[3][i]); SWAP2(state->x[5][i]); SWAP2(state->x[7][i]);
|
||||
}
|
||||
|
||||
/*round 7*roundnumber+2: Sbox, MDS and Swapping layers*/
|
||||
for (i = 0; i < 2; i++) {
|
||||
SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i+2] );
|
||||
L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
|
||||
SWAP4(state->x[1][i]); SWAP4(state->x[3][i]); SWAP4(state->x[5][i]); SWAP4(state->x[7][i]);
|
||||
}
|
||||
|
||||
/*round 7*roundnumber+3: Sbox, MDS and Swapping layers*/
|
||||
for (i = 0; i < 2; i++) {
|
||||
SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i+2] );
|
||||
L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
|
||||
SWAP8(state->x[1][i]); SWAP8(state->x[3][i]); SWAP8(state->x[5][i]); SWAP8(state->x[7][i]);
|
||||
}
|
||||
|
||||
/*round 7*roundnumber+4: Sbox, MDS and Swapping layers*/
|
||||
for (i = 0; i < 2; i++) {
|
||||
SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i+2] );
|
||||
L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
|
||||
SWAP16(state->x[1][i]); SWAP16(state->x[3][i]); SWAP16(state->x[5][i]); SWAP16(state->x[7][i]);
|
||||
}
|
||||
|
||||
/*round 7*roundnumber+5: Sbox, MDS and Swapping layers*/
|
||||
for (i = 0; i < 2; i++) {
|
||||
SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i+2] );
|
||||
L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
|
||||
SWAP32(state->x[1][i]); SWAP32(state->x[3][i]); SWAP32(state->x[5][i]); SWAP32(state->x[7][i]);
|
||||
}
|
||||
|
||||
/*round 7*roundnumber+6: Sbox and MDS layers*/
|
||||
for (i = 0; i < 2; i++) {
|
||||
SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i+2] );
|
||||
L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
|
||||
}
|
||||
/*round 7*roundnumber+6: swapping layer*/
|
||||
for (i = 1; i < 8; i = i+2) {
|
||||
temp0 = state->x[i][0]; state->x[i][0] = state->x[i][1]; state->x[i][1] = temp0;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*The compression function F8 */
|
||||
static void F8(hashState *state)
|
||||
{
|
||||
uint64 i;
|
||||
|
||||
/*xor the 512-bit message with the fist half of the 1024-bit hash state*/
|
||||
for (i = 0; i < 8; i++) state->x[i >> 1][i & 1] ^= ((uint64*)state->buffer)[i];
|
||||
|
||||
/*the bijective function E8 */
|
||||
E8(state);
|
||||
|
||||
/*xor the 512-bit message with the second half of the 1024-bit hash state*/
|
||||
for (i = 0; i < 8; i++) state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64*)state->buffer)[i];
|
||||
}
|
||||
|
||||
/*before hashing a message, initialize the hash state as H0 */
|
||||
static HashReturn Init(hashState *state, int hashbitlen)
|
||||
{
|
||||
state->databitlen = 0;
|
||||
state->datasize_in_buffer = 0;
|
||||
|
||||
/*initialize the initial hash value of JH*/
|
||||
state->hashbitlen = hashbitlen;
|
||||
|
||||
/*load the intital hash value into state*/
|
||||
switch (hashbitlen)
|
||||
{
|
||||
case 224: memcpy(state->x,JH224_H0,128); break;
|
||||
case 256: memcpy(state->x,JH256_H0,128); break;
|
||||
case 384: memcpy(state->x,JH384_H0,128); break;
|
||||
case 512: memcpy(state->x,JH512_H0,128); break;
|
||||
}
|
||||
|
||||
return(SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
/*hash each 512-bit message block, except the last partial block*/
|
||||
static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
|
||||
{
|
||||
DataLength index; /*the starting address of the data to be compressed*/
|
||||
|
||||
state->databitlen += databitlen;
|
||||
index = 0;
|
||||
|
||||
/*if there is remaining data in the buffer, fill it to a full message block first*/
|
||||
/*we assume that the size of the data in the buffer is the multiple of 8 bits if it is not at the end of a message*/
|
||||
|
||||
/*There is data in the buffer, but the incoming data is insufficient for a full block*/
|
||||
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) {
|
||||
if ( (databitlen & 7) == 0 ) {
|
||||
memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)) ;
|
||||
}
|
||||
else memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1) ;
|
||||
state->datasize_in_buffer += databitlen;
|
||||
databitlen = 0;
|
||||
}
|
||||
|
||||
/*There is data in the buffer, and the incoming data is sufficient for a full block*/
|
||||
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) {
|
||||
memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
|
||||
index = 64-(state->datasize_in_buffer >> 3);
|
||||
databitlen = databitlen - (512 - state->datasize_in_buffer);
|
||||
F8(state);
|
||||
state->datasize_in_buffer = 0;
|
||||
}
|
||||
|
||||
/*hash the remaining full message blocks*/
|
||||
for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) {
|
||||
memcpy(state->buffer, data+index, 64);
|
||||
F8(state);
|
||||
}
|
||||
|
||||
/*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/
|
||||
if ( databitlen > 0) {
|
||||
if ((databitlen & 7) == 0)
|
||||
memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3);
|
||||
else
|
||||
memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1);
|
||||
state->datasize_in_buffer = databitlen;
|
||||
}
|
||||
|
||||
return(SUCCESS);
|
||||
}
|
||||
|
||||
/*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/
|
||||
static HashReturn Final(hashState *state, BitSequence *hashval)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
if ( (state->databitlen & 0x1ff) == 0 ) {
|
||||
/*pad the message when databitlen is multiple of 512 bits, then process the padded block*/
|
||||
memset(state->buffer, 0, 64);
|
||||
state->buffer[0] = 0x80;
|
||||
state->buffer[63] = state->databitlen & 0xff;
|
||||
state->buffer[62] = (state->databitlen >> 8) & 0xff;
|
||||
state->buffer[61] = (state->databitlen >> 16) & 0xff;
|
||||
state->buffer[60] = (state->databitlen >> 24) & 0xff;
|
||||
state->buffer[59] = (state->databitlen >> 32) & 0xff;
|
||||
state->buffer[58] = (state->databitlen >> 40) & 0xff;
|
||||
state->buffer[57] = (state->databitlen >> 48) & 0xff;
|
||||
state->buffer[56] = (state->databitlen >> 56) & 0xff;
|
||||
F8(state);
|
||||
}
|
||||
else {
|
||||
/*set the rest of the bytes in the buffer to 0*/
|
||||
if ( (state->datasize_in_buffer & 7) == 0)
|
||||
for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0;
|
||||
else
|
||||
for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) state->buffer[i] = 0;
|
||||
|
||||
/*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/
|
||||
state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7));
|
||||
|
||||
F8(state);
|
||||
memset(state->buffer, 0, 64);
|
||||
state->buffer[63] = state->databitlen & 0xff;
|
||||
state->buffer[62] = (state->databitlen >> 8) & 0xff;
|
||||
state->buffer[61] = (state->databitlen >> 16) & 0xff;
|
||||
state->buffer[60] = (state->databitlen >> 24) & 0xff;
|
||||
state->buffer[59] = (state->databitlen >> 32) & 0xff;
|
||||
state->buffer[58] = (state->databitlen >> 40) & 0xff;
|
||||
state->buffer[57] = (state->databitlen >> 48) & 0xff;
|
||||
state->buffer[56] = (state->databitlen >> 56) & 0xff;
|
||||
F8(state);
|
||||
}
|
||||
|
||||
/*truncating the final hash value to generate the message digest*/
|
||||
switch(state->hashbitlen) {
|
||||
case 224: memcpy(hashval,(unsigned char*)state->x+64+36,28); break;
|
||||
case 256: memcpy(hashval,(unsigned char*)state->x+64+32,32); break;
|
||||
case 384: memcpy(hashval,(unsigned char*)state->x+64+16,48); break;
|
||||
case 512: memcpy(hashval,(unsigned char*)state->x+64,64); break;
|
||||
}
|
||||
|
||||
return(SUCCESS);
|
||||
}
|
||||
|
||||
/* hash a message,
|
||||
three inputs: message digest size in bits (hashbitlen); message (data); message length in bits (databitlen)
|
||||
one output: message digest (hashval)
|
||||
*/
|
||||
HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval)
|
||||
{
|
||||
hashState state;
|
||||
|
||||
if ( hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512 ) {
|
||||
Init(&state, hashbitlen);
|
||||
Update(&state, data, databitlen);
|
||||
Final(&state, hashval);
|
||||
return SUCCESS;
|
||||
}
|
||||
else
|
||||
return(BAD_HASHLEN);
|
||||
}
|
||||
19
src/crypto/cn/c_jh.h
Normal file
19
src/crypto/cn/c_jh.h
Normal file
@@ -0,0 +1,19 @@
|
||||
/*This program gives the 64-bit optimized bitslice implementation of JH using ANSI C
|
||||
|
||||
--------------------------------
|
||||
Performance
|
||||
|
||||
Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz)
|
||||
Operating System: 64-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic)
|
||||
Speed for long message:
|
||||
1) 45.8 cycles/byte compiler: Intel C++ Compiler 11.1 compilation option: icc -O2
|
||||
2) 56.8 cycles/byte compiler: gcc 4.4.3 compilation option: gcc -O3
|
||||
|
||||
--------------------------------
|
||||
Last Modified: January 16, 2011
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "hash.h"
|
||||
|
||||
/*
 * Hash a message with JH in a single call.
 *
 *   hashbitlen : message digest size in bits
 *   data       : the message
 *   databitlen : message length in bits
 *   hashval    : output buffer that receives the message digest
 */
HashReturn jh_hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
|
||||
701
src/crypto/cn/c_skein.c
Normal file
701
src/crypto/cn/c_skein.c
Normal file
@@ -0,0 +1,701 @@
|
||||
/***********************************************************************
|
||||
**
|
||||
** Implementation of the Skein hash function.
|
||||
**
|
||||
** Source code author: Doug Whiting, 2008.
|
||||
**
|
||||
** This algorithm and source code is released to the public domain.
|
||||
**
|
||||
************************************************************************/
|
||||
|
||||
#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */
|
||||
|
||||
#include <stddef.h> /* get size_t definition */
|
||||
#include <string.h> /* get the memcpy/memset functions */
|
||||
#include "c_skein.h" /* get the Skein API definitions */
|
||||
|
||||
/* largest hash size, in bits; may be overridden before this point */
#if !defined(SKEIN_512_NIST_MAX_HASHBITS)
#define SKEIN_512_NIST_MAX_HASHBITS (512)
#endif

/* number of modifier (tweak) words */
#define SKEIN_MODIFIER_WORDS   (2)

/* state sizes in 64-bit words */
#define SKEIN_512_STATE_WORDS  (8)
#define SKEIN_MAX_STATE_WORDS  (16)

/* Skein-512 state and block sizes derived from the word count */
#define SKEIN_512_STATE_BYTES  (8*SKEIN_512_STATE_WORDS)
#define SKEIN_512_STATE_BITS   (64*SKEIN_512_STATE_WORDS)
#define SKEIN_512_BLOCK_BYTES  (8*SKEIN_512_STATE_WORDS)

/* special round identifiers, numbered upwards from SKEIN_RND_SPECIAL */
#define SKEIN_RND_SPECIAL      (1000u)
#define SKEIN_RND_KEY_INITIAL  (SKEIN_RND_SPECIAL+0u)
#define SKEIN_RND_KEY_INJECT   (SKEIN_RND_SPECIAL+1u)
#define SKEIN_RND_FEED_FWD     (SKEIN_RND_SPECIAL+2u)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
size_t hashBitLen; /* size of hash result, in bits */
|
||||
size_t bCnt; /* current byte count in buffer b[] */
|
||||
u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */
|
||||
} Skein_Ctxt_Hdr_t;
|
||||
|
||||
typedef struct /* 512-bit Skein hash context structure */
|
||||
{
|
||||
Skein_Ctxt_Hdr_t h; /* common header context variables */
|
||||
u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */
|
||||
u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
|
||||
} Skein_512_Ctxt_t;
|
||||
|
||||
/* Skein APIs for (incremental) "straight hashing" */
|
||||
static int Skein_512_Init (Skein_512_Ctxt_t *ctx, size_t hashBitLen);
|
||||
static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
|
||||
static int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
|
||||
|
||||
/* defaults to 1 unless the build already defines it */
#if !defined(SKEIN_TREE_HASH)
#define SKEIN_TREE_HASH (1)
#endif
|
||||
|
||||
/*****************************************************************
|
||||
** "Internal" Skein definitions
|
||||
** -- not needed for sequential hashing API, but will be
|
||||
** helpful for other uses of Skein (e.g., tree hash mode).
|
||||
** -- included here so that they can be shared between
|
||||
** reference and optimized code.
|
||||
******************************************************************/
|
||||
|
||||
/*
 * Tweak word T[1]: bit field starting positions.
 * Positions are written as bit numbers of the 128-bit tweak; T[1] is the
 * second 64-bit word, hence the offset of 64.
 */
#define SKEIN_T1_BIT(BIT)            ((BIT) - 64)

/* bits 112..118: level in hash tree */
#define SKEIN_T1_POS_TREE_LVL        SKEIN_T1_BIT(112)
/* bit 119: partial final input byte */
#define SKEIN_T1_POS_BIT_PAD         SKEIN_T1_BIT(119)
/* bits 120..125: type field */
#define SKEIN_T1_POS_BLK_TYPE        SKEIN_T1_BIT(120)
/* bit 126: first block flag */
#define SKEIN_T1_POS_FIRST           SKEIN_T1_BIT(126)
/* bit 127: final block flag */
#define SKEIN_T1_POS_FINAL           SKEIN_T1_BIT(127)

/* tweak word T[1]: flag bit definition(s) */
#define SKEIN_T1_FLAG_FIRST          (((u64b_t)1) << SKEIN_T1_POS_FIRST)
#define SKEIN_T1_FLAG_FINAL          (((u64b_t)1) << SKEIN_T1_POS_FINAL)
#define SKEIN_T1_FLAG_BIT_PAD        (((u64b_t)1) << SKEIN_T1_POS_BIT_PAD)

/* tweak word T[1]: tree level bit field mask */
#define SKEIN_T1_TREE_LVL_MASK       (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
#define SKEIN_T1_TREE_LEVEL(n)       (((u64b_t)(n)) << SKEIN_T1_POS_TREE_LVL)

/* tweak word T[1]: block type field values */
#define SKEIN_BLK_TYPE_KEY           (0)     /* key, for MAC and KDF */
#define SKEIN_BLK_TYPE_CFG           (4)     /* configuration block */
#define SKEIN_BLK_TYPE_PERS          (8)     /* personalization string */
#define SKEIN_BLK_TYPE_PK            (12)    /* public key (for digital signature hashing) */
#define SKEIN_BLK_TYPE_KDF           (16)    /* key identifier for KDF */
#define SKEIN_BLK_TYPE_NONCE         (20)    /* nonce for PRNG */
#define SKEIN_BLK_TYPE_MSG           (48)    /* message processing */
#define SKEIN_BLK_TYPE_OUT           (63)    /* output stage */
#define SKEIN_BLK_TYPE_MASK          (63)    /* bit field mask */

/* block type values shifted into their position within T[1] */
#define SKEIN_T1_BLK_TYPE(T)         (((u64b_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
#define SKEIN_T1_BLK_TYPE_KEY        SKEIN_T1_BLK_TYPE(KEY)     /* key, for MAC and KDF */
#define SKEIN_T1_BLK_TYPE_CFG        SKEIN_T1_BLK_TYPE(CFG)     /* configuration block */
#define SKEIN_T1_BLK_TYPE_PERS       SKEIN_T1_BLK_TYPE(PERS)    /* personalization string */
#define SKEIN_T1_BLK_TYPE_PK         SKEIN_T1_BLK_TYPE(PK)      /* public key (for digital signature hashing) */
#define SKEIN_T1_BLK_TYPE_KDF        SKEIN_T1_BLK_TYPE(KDF)     /* key identifier for KDF */
#define SKEIN_T1_BLK_TYPE_NONCE      SKEIN_T1_BLK_TYPE(NONCE)   /* nonce for PRNG */
#define SKEIN_T1_BLK_TYPE_MSG        SKEIN_T1_BLK_TYPE(MSG)     /* message processing */
#define SKEIN_T1_BLK_TYPE_OUT        SKEIN_T1_BLK_TYPE(OUT)     /* output stage */
#define SKEIN_T1_BLK_TYPE_MASK       SKEIN_T1_BLK_TYPE(MASK)    /* field bit mask */

/* block type combined with the final block flag */
#define SKEIN_T1_BLK_TYPE_CFG_FINAL  (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
#define SKEIN_T1_BLK_TYPE_OUT_FINAL  (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
|
||||
|
||||
#define SKEIN_VERSION                   (1)

/* allow compile-time personalization */
#if !defined(SKEIN_ID_STRING_LE)
#define SKEIN_ID_STRING_LE              (0x33414853)    /* "SHA3" (little-endian) */
#endif

/* build a 64-bit value from its high and low 32-bit halves */
#define SKEIN_MK_64(hi32,lo32)          ((((u64b_t)(hi32)) << 32) + (lo32))
#define SKEIN_SCHEMA_VER                SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)
#define SKEIN_KS_PARITY                 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)

#define SKEIN_CFG_STR_LEN               (4*8)

/* bit field definitions in config block treeInfo word: positions */
#define SKEIN_CFG_TREE_LEAF_SIZE_POS    (0)
#define SKEIN_CFG_TREE_NODE_SIZE_POS    (8)
#define SKEIN_CFG_TREE_MAX_LEVEL_POS    (16)

/* bit field definitions in config block treeInfo word: 8-bit masks */
#define SKEIN_CFG_TREE_LEAF_SIZE_MSK    (((u64b_t)0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
#define SKEIN_CFG_TREE_NODE_SIZE_MSK    (((u64b_t)0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
#define SKEIN_CFG_TREE_MAX_LEVEL_MSK    (((u64b_t)0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)

/* pack leaf size, node size and maximum level into one treeInfo word */
#define SKEIN_CFG_TREE_INFO(leaf,node,maxLvl)                       \
    ( (((u64b_t)(leaf))   << SKEIN_CFG_TREE_LEAF_SIZE_POS) |        \
      (((u64b_t)(node))   << SKEIN_CFG_TREE_NODE_SIZE_POS) |        \
      (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) )

/* use as treeInfo in InitExt() call for sequential processing */
#define SKEIN_CFG_TREE_INFO_SEQUENTIAL  SKEIN_CFG_TREE_INFO(0,0,0)
|
||||
|
||||
/*
** Skein macros for getting/setting tweak words, etc.
** These are useful for partial input bytes, hash tree init/update, etc.
** The setters expand to brace-enclosed blocks, not expressions.
**/

/* read / write tweak word number twkIdx of the context pointed to by pCtx */
#define Skein_Get_Tweak(pCtx,twkIdx)        ((pCtx)->h.T[twkIdx])
#define Skein_Set_Tweak(pCtx,twkIdx,val)    {(pCtx)->h.T[twkIdx] = (val);}

/* shorthands for the individual tweak words */
#define Skein_Get_T0(pCtx)                  Skein_Get_Tweak(pCtx,0)
#define Skein_Get_T1(pCtx)                  Skein_Get_Tweak(pCtx,1)
#define Skein_Set_T0(pCtx,val)              Skein_Set_Tweak(pCtx,0,val)
#define Skein_Set_T1(pCtx,val)              Skein_Set_Tweak(pCtx,1,val)

/* set both tweak words at once */
#define Skein_Set_T0_T1(pCtx,val0,val1)     \
    {                                       \
    Skein_Set_T0(pCtx,(val0));              \
    Skein_Set_T1(pCtx,(val1));              \
    }

/* store the block type constant SKEIN_T1_BLK_TYPE_<blkType> in T[1] */
#define Skein_Set_Type(pCtx,blkType)        \
    Skein_Set_T1(pCtx,SKEIN_T1_BLK_TYPE_##blkType)

/* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */
#define Skein_Start_New_Type(pCtx,blkType)  \
    { Skein_Set_T0_T1(pCtx,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##blkType); (pCtx)->h.bCnt=0; }

/* flag helpers that operate directly on a header object (not a pointer) */
#define Skein_Clear_First_Flag(hdrObj)      { (hdrObj).T[1] &= ~SKEIN_T1_FLAG_FIRST; }
#define Skein_Set_Bit_Pad_Flag(hdrObj)      { (hdrObj).T[1] |= SKEIN_T1_FLAG_BIT_PAD; }

/* OR the given tree level into T[1] of a header object */
#define Skein_Set_Tree_Level(hdrObj,lvl)    { (hdrObj).T[1] |= SKEIN_T1_TREE_LEVEL(lvl);}
|
||||
|
||||
/*****************************************************************
|
||||
** "Internal" Skein definitions for debugging and error checking
|
||||
******************************************************************/
|
||||
/* debug tracing hooks: all of them expand to nothing here */
#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr)
#define Skein_Show_Round(bits,ctx,r,X)
#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr)
#define Skein_Show_Final(bits,ctx,cnt,outPtr)
#define Skein_Show_Key(bits,ctx,key,keyBytes)

/* run-time checks (e.g., bad params, uninitialized context)? */
#if defined(SKEIN_ERR_CHECK)

#include <assert.h>
#define Skein_assert(x)           assert(x)                        /* internal error */

#if defined(SKEIN_ASSERT)
#define Skein_Assert(x,retCode)   assert(x)
#else
#define Skein_Assert(x,retCode)   { if (!(x)) return retCode; }    /* caller error */
#endif

#else

/* default: ignore all Asserts, for performance */
#define Skein_Assert(x,retCode)
#define Skein_assert(x)

#endif
|
||||
|
||||
/*****************************************************************
** Skein block function constants (shared across Ref and Opt code)
******************************************************************/
/* Skein_512 rotation amounts, named R_512_<round mod 8>_<word pair> */
enum
    {
    R_512_0_0 = 46,  R_512_0_1 = 36,  R_512_0_2 = 19,  R_512_0_3 = 37,
    R_512_1_0 = 33,  R_512_1_1 = 27,  R_512_1_2 = 14,  R_512_1_3 = 42,
    R_512_2_0 = 17,  R_512_2_1 = 49,  R_512_2_2 = 36,  R_512_2_3 = 39,
    R_512_3_0 = 44,  R_512_3_1 =  9,  R_512_3_2 = 54,  R_512_3_3 = 56,
    R_512_4_0 = 39,  R_512_4_1 = 30,  R_512_4_2 = 34,  R_512_4_3 = 24,
    R_512_5_0 = 13,  R_512_5_1 = 50,  R_512_5_2 = 10,  R_512_5_3 = 17,
    R_512_6_0 = 25,  R_512_6_1 = 29,  R_512_6_2 = 39,  R_512_6_3 = 43,
    R_512_7_0 =  8,  R_512_7_1 = 35,  R_512_7_2 = 56,  R_512_7_3 = 22
    };
|
||||
|
||||
#ifdef SKEIN_ROUNDS
/* command-line override: the tens digit of SKEIN_ROUNDS selects a count in the range 8*(5..14) */
#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5))
#else
/* default number of Skein_512 rounds */
#define SKEIN_512_ROUNDS_TOTAL (72)
#endif
|
||||
|
||||
|
||||
/*
|
||||
***************** Pre-computed Skein IVs *******************
|
||||
**
|
||||
** NOTE: these values are not "magic" constants, but
|
||||
** are generated using the Threefish block function.
|
||||
** They are pre-computed here only for speed; i.e., to
|
||||
** avoid the need for a Threefish call during Init().
|
||||
**
|
||||
** The IV for any fixed hash length may be pre-computed.
|
||||
** Only the most common values are included here.
|
||||
**
|
||||
************************************************************
|
||||
**/
|
||||
|
||||
#define MK_64 SKEIN_MK_64
|
||||
|
||||
/* blkSize = 512 bits. hashSize = 256 bits */
|
||||
const u64b_t SKEIN_512_IV_256[] =
|
||||
{
|
||||
MK_64(0xCCD044A1,0x2FDB3E13),
|
||||
MK_64(0xE8359030,0x1A79A9EB),
|
||||
MK_64(0x55AEA061,0x4F816E6F),
|
||||
MK_64(0x2A2767A4,0xAE9B94DB),
|
||||
MK_64(0xEC06025E,0x74DD7683),
|
||||
MK_64(0xE7A436CD,0xC4746251),
|
||||
MK_64(0xC36FBAF9,0x393AD185),
|
||||
MK_64(0x3EEDBA18,0x33EDFC13)
|
||||
};
|
||||
|
||||
#if !defined(SKEIN_USE_ASM)
#define SKEIN_USE_ASM   (0)                     /* default is all C code (no ASM) */
#endif

#if !defined(SKEIN_LOOP)
#define SKEIN_LOOP 001                          /* default: unroll 256 and 512, but not 1024 */
#endif

/* Shorthand used inside the block function. The local array kw[] holds the
** tweak words first (from index KW_TWK_BASE) and the key schedule words
** after them (from index KW_KEY_BASE). */
#define BLK_BITS        (WCNT*64)
#define KW_TWK_BASE     (0)
#define KW_KEY_BASE     (3)
#define ks              (kw + KW_KEY_BASE)
#define ts              (kw + KW_TWK_BASE)

#ifdef SKEIN_DEBUG
/* debug builds copy the working tweak back into the context so the trace hooks can read it */
#define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; }
#else
#define DebugSaveTweak(ctx)
#endif
|
||||
|
||||
/***************************** Skein_512 ******************************/
|
||||
#if !(SKEIN_USE_ASM & 512)
|
||||
static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
|
||||
{ /* do it in C */
|
||||
enum
|
||||
{
|
||||
WCNT = SKEIN_512_STATE_WORDS
|
||||
};
|
||||
#undef RCNT
|
||||
#define RCNT (SKEIN_512_ROUNDS_TOTAL/8)
|
||||
|
||||
#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
|
||||
#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10)
|
||||
#else
|
||||
#define SKEIN_UNROLL_512 (0)
|
||||
#endif
|
||||
|
||||
#if SKEIN_UNROLL_512
|
||||
#if (RCNT % SKEIN_UNROLL_512)
|
||||
#error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */
|
||||
#endif
|
||||
size_t r;
|
||||
u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/
|
||||
#else
|
||||
u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */
|
||||
#endif
|
||||
u64b_t X0,X1,X2,X3,X4,X5,X6,X7; /* local copy of vars, for speed */
|
||||
u64b_t w [WCNT]; /* local copy of input block */
|
||||
#ifdef SKEIN_DEBUG
|
||||
const u64b_t *Xptr[8]; /* use for debugging (help compiler put Xn in registers) */
|
||||
Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3;
|
||||
Xptr[4] = &X4; Xptr[5] = &X5; Xptr[6] = &X6; Xptr[7] = &X7;
|
||||
#endif
|
||||
|
||||
Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
|
||||
ts[0] = ctx->h.T[0];
|
||||
ts[1] = ctx->h.T[1];
|
||||
do {
|
||||
/* this implementation only supports 2**64 input bytes (no carry out here) */
|
||||
ts[0] += byteCntAdd; /* update processed length */
|
||||
|
||||
/* precompute the key schedule for this block */
|
||||
ks[0] = ctx->X[0];
|
||||
ks[1] = ctx->X[1];
|
||||
ks[2] = ctx->X[2];
|
||||
ks[3] = ctx->X[3];
|
||||
ks[4] = ctx->X[4];
|
||||
ks[5] = ctx->X[5];
|
||||
ks[6] = ctx->X[6];
|
||||
ks[7] = ctx->X[7];
|
||||
ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
|
||||
ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
|
||||
|
||||
ts[2] = ts[0] ^ ts[1];
|
||||
|
||||
Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
|
||||
DebugSaveTweak(ctx);
|
||||
Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
|
||||
|
||||
X0 = w[0] + ks[0]; /* do the first full key injection */
|
||||
X1 = w[1] + ks[1];
|
||||
X2 = w[2] + ks[2];
|
||||
X3 = w[3] + ks[3];
|
||||
X4 = w[4] + ks[4];
|
||||
X5 = w[5] + ks[5] + ts[0];
|
||||
X6 = w[6] + ks[6] + ts[1];
|
||||
X7 = w[7] + ks[7];
|
||||
|
||||
blkPtr += SKEIN_512_BLOCK_BYTES;
|
||||
|
||||
Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
|
||||
/* run the rounds */
|
||||
#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \
|
||||
X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
|
||||
X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
|
||||
X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
|
||||
X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
|
||||
|
||||
#if SKEIN_UNROLL_512 == 0
|
||||
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* unrolled */ \
|
||||
Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \
|
||||
Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
|
||||
|
||||
#define I512(R) \
|
||||
X0 += ks[((R)+1) % 9]; /* inject the key schedule value */ \
|
||||
X1 += ks[((R)+2) % 9]; \
|
||||
X2 += ks[((R)+3) % 9]; \
|
||||
X3 += ks[((R)+4) % 9]; \
|
||||
X4 += ks[((R)+5) % 9]; \
|
||||
X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \
|
||||
X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \
|
||||
X7 += ks[((R)+8) % 9] + (R)+1; \
|
||||
Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
|
||||
#else /* looping version */
|
||||
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \
|
||||
Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \
|
||||
Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);
|
||||
|
||||
#define I512(R) \
|
||||
X0 += ks[r+(R)+0]; /* inject the key schedule value */ \
|
||||
X1 += ks[r+(R)+1]; \
|
||||
X2 += ks[r+(R)+2]; \
|
||||
X3 += ks[r+(R)+3]; \
|
||||
X4 += ks[r+(R)+4]; \
|
||||
X5 += ks[r+(R)+5] + ts[r+(R)+0]; \
|
||||
X6 += ks[r+(R)+6] + ts[r+(R)+1]; \
|
||||
X7 += ks[r+(R)+7] + r+(R) ; \
|
||||
ks[r + (R)+8] = ks[r+(R)-1]; /* rotate key schedule */ \
|
||||
ts[r + (R)+2] = ts[r+(R)-1]; \
|
||||
Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
|
||||
|
||||
for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512) /* loop thru it */
|
||||
#endif /* end of looped code definitions */
|
||||
{
|
||||
#define R512_8_rounds(R) /* do 8 full rounds */ \
|
||||
R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \
|
||||
R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \
|
||||
R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \
|
||||
R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \
|
||||
I512(2*(R)); \
|
||||
R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \
|
||||
R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \
|
||||
R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \
|
||||
R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \
|
||||
I512(2*(R)+1); /* and key injection */
|
||||
|
||||
R512_8_rounds( 0);
|
||||
|
||||
#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN)))
|
||||
|
||||
#if R512_Unroll_R( 1)
|
||||
R512_8_rounds( 1);
|
||||
#endif
|
||||
#if R512_Unroll_R( 2)
|
||||
R512_8_rounds( 2);
|
||||
#endif
|
||||
#if R512_Unroll_R( 3)
|
||||
R512_8_rounds( 3);
|
||||
#endif
|
||||
#if R512_Unroll_R( 4)
|
||||
R512_8_rounds( 4);
|
||||
#endif
|
||||
#if R512_Unroll_R( 5)
|
||||
R512_8_rounds( 5);
|
||||
#endif
|
||||
#if R512_Unroll_R( 6)
|
||||
R512_8_rounds( 6);
|
||||
#endif
|
||||
#if R512_Unroll_R( 7)
|
||||
R512_8_rounds( 7);
|
||||
#endif
|
||||
#if R512_Unroll_R( 8)
|
||||
R512_8_rounds( 8);
|
||||
#endif
|
||||
#if R512_Unroll_R( 9)
|
||||
R512_8_rounds( 9);
|
||||
#endif
|
||||
#if R512_Unroll_R(10)
|
||||
R512_8_rounds(10);
|
||||
#endif
|
||||
#if R512_Unroll_R(11)
|
||||
R512_8_rounds(11);
|
||||
#endif
|
||||
#if R512_Unroll_R(12)
|
||||
R512_8_rounds(12);
|
||||
#endif
|
||||
#if R512_Unroll_R(13)
|
||||
R512_8_rounds(13);
|
||||
#endif
|
||||
#if R512_Unroll_R(14)
|
||||
R512_8_rounds(14);
|
||||
#endif
|
||||
#if (SKEIN_UNROLL_512 > 14)
|
||||
#error "need more unrolling in Skein_512_Process_Block"
|
||||
#endif
|
||||
}
|
||||
|
||||
/* do the final "feedforward" xor, update context chaining vars */
|
||||
ctx->X[0] = X0 ^ w[0];
|
||||
ctx->X[1] = X1 ^ w[1];
|
||||
ctx->X[2] = X2 ^ w[2];
|
||||
ctx->X[3] = X3 ^ w[3];
|
||||
ctx->X[4] = X4 ^ w[4];
|
||||
ctx->X[5] = X5 ^ w[5];
|
||||
ctx->X[6] = X6 ^ w[6];
|
||||
ctx->X[7] = X7 ^ w[7];
|
||||
Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
|
||||
|
||||
ts[1] &= ~SKEIN_T1_FLAG_FIRST;
|
||||
}
|
||||
while (--blkCnt);
|
||||
ctx->h.T[0] = ts[0];
|
||||
ctx->h.T[1] = ts[1];
|
||||
}
|
||||
#endif
|
||||
|
||||
/*****************************************************************/
|
||||
/* 512-bit Skein */
|
||||
/*****************************************************************/
|
||||
|
||||
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
|
||||
/* init the context for a straight hashing operation */
|
||||
static int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
|
||||
{
|
||||
union
|
||||
{
|
||||
u08b_t b[SKEIN_512_STATE_BYTES];
|
||||
u64b_t w[SKEIN_512_STATE_WORDS];
|
||||
} cfg; /* config block */
|
||||
|
||||
Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
|
||||
ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
|
||||
|
||||
switch (hashBitLen)
|
||||
{ /* use pre-computed values, where available */
|
||||
#ifndef SKEIN_NO_PRECOMP
|
||||
case 256: memcpy(ctx->X,SKEIN_512_IV_256,sizeof(ctx->X)); break;
|
||||
#endif
|
||||
default:
|
||||
/* here if there is no precomputed IV value available */
|
||||
/* build/process the config block, type == CONFIG (could be precomputed) */
|
||||
Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */
|
||||
|
||||
cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */
|
||||
cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */
|
||||
cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
|
||||
memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */
|
||||
|
||||
/* compute the initial chaining values from config block */
|
||||
memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */
|
||||
Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
|
||||
break;
|
||||
}
|
||||
|
||||
/* The chaining vars ctx->X are now initialized for the given hashBitLen. */
|
||||
/* Set up to process the data message portion of the hash (default) */
|
||||
Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */
|
||||
|
||||
return SKEIN_SUCCESS;
|
||||
}
|
||||
|
||||
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
|
||||
/* process the input bytes */
|
||||
static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
|
||||
{
|
||||
size_t n;
|
||||
|
||||
Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */
|
||||
|
||||
/* process full blocks, if any */
|
||||
if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
|
||||
{
|
||||
if (ctx->h.bCnt) /* finish up any buffered message data */
|
||||
{
|
||||
n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */
|
||||
if (n)
|
||||
{
|
||||
Skein_assert(n < msgByteCnt); /* check on our logic here */
|
||||
memcpy(&ctx->b[ctx->h.bCnt],msg,n);
|
||||
msgByteCnt -= n;
|
||||
msg += n;
|
||||
ctx->h.bCnt += n;
|
||||
}
|
||||
Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
|
||||
Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
|
||||
ctx->h.bCnt = 0;
|
||||
}
|
||||
/* now process any remaining full blocks, directly from input message data */
|
||||
if (msgByteCnt > SKEIN_512_BLOCK_BYTES)
|
||||
{
|
||||
n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; /* number of full blocks to process */
|
||||
Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
|
||||
msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
|
||||
msg += n * SKEIN_512_BLOCK_BYTES;
|
||||
}
|
||||
Skein_assert(ctx->h.bCnt == 0);
|
||||
}
|
||||
|
||||
/* copy any remaining source message data bytes into b[] */
|
||||
if (msgByteCnt)
|
||||
{
|
||||
Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
|
||||
memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
|
||||
ctx->h.bCnt += msgByteCnt;
|
||||
}
|
||||
|
||||
return SKEIN_SUCCESS;
|
||||
}
|
||||
|
||||
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
|
||||
/* finalize the hash computation and output the result */
|
||||
static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
|
||||
{
|
||||
size_t i,n,byteCnt;
|
||||
u64b_t X[SKEIN_512_STATE_WORDS];
|
||||
Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */
|
||||
|
||||
ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
|
||||
if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */
|
||||
memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
|
||||
|
||||
Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */
|
||||
|
||||
/* now output the result */
|
||||
byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */
|
||||
|
||||
/* run Threefish in "counter mode" to generate output */
|
||||
memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
|
||||
memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */
|
||||
for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)
|
||||
{
|
||||
((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
|
||||
Skein_Start_New_Type(ctx,OUT_FINAL);
|
||||
Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
|
||||
n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */
|
||||
if (n >= SKEIN_512_BLOCK_BYTES)
|
||||
n = SKEIN_512_BLOCK_BYTES;
|
||||
Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */
|
||||
Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES);
|
||||
memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */
|
||||
}
|
||||
return SKEIN_SUCCESS;
|
||||
}
|
||||
|
||||
#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
/* Distance in bytes between the address of Skein_512_Init and the address of
** this function (only built for code-size / performance instrumentation). */
static size_t Skein_512_API_CodeSize(void)
    {
    const u08b_t *firstFn = (u08b_t *) Skein_512_Init;
    const u08b_t *lastFn  = (u08b_t *) Skein_512_API_CodeSize;

    return lastFn - firstFn;
    }
#endif
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint_t statebits; /* 256, 512, or 1024 */
|
||||
union
|
||||
{
|
||||
Skein_Ctxt_Hdr_t h; /* common header "overlay" */
|
||||
Skein_512_Ctxt_t ctx_512;
|
||||
} u;
|
||||
}
|
||||
hashState;
|
||||
|
||||
/* "incremental" hashing API */
|
||||
static SkeinHashReturn Init (hashState *state, int hashbitlen);
|
||||
static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen);
|
||||
static SkeinHashReturn Final (hashState *state, SkeinBitSequence *hashval);
|
||||
|
||||
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
|
||||
/* select the context size and init the context */
|
||||
static SkeinHashReturn Init(hashState *state, int hashbitlen)
|
||||
{
|
||||
state->statebits = 64*SKEIN_512_STATE_WORDS;
|
||||
return Skein_512_Init(&state->u.ctx_512,(size_t) hashbitlen);
|
||||
}
|
||||
|
||||
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
|
||||
/* process data to be hashed */
|
||||
static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen)
|
||||
{
|
||||
/* only the final Update() call is allowed do partial bytes, else assert an error */
|
||||
Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, SKEIN_FAIL);
|
||||
|
||||
Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,SKEIN_FAIL);
|
||||
if ((databitlen & 7) == 0) /* partial bytes? */
|
||||
{
|
||||
return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3);
|
||||
}
|
||||
else
|
||||
{ /* handle partial final byte */
|
||||
size_t bCnt = (databitlen >> 3) + 1; /* number of bytes to handle (nonzero here!) */
|
||||
u08b_t b,mask;
|
||||
|
||||
mask = (u08b_t) (1u << (7 - (databitlen & 7))); /* partial byte bit mask */
|
||||
b = (u08b_t) ((data[bCnt-1] & (0-mask)) | mask); /* apply bit padding on final byte */
|
||||
|
||||
Skein_512_Update(&state->u.ctx_512,data,bCnt-1); /* process all but the final byte */
|
||||
Skein_512_Update(&state->u.ctx_512,&b , 1 ); /* process the (masked) partial byte */
|
||||
Skein_Set_Bit_Pad_Flag(state->u.h); /* set tweak flag for the final call */
|
||||
|
||||
return SKEIN_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
|
||||
/* finalize hash computation and output the result (hashbitlen bits) */
|
||||
static SkeinHashReturn Final(hashState *state, SkeinBitSequence *hashval)
|
||||
{
|
||||
Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL);
|
||||
return Skein_512_Final(&state->u.ctx_512,hashval);
|
||||
}
|
||||
|
||||
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
|
||||
/* all-in-one hash function */
|
||||
SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence *data, /* all-in-one call */
|
||||
SkeinDataLength databitlen,SkeinBitSequence *hashval)
|
||||
{
|
||||
hashState state;
|
||||
SkeinHashReturn r = Init(&state,hashbitlen);
|
||||
if (r == SKEIN_SUCCESS)
|
||||
{ /* these calls do not fail when called properly */
|
||||
r = Update(&state,data,databitlen);
|
||||
Final(&state,hashval);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
/* Fixed-size Skein-512-256 for CryptoNight: hashes exactly XMR_DATABITLEN bits
** (200 bytes) from data and writes a XMR_HASHBITLEN-bit (32-byte) digest to
** hashval. The context is set up inline from the pre-computed IV rather than
** through Skein_512_Init. */
void xmr_skein(const SkeinBitSequence *data, SkeinBitSequence *hashval){
    #define XMR_HASHBITLEN 256
    #define XMR_DATABITLEN 1600

    /* The input length is a compile-time constant and byte-aligned, so the
    ** partial-byte path of the generic Update() can never run here and has
    ** been dropped; this guard fails the build if that ever stops being true. */
    #if (XMR_DATABITLEN & 7) != 0
    #error "xmr_skein only supports byte-aligned input lengths"
    #endif

    // Init
    hashState state;
    Skein_512_Ctxt_t* ctx = &(state.u.ctx_512);

    state.statebits = 64*SKEIN_512_STATE_WORDS;

    // Equivalent of Skein_512_Init(ctx, XMR_HASHBITLEN) on the pre-computed IV path
    ctx->h.hashBitLen = XMR_HASHBITLEN;
    memcpy(ctx->X,SKEIN_512_IV_256,sizeof(ctx->X));
    Skein_Start_New_Type(ctx,MSG);

    // Update
    Skein_512_Update(ctx,data,XMR_DATABITLEN >> 3);

    // Finalize
    Skein_512_Final(ctx, hashval);
}
|
||||
49
src/crypto/cn/c_skein.h
Normal file
49
src/crypto/cn/c_skein.h
Normal file
@@ -0,0 +1,49 @@
|
||||
#ifndef _SKEIN_H_
|
||||
#define _SKEIN_H_ 1
|
||||
/**************************************************************************
|
||||
**
|
||||
** Interface declarations and internal definitions for Skein hashing.
|
||||
**
|
||||
** Source code author: Doug Whiting, 2008.
|
||||
**
|
||||
** This algorithm and source code is released to the public domain.
|
||||
**
|
||||
***************************************************************************
|
||||
**
|
||||
** The following compile-time switches may be defined to control some
|
||||
** tradeoffs between speed, code size, error checking, and security.
|
||||
**
|
||||
** The "default" note explains what happens when the switch is not defined.
|
||||
**
|
||||
** SKEIN_DEBUG -- make callouts from inside Skein code
|
||||
** to examine/display intermediate values.
|
||||
** [default: no callouts (no overhead)]
|
||||
**
|
||||
**  SKEIN_ERR_CHECK        -- how error checking is handled inside Skein
**                            code. If not defined, most error checking
**                            is disabled (for performance). If defined,
**                            the behavior is selected by SKEIN_ASSERT:
**                              SKEIN_ASSERT defined:     use assert() to flag errors
**                              SKEIN_ASSERT not defined: return an error code (e.g. SKEIN_FAIL) to flag errors
|
||||
**
|
||||
***************************************************************************/
|
||||
#include "skein_port.h" /* get platform-specific definitions */
|
||||
|
||||
typedef enum
|
||||
{
|
||||
SKEIN_SUCCESS = 0, /* return codes from Skein calls */
|
||||
SKEIN_FAIL = 1,
|
||||
SKEIN_BAD_HASHLEN = 2
|
||||
}
|
||||
SkeinHashReturn;
|
||||
|
||||
typedef size_t SkeinDataLength; /* bit count type */
|
||||
typedef u08b_t SkeinBitSequence; /* bit stream type */
|
||||
|
||||
/* "all-in-one" call */
|
||||
SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence *data,
|
||||
SkeinDataLength databitlen, SkeinBitSequence *hashval);
|
||||
|
||||
void xmr_skein(const SkeinBitSequence *data, SkeinBitSequence *hashval);
|
||||
|
||||
#endif /* ifndef _SKEIN_H_ */
|
||||
240
src/crypto/cn/gpu/cn_gpu_arm.cpp
Normal file
240
src/crypto/cn/gpu/cn_gpu_arm.cpp
Normal file
@@ -0,0 +1,240 @@
|
||||
/* XMRig
|
||||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
* Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright 2016-2019 XMRig <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
#include "crypto/CryptoNight_constants.h"
|
||||
|
||||
|
||||
/// Bitwise-AND every 32-bit lane of @p v (treated as raw bits) with @p v2, in place.
///
/// Uses the vreinterpretq intrinsics rather than C-style vector casts: the
/// casts are a compiler extension, while the intrinsics are the portable way
/// to reinterpret a NEON register and generate no extra instructions.
inline void vandq_f32(float32x4_t &v, uint32_t v2)
{
    const uint32x4_t mask = vdupq_n_u32(v2);
    v = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v), mask));
}
|
||||
|
||||
|
||||
/// Bitwise-OR every 32-bit lane of @p v (treated as raw bits) with @p v2, in place.
///
/// Uses the vreinterpretq intrinsics rather than C-style vector casts: the
/// casts are a compiler extension, while the intrinsics are the portable way
/// to reinterpret a NEON register and generate no extra instructions.
inline void vorq_f32(float32x4_t &v, uint32_t v2)
{
    const uint32x4_t bits = vdupq_n_u32(v2);
    v = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v), bits));
}
|
||||
|
||||
|
||||
/// Rotate the 128-bit register @p r by @p v bytes, in place: byte i of the
/// result is byte (i + v) mod 16 of the input (vextq_s8 of r with itself).
///
/// The register is reinterpreted with vreinterpretq intrinsics instead of
/// C-style vector casts, which are a compiler extension.
template <size_t v>
inline void vrot_si32(int32x4_t &r)
{
    const int8x16_t bytes = vreinterpretq_s8_s32(r);
    r = vreinterpretq_s32_s8(vextq_s8(bytes, bytes, v));
}
|
||||
|
||||
/// Rotation by zero bytes: leaves the register untouched.
template <>
inline void vrot_si32<0>(int32x4_t &)
{
    // intentionally empty
}
|
||||
|
||||
|
||||
/// Horizontal XOR: returns lane0 ^ lane1 ^ lane2 ^ lane3 of @p v.
inline uint32_t vheor_s32(const int32x4_t &v)
{
    // Fold the upper half onto the lower half, then fold the two remaining lanes.
    const int32x2_t folded = veor_s32(vget_low_s32(v), vget_high_s32(v));
    const int32_t   result = vget_lane_s32(folded, 0) ^ vget_lane_s32(folded, 1);
    return static_cast<uint32_t>(result);
}
|
||||
|
||||
|
||||
/// Load four int32 values from @p idx into @p v and store their float
/// conversion in @p n.
inline void prep_dv(int32_t *idx, int32x4_t &v, float32x4_t &n)
{
    const int32x4_t loaded = vld1q_s32(idx);
    v = loaded;
    n = vcvtq_f32_s32(loaded);
}
|
||||
|
||||
|
||||
/// One sub-round of the floating-point mixing function.
///
/// Adds a numerator term built from (n0, n1) to @p n, adds a denominator term
/// built from (n2, n3) to @p d, and then feeds both terms back into the
/// running constant @p c together with @p rnd_c.
inline void sub_round(const float32x4_t &n0, const float32x4_t &n1, const float32x4_t &n2, const float32x4_t &n3, const float32x4_t &rnd_c, float32x4_t &n, float32x4_t &d, float32x4_t &c)
{
    // Numerator term: (n1 + c) * (n0 * c)^2, then bit 24 cleared and bit 23 set.
    const float32x4_t n0c = vmulq_f32(n0, c);
    float32x4_t num = vmulq_f32(vaddq_f32(n1, c), vmulq_f32(n0c, n0c));
    vandq_f32(num, 0xFEFFFFFF);
    vorq_f32(num, 0x00800000);
    n = vaddq_f32(n, num);

    // Denominator term: (n3 - c) * (n2 * c)^2, with the same bit adjustment.
    const float32x4_t n2c = vmulq_f32(n2, c);
    float32x4_t den = vmulq_f32(vsubq_f32(n3, c), vmulq_f32(n2c, n2c));
    vandq_f32(den, 0xFEFFFFFF);
    vorq_f32(den, 0x00800000);
    d = vaddq_f32(d, den);

    // Constant feedback; the order of the additions is kept, as float addition is not associative.
    c = vaddq_f32(c, rnd_c);
    c = vaddq_f32(c, vdupq_n_f32(0.734375f));
    float32x4_t fb = vaddq_f32(num, den);
    vandq_f32(fb, 0x807FFFFF);      // keep sign and mantissa only
    vorq_f32(fb, 0x40000000);       // force the top exponent bit
    c = vaddq_f32(c, fb);
}
|
||||
|
||||
|
||||
/// Run eight sub-rounds over fixed permutations of the four inputs, then add
/// the resulting quotient numerator / denominator to @p r. @p c is the running
/// constant and is updated by every sub-round.
inline void round_compute(const float32x4_t &n0, const float32x4_t &n1, const float32x4_t &n2, const float32x4_t &n3, const float32x4_t &rnd_c, float32x4_t &c, float32x4_t &r)
{
    float32x4_t numer = vdupq_n_f32(0.0f);
    float32x4_t denom = vdupq_n_f32(0.0f);

    // forward rotations of (n0, n1, n2, n3)
    sub_round(n0, n1, n2, n3, rnd_c, numer, denom, c);
    sub_round(n1, n2, n3, n0, rnd_c, numer, denom, c);
    sub_round(n2, n3, n0, n1, rnd_c, numer, denom, c);
    sub_round(n3, n0, n1, n2, rnd_c, numer, denom, c);
    // rotations of the reversed order (n3, n2, n1, n0)
    sub_round(n3, n2, n1, n0, rnd_c, numer, denom, c);
    sub_round(n2, n1, n0, n3, rnd_c, numer, denom, c);
    sub_round(n1, n0, n3, n2, rnd_c, numer, denom, c);
    sub_round(n0, n3, n2, n1, rnd_c, numer, denom, c);

    // Make sure abs(denom) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
    vandq_f32(denom, 0xFF7FFFFF);
    vorq_f32(denom, 0x40000000);
    r = vaddq_f32(r, vdivq_f32(numer, denom));
}
|
||||
|
||||
|
||||
// 112×4 = 448
/// Run four round_compute passes seeded with the constant @p cnt and return
/// the result scaled to int32 lanes.
///
/// @tparam add  when true the masked result is added to @p sum, otherwise it
///              overwrites @p sum (in which case @p sum is not read).
template <bool add>
inline int32x4_t single_compute(const float32x4_t &n0, const float32x4_t &n1, const float32x4_t &n2, const float32x4_t &n3, float cnt, const float32x4_t &rnd_c, float32x4_t &sum)
{
    float32x4_t c   = vdupq_n_f32(cnt);
    float32x4_t acc = vdupq_n_f32(0.0f);

    round_compute(n0, n1, n2, n3, rnd_c, c, acc);
    round_compute(n0, n1, n2, n3, rnd_c, c, acc);
    round_compute(n0, n1, n2, n3, rnd_c, c, acc);
    round_compute(n0, n1, n2, n3, rnd_c, c, acc);

    // do a quick fmod by setting exp to 2
    vandq_f32(acc, 0x807FFFFF);
    vorq_f32(acc, 0x40000000);

    sum = add ? vaddq_f32(sum, acc) : acc;

    const float32x4_t scale = vdupq_n_f32(536870880.0f);
    return vcvtq_s32_f32(vmulq_f32(acc, scale));
}
|
||||
|
||||
|
||||
/// Run single_compute, rotate its integer result by @p rot bytes and XOR it
/// into @p out. Odd @p rot values accumulate into @p sum, even ones overwrite it.
template<size_t rot>
inline void single_compute_wrap(const float32x4_t &n0, const float32x4_t &n1, const float32x4_t &n2, const float32x4_t &n3, float cnt, const float32x4_t &rnd_c, float32x4_t &sum, int32x4_t &out)
{
    int32x4_t mixed = single_compute<(rot & 1) != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);
    vrot_si32<rot>(mixed);
    out = veorq_s32(out, mixed);
}
|
||||
|
||||
|
||||
/// Address of the n-th 16-byte chunk at byte offset (idx & MASK) inside the
/// scratchpad @p lpad, returned as an int32 pointer.
template<uint32_t MASK>
inline int32_t *scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n)
{
    uint8_t *chunk = lpad + (idx & MASK) + n * 16;
    return reinterpret_cast<int32_t *>(chunk);
}
|
||||
|
||||
|
||||
/// Main cn/gpu floating-point loop (NEON version).
///
/// Runs ITER iterations. Each iteration reads four consecutive 16-byte chunks
/// of the scratchpad @p lpad, mixes them through 16 single_compute chains,
/// XORs the results back into the same four chunks, and derives the offset
/// of the next four chunks from the mixed data. The first offset comes from
/// the first 32-bit word of @p spad, shifted right by 8 bits.
template<size_t ITER, uint32_t MASK>
void cn_gpu_inner_arm(const uint8_t *spad, uint8_t *lpad)
{
    const float32x4_t scale_to_int = vdupq_n_f32(16777216.0f);
    const float32x4_t scale_down   = vdupq_n_f32(64.0f);

    const uint32_t seed = reinterpret_cast<const uint32_t*>(spad)[0] >> 8;
    int32_t *idx0 = scratchpad_ptr<MASK>(lpad, seed, 0);
    int32_t *idx1 = scratchpad_ptr<MASK>(lpad, seed, 1);
    int32_t *idx2 = scratchpad_ptr<MASK>(lpad, seed, 2);
    int32_t *idx3 = scratchpad_ptr<MASK>(lpad, seed, 3);
    float32x4_t sum0 = vdupq_n_f32(0.0f);

    for (size_t i = 0; i < ITER; i++) {
        float32x4_t n0, n1, n2, n3;
        int32x4_t v0, v1, v2, v3;
        float32x4_t suma, sumb, sum1, sum2, sum3;

        // Load all four chunks before any of them is written back.
        prep_dv(idx0, v0, n0);
        prep_dv(idx1, v1, n1);
        prep_dv(idx2, v2, n2);
        prep_dv(idx3, v3, n3);
        const float32x4_t rc = sum0;    // round constant: the sum carried over from the previous iteration

        // --- chunk 0 ---
        int32x4_t out = vdupq_n_s32(0);
        single_compute_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out);
        single_compute_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out);
        single_compute_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out);
        single_compute_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out);
        sum0 = vaddq_f32(suma, sumb);
        vst1q_s32(idx0, veorq_s32(v0, out));
        int32x4_t out2 = out;           // running XOR of all four chunk results

        // --- chunk 1 ---
        out = vdupq_n_s32(0);
        single_compute_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out);
        single_compute_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out);
        single_compute_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out);
        single_compute_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out);
        sum1 = vaddq_f32(suma, sumb);
        vst1q_s32(idx1, veorq_s32(v1, out));
        out2 = veorq_s32(out2, out);

        // --- chunk 2 ---
        out = vdupq_n_s32(0);
        single_compute_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out);
        single_compute_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out);
        single_compute_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out);
        single_compute_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out);
        sum2 = vaddq_f32(suma, sumb);
        vst1q_s32(idx2, veorq_s32(v2, out));
        out2 = veorq_s32(out2, out);

        // --- chunk 3 ---
        out = vdupq_n_s32(0);
        single_compute_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out);
        single_compute_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out);
        single_compute_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out);
        single_compute_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out);
        sum3 = vaddq_f32(suma, sumb);
        vst1q_s32(idx3, veorq_s32(v3, out));
        out2 = veorq_s32(out2, out);

        // Combine the four partial sums pairwise (the addition order is significant for floats).
        sum0 = vaddq_f32(sum0, sum1);
        sum2 = vaddq_f32(sum2, sum3);
        sum0 = vaddq_f32(sum0, sum2);

        vandq_f32(sum0, 0x7fffffff);    // take abs(sum0) by masking the float sign bit
        // sum0 range 0 - 64
        const int32x4_t scaled = vcvtq_s32_f32(vmulq_f32(sum0, scale_to_int));
        const uint32_t  next   = vheor_s32(veorq_s32(scaled, out2));

        // sum0 is now between 0 and 1
        sum0 = vdivq_f32(sum0, scale_down);

        // The mixed value selects the four chunks for the next iteration.
        idx0 = scratchpad_ptr<MASK>(lpad, next, 0);
        idx1 = scratchpad_ptr<MASK>(lpad, next, 1);
        idx2 = scratchpad_ptr<MASK>(lpad, next, 2);
        idx3 = scratchpad_ptr<MASK>(lpad, next, 3);
    }
}
|
||||
|
||||
template void cn_gpu_inner_arm<xmrig::CRYPTONIGHT_GPU_ITER, xmrig::CRYPTONIGHT_GPU_MASK>(const uint8_t* spad, uint8_t* lpad);
|
||||
209
src/crypto/cn/gpu/cn_gpu_avx.cpp
Normal file
209
src/crypto/cn/gpu/cn_gpu_avx.cpp
Normal file
@@ -0,0 +1,209 @@
|
||||
/* XMRig
|
||||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
* Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright 2016-2019 XMRig <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "crypto/cn/CryptoNight_constants.h"
|
||||
|
||||
#ifdef __GNUC__
|
||||
# include <x86intrin.h>
|
||||
#else
|
||||
# include <intrin.h>
|
||||
# define __restrict__ __restrict
|
||||
#endif
|
||||
#ifndef _mm256_bslli_epi128
|
||||
#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
|
||||
#endif
|
||||
#ifndef _mm256_bsrli_epi128
|
||||
#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
|
||||
#endif
|
||||
|
||||
// Loads one 256-bit chunk and also provides it converted to floats.
//   idx - address of the chunk; read with the aligned load _mm256_load_si256
//   v   - receives the raw 32-bit integer lanes
//   n01 - receives the same lanes converted to single-precision floats
inline void prep_dv_avx(__m256i* idx, __m256i& v, __m256& n01)
{
    const __m256i raw = _mm256_load_si256(idx);

    v   = raw;
    n01 = _mm256_cvtepi32_ps(raw);
}
|
||||
|
||||
// Forces the two lowest exponent bits of every lane to 01: bit 24 is cleared
// (mask 0xFEFFFFFF) and bit 23 is set (0x00800000). Sign, mantissa and the
// remaining exponent bits pass through unchanged.
// The original author's note states the purpose: break the dependency chain.
inline __m256 fma_break(const __m256& x)
{
    const __m256 clear_bit24 = _mm256_castsi256_ps(_mm256_set1_epi32(0xFEFFFFFF));
    const __m256 set_bit23   = _mm256_castsi256_ps(_mm256_set1_epi32(0x00800000));

    return _mm256_or_ps(set_bit23, _mm256_and_ps(clear_bit24, x));
}
|
||||
|
||||
// 14
|
||||
inline void sub_round(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& n, __m256& d, __m256& c)
|
||||
{
|
||||
__m256 nn = _mm256_mul_ps(n0, c);
|
||||
nn = _mm256_mul_ps(_mm256_add_ps(n1, c), _mm256_mul_ps(nn, nn));
|
||||
nn = fma_break(nn);
|
||||
n = _mm256_add_ps(n, nn);
|
||||
|
||||
__m256 dd = _mm256_mul_ps(n2, c);
|
||||
dd = _mm256_mul_ps(_mm256_sub_ps(n3, c), _mm256_mul_ps(dd, dd));
|
||||
dd = fma_break(dd);
|
||||
d = _mm256_add_ps(d, dd);
|
||||
|
||||
//Constant feedback
|
||||
c = _mm256_add_ps(c, rnd_c);
|
||||
c = _mm256_add_ps(c, _mm256_set1_ps(0.734375f));
|
||||
__m256 r = _mm256_add_ps(nn, dd);
|
||||
r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r);
|
||||
r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r);
|
||||
c = _mm256_add_ps(c, r);
|
||||
}
|
||||
|
||||
// 14*8 + 2 = 112
|
||||
inline void round_compute(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& c, __m256& r)
|
||||
{
|
||||
__m256 n = _mm256_setzero_ps(), d = _mm256_setzero_ps();
|
||||
|
||||
sub_round(n0, n1, n2, n3, rnd_c, n, d, c);
|
||||
sub_round(n1, n2, n3, n0, rnd_c, n, d, c);
|
||||
sub_round(n2, n3, n0, n1, rnd_c, n, d, c);
|
||||
sub_round(n3, n0, n1, n2, rnd_c, n, d, c);
|
||||
sub_round(n3, n2, n1, n0, rnd_c, n, d, c);
|
||||
sub_round(n2, n1, n0, n3, rnd_c, n, d, c);
|
||||
sub_round(n1, n0, n3, n2, rnd_c, n, d, c);
|
||||
sub_round(n0, n3, n2, n1, rnd_c, n, d, c);
|
||||
|
||||
// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
|
||||
d = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFF7FFFFF)), d);
|
||||
d = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), d);
|
||||
r = _mm256_add_ps(r, _mm256_div_ps(n, d));
|
||||
}
|
||||
|
||||
// 112×4 = 448
|
||||
template <bool add>
|
||||
inline __m256i double_compute(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3,
|
||||
float lcnt, float hcnt, const __m256& rnd_c, __m256& sum)
|
||||
{
|
||||
__m256 c = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_set1_ps(lcnt)), _mm_set1_ps(hcnt), 1);
|
||||
__m256 r = _mm256_setzero_ps();
|
||||
|
||||
round_compute(n0, n1, n2, n3, rnd_c, c, r);
|
||||
round_compute(n0, n1, n2, n3, rnd_c, c, r);
|
||||
round_compute(n0, n1, n2, n3, rnd_c, c, r);
|
||||
round_compute(n0, n1, n2, n3, rnd_c, c, r);
|
||||
|
||||
// do a quick fmod by setting exp to 2
|
||||
r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r);
|
||||
r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r);
|
||||
|
||||
if(add)
|
||||
sum = _mm256_add_ps(sum, r);
|
||||
else
|
||||
sum = r;
|
||||
|
||||
r = _mm256_mul_ps(r, _mm256_set1_ps(536870880.0f)); // 35
|
||||
return _mm256_cvttps_epi32(r);
|
||||
}
|
||||
|
||||
template <size_t rot>
|
||||
inline void double_compute_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3,
|
||||
float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out)
|
||||
{
|
||||
__m256i r = double_compute<rot % 2 != 0>(n0, n1, n2, n3, lcnt, hcnt, rnd_c, sum);
|
||||
if(rot != 0)
|
||||
r = _mm256_or_si256(_mm256_bslli_epi128(r, 16 - rot), _mm256_bsrli_epi128(r, rot));
|
||||
|
||||
out = _mm256_xor_si256(out, r);
|
||||
}
|
||||
|
||||
template<uint32_t MASK>
|
||||
inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<__m256i*>(lpad + (idx & MASK) + n*16); }
|
||||
|
||||
template<size_t ITER, uint32_t MASK>
|
||||
void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad)
|
||||
{
|
||||
uint32_t s = reinterpret_cast<const uint32_t*>(spad)[0] >> 8;
|
||||
__m256i* idx0 = scratchpad_ptr<MASK>(lpad, s, 0);
|
||||
__m256i* idx2 = scratchpad_ptr<MASK>(lpad, s, 2);
|
||||
__m256 sum0 = _mm256_setzero_ps();
|
||||
|
||||
for(size_t i = 0; i < ITER; i++)
|
||||
{
|
||||
__m256i v01, v23;
|
||||
__m256 suma, sumb, sum1;
|
||||
__m256 rc = sum0;
|
||||
|
||||
__m256 n01, n23;
|
||||
prep_dv_avx(idx0, v01, n01);
|
||||
prep_dv_avx(idx2, v23, n23);
|
||||
|
||||
__m256i out, out2;
|
||||
__m256 n10, n22, n33;
|
||||
n10 = _mm256_permute2f128_ps(n01, n01, 0x01);
|
||||
n22 = _mm256_permute2f128_ps(n23, n23, 0x00);
|
||||
n33 = _mm256_permute2f128_ps(n23, n23, 0x11);
|
||||
|
||||
out = _mm256_setzero_si256();
|
||||
double_compute_wrap<0>(n01, n10, n22, n33, 1.3437500f, 1.4296875f, rc, suma, out);
|
||||
double_compute_wrap<1>(n01, n22, n33, n10, 1.2812500f, 1.3984375f, rc, suma, out);
|
||||
double_compute_wrap<2>(n01, n33, n10, n22, 1.3593750f, 1.3828125f, rc, sumb, out);
|
||||
double_compute_wrap<3>(n01, n33, n22, n10, 1.3671875f, 1.3046875f, rc, sumb, out);
|
||||
_mm256_store_si256(idx0, _mm256_xor_si256(v01, out));
|
||||
sum0 = _mm256_add_ps(suma, sumb);
|
||||
out2 = out;
|
||||
|
||||
__m256 n11, n02, n30;
|
||||
n11 = _mm256_permute2f128_ps(n01, n01, 0x11);
|
||||
n02 = _mm256_permute2f128_ps(n01, n23, 0x20);
|
||||
n30 = _mm256_permute2f128_ps(n01, n23, 0x03);
|
||||
|
||||
out = _mm256_setzero_si256();
|
||||
double_compute_wrap<0>(n23, n11, n02, n30, 1.4140625f, 1.3203125f, rc, suma, out);
|
||||
double_compute_wrap<1>(n23, n02, n30, n11, 1.2734375f, 1.3515625f, rc, suma, out);
|
||||
double_compute_wrap<2>(n23, n30, n11, n02, 1.2578125f, 1.3359375f, rc, sumb, out);
|
||||
double_compute_wrap<3>(n23, n30, n02, n11, 1.2890625f, 1.4609375f, rc, sumb, out);
|
||||
_mm256_store_si256(idx2, _mm256_xor_si256(v23, out));
|
||||
sum1 = _mm256_add_ps(suma, sumb);
|
||||
|
||||
out2 = _mm256_xor_si256(out2, out);
|
||||
out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2,out2,0x41), out2);
|
||||
suma = _mm256_permute2f128_ps(sum0, sum1, 0x30);
|
||||
sumb = _mm256_permute2f128_ps(sum0, sum1, 0x21);
|
||||
sum0 = _mm256_add_ps(suma, sumb);
|
||||
sum0 = _mm256_add_ps(sum0, _mm256_permute2f128_ps(sum0, sum0, 0x41));
|
||||
|
||||
// Clear the high 128 bits
|
||||
__m128 sum = _mm256_castps256_ps128(sum0);
|
||||
|
||||
sum = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum); // take abs(va) by masking the float sign bit
|
||||
// vs range 0 - 64
|
||||
__m128i v0 = _mm_cvttps_epi32(_mm_mul_ps(sum, _mm_set1_ps(16777216.0f)));
|
||||
v0 = _mm_xor_si128(v0, _mm256_castsi256_si128(out2));
|
||||
__m128i v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3));
|
||||
v0 = _mm_xor_si128(v0, v1);
|
||||
v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1));
|
||||
v0 = _mm_xor_si128(v0, v1);
|
||||
|
||||
// vs is now between 0 and 1
|
||||
sum = _mm_div_ps(sum, _mm_set1_ps(64.0f));
|
||||
sum0 = _mm256_insertf128_ps(_mm256_castps128_ps256(sum), sum, 1);
|
||||
uint32_t n = _mm_cvtsi128_si32(v0);
|
||||
idx0 = scratchpad_ptr<MASK>(lpad, n, 0);
|
||||
idx2 = scratchpad_ptr<MASK>(lpad, n, 2);
|
||||
}
|
||||
}
|
||||
|
||||
// Explicit instantiation: emits cn_gpu_inner_avx for the CRYPTONIGHT_GPU_ITER /
// CRYPTONIGHT_GPU_MASK constants from this translation unit.
template void cn_gpu_inner_avx<xmrig::CRYPTONIGHT_GPU_ITER, xmrig::CRYPTONIGHT_GPU_MASK>(const uint8_t* spad, uint8_t* lpad);
|
||||
210
src/crypto/cn/gpu/cn_gpu_ssse3.cpp
Normal file
210
src/crypto/cn/gpu/cn_gpu_ssse3.cpp
Normal file
@@ -0,0 +1,210 @@
|
||||
/* XMRig
|
||||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
* Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright 2016-2019 XMRig <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "crypto/cn/CryptoNight_constants.h"
|
||||
|
||||
#ifdef __GNUC__
|
||||
# include <x86intrin.h>
|
||||
#else
|
||||
# include <intrin.h>
|
||||
# define __restrict__ __restrict
|
||||
#endif
|
||||
|
||||
// Loads one 128-bit value and also provides it converted to floats.
//   idx - address of the value; read with the aligned load _mm_load_si128
//   v   - receives the raw 32-bit integer lanes
//   n   - receives the same lanes converted to single-precision floats
inline void prep_dv(__m128i* idx, __m128i& v, __m128& n)
{
    const __m128i raw = _mm_load_si128(idx);

    v = raw;
    n = _mm_cvtepi32_ps(raw);
}
|
||||
|
||||
// Forces the two lowest exponent bits of every lane to 01: bit 24 is cleared
// (mask 0xFEFFFFFF) and bit 23 is set (0x00800000). Sign, mantissa and the
// remaining exponent bits pass through unchanged.
// The original author's note states the purpose: break the dependency chain.
inline __m128 fma_break(__m128 x)
{
    const __m128 clear_bit24 = _mm_castsi128_ps(_mm_set1_epi32(0xFEFFFFFF));
    const __m128 set_bit23   = _mm_castsi128_ps(_mm_set1_epi32(0x00800000));

    return _mm_or_ps(set_bit23, _mm_and_ps(clear_bit24, x));
}
|
||||
|
||||
// 14
|
||||
inline void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c)
|
||||
{
|
||||
n1 = _mm_add_ps(n1, c);
|
||||
__m128 nn = _mm_mul_ps(n0, c);
|
||||
nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn));
|
||||
nn = fma_break(nn);
|
||||
n = _mm_add_ps(n, nn);
|
||||
|
||||
n3 = _mm_sub_ps(n3, c);
|
||||
__m128 dd = _mm_mul_ps(n2, c);
|
||||
dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd));
|
||||
dd = fma_break(dd);
|
||||
d = _mm_add_ps(d, dd);
|
||||
|
||||
//Constant feedback
|
||||
c = _mm_add_ps(c, rnd_c);
|
||||
c = _mm_add_ps(c, _mm_set1_ps(0.734375f));
|
||||
__m128 r = _mm_add_ps(nn, dd);
|
||||
r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r);
|
||||
r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r);
|
||||
c = _mm_add_ps(c, r);
|
||||
}
|
||||
|
||||
// 14*8 + 2 = 112
|
||||
inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r)
|
||||
{
|
||||
__m128 n = _mm_setzero_ps(), d = _mm_setzero_ps();
|
||||
|
||||
sub_round(n0, n1, n2, n3, rnd_c, n, d, c);
|
||||
sub_round(n1, n2, n3, n0, rnd_c, n, d, c);
|
||||
sub_round(n2, n3, n0, n1, rnd_c, n, d, c);
|
||||
sub_round(n3, n0, n1, n2, rnd_c, n, d, c);
|
||||
sub_round(n3, n2, n1, n0, rnd_c, n, d, c);
|
||||
sub_round(n2, n1, n0, n3, rnd_c, n, d, c);
|
||||
sub_round(n1, n0, n3, n2, rnd_c, n, d, c);
|
||||
sub_round(n0, n3, n2, n1, rnd_c, n, d, c);
|
||||
|
||||
// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
|
||||
d = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFF7FFFFF)), d);
|
||||
d = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), d);
|
||||
r =_mm_add_ps(r, _mm_div_ps(n,d));
|
||||
}
|
||||
|
||||
// 112×4 = 448
|
||||
template<bool add>
|
||||
inline __m128i single_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
|
||||
{
|
||||
__m128 c = _mm_set1_ps(cnt);
|
||||
__m128 r = _mm_setzero_ps();
|
||||
|
||||
round_compute(n0, n1, n2, n3, rnd_c, c, r);
|
||||
round_compute(n0, n1, n2, n3, rnd_c, c, r);
|
||||
round_compute(n0, n1, n2, n3, rnd_c, c, r);
|
||||
round_compute(n0, n1, n2, n3, rnd_c, c, r);
|
||||
|
||||
// do a quick fmod by setting exp to 2
|
||||
r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r);
|
||||
r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r);
|
||||
|
||||
if(add)
|
||||
sum = _mm_add_ps(sum, r);
|
||||
else
|
||||
sum = r;
|
||||
|
||||
r = _mm_mul_ps(r, _mm_set1_ps(536870880.0f)); // 35
|
||||
return _mm_cvttps_epi32(r);
|
||||
}
|
||||
|
||||
template<size_t rot>
|
||||
inline void single_compute_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
|
||||
{
|
||||
__m128i r = single_compute<rot % 2 != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);
|
||||
if(rot != 0)
|
||||
r = _mm_or_si128(_mm_slli_si128(r, 16 - rot), _mm_srli_si128(r, rot));
|
||||
out = _mm_xor_si128(out, r);
|
||||
}
|
||||
|
||||
template<uint32_t MASK>
|
||||
inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<__m128i*>(lpad + (idx & MASK) + n*16); }
|
||||
|
||||
template<size_t ITER, uint32_t MASK>
|
||||
void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad)
|
||||
{
|
||||
uint32_t s = reinterpret_cast<const uint32_t*>(spad)[0] >> 8;
|
||||
__m128i* idx0 = scratchpad_ptr<MASK>(lpad, s, 0);
|
||||
__m128i* idx1 = scratchpad_ptr<MASK>(lpad, s, 1);
|
||||
__m128i* idx2 = scratchpad_ptr<MASK>(lpad, s, 2);
|
||||
__m128i* idx3 = scratchpad_ptr<MASK>(lpad, s, 3);
|
||||
__m128 sum0 = _mm_setzero_ps();
|
||||
|
||||
for(size_t i = 0; i < ITER; i++)
|
||||
{
|
||||
__m128 n0, n1, n2, n3;
|
||||
__m128i v0, v1, v2, v3;
|
||||
__m128 suma, sumb, sum1, sum2, sum3;
|
||||
|
||||
prep_dv(idx0, v0, n0);
|
||||
prep_dv(idx1, v1, n1);
|
||||
prep_dv(idx2, v2, n2);
|
||||
prep_dv(idx3, v3, n3);
|
||||
__m128 rc = sum0;
|
||||
|
||||
__m128i out, out2;
|
||||
out = _mm_setzero_si128();
|
||||
single_compute_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out);
|
||||
single_compute_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out);
|
||||
single_compute_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out);
|
||||
single_compute_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out);
|
||||
sum0 = _mm_add_ps(suma, sumb);
|
||||
_mm_store_si128(idx0, _mm_xor_si128(v0, out));
|
||||
out2 = out;
|
||||
|
||||
out = _mm_setzero_si128();
|
||||
single_compute_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out);
|
||||
single_compute_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out);
|
||||
single_compute_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out);
|
||||
single_compute_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out);
|
||||
sum1 = _mm_add_ps(suma, sumb);
|
||||
_mm_store_si128(idx1, _mm_xor_si128(v1, out));
|
||||
out2 = _mm_xor_si128(out2, out);
|
||||
|
||||
out = _mm_setzero_si128();
|
||||
single_compute_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out);
|
||||
single_compute_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out);
|
||||
single_compute_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out);
|
||||
single_compute_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out);
|
||||
sum2 = _mm_add_ps(suma, sumb);
|
||||
_mm_store_si128(idx2, _mm_xor_si128(v2, out));
|
||||
out2 = _mm_xor_si128(out2, out);
|
||||
|
||||
out = _mm_setzero_si128();
|
||||
single_compute_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out);
|
||||
single_compute_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out);
|
||||
single_compute_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out);
|
||||
single_compute_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out);
|
||||
sum3 = _mm_add_ps(suma, sumb);
|
||||
_mm_store_si128(idx3, _mm_xor_si128(v3, out));
|
||||
out2 = _mm_xor_si128(out2, out);
|
||||
sum0 = _mm_add_ps(sum0, sum1);
|
||||
sum2 = _mm_add_ps(sum2, sum3);
|
||||
sum0 = _mm_add_ps(sum0, sum2);
|
||||
|
||||
sum0 = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum0); // take abs(va) by masking the float sign bit
|
||||
// vs range 0 - 64
|
||||
n0 = _mm_mul_ps(sum0, _mm_set1_ps(16777216.0f));
|
||||
v0 = _mm_cvttps_epi32(n0);
|
||||
v0 = _mm_xor_si128(v0, out2);
|
||||
v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3));
|
||||
v0 = _mm_xor_si128(v0, v1);
|
||||
v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1));
|
||||
v0 = _mm_xor_si128(v0, v1);
|
||||
|
||||
// vs is now between 0 and 1
|
||||
sum0 = _mm_div_ps(sum0, _mm_set1_ps(64.0f));
|
||||
uint32_t n = _mm_cvtsi128_si32(v0);
|
||||
idx0 = scratchpad_ptr<MASK>(lpad, n, 0);
|
||||
idx1 = scratchpad_ptr<MASK>(lpad, n, 1);
|
||||
idx2 = scratchpad_ptr<MASK>(lpad, n, 2);
|
||||
idx3 = scratchpad_ptr<MASK>(lpad, n, 3);
|
||||
}
|
||||
}
|
||||
|
||||
// Explicit instantiation: emits cn_gpu_inner_ssse3 for the CRYPTONIGHT_GPU_ITER /
// CRYPTONIGHT_GPU_MASK constants from this translation unit.
template void cn_gpu_inner_ssse3<xmrig::CRYPTONIGHT_GPU_ITER, xmrig::CRYPTONIGHT_GPU_MASK>(const uint8_t* spad, uint8_t* lpad);
|
||||
38
src/crypto/cn/groestl_tables.h
Normal file
38
src/crypto/cn/groestl_tables.h
Normal file
@@ -0,0 +1,38 @@
|
||||
#ifndef __tables_h
|
||||
#define __tables_h
|
||||
|
||||
|
||||
const uint32_t T[512] = {0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc
|
||||
, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5
|
||||
, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d
|
||||
, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded
|
||||
, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1
|
||||
, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441
|
||||
, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4
|
||||
, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba
|
||||
, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616
|
||||
, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2
|
||||
, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c
|
||||
, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de
|
||||
, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7
|
||||
, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e
|
||||
, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c
|
||||
, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7
|
||||
, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b
|
||||
, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4
|
||||
, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e
|
||||
, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a
|
||||
, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37
|
||||
, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86
|
||||
, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b
|
||||
, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028
|
||||
, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3
|
||||
, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94
|
||||
, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836
|
||||
, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0
|
||||
, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2
|
||||
, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e
|
||||
, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3
|
||||
, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e};
|
||||
|
||||
#endif /* __tables_h */
|
||||
5
src/crypto/cn/hash.h
Normal file
5
src/crypto/cn/hash.h
Normal file
@@ -0,0 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
/* Alias for unsigned char (presumably the element type of hashed byte buffers). */
typedef unsigned char BitSequence;

/* Alias for unsigned long long (presumably used for message lengths). */
typedef unsigned long long DataLength;

/* Status codes; the numeric values are fixed explicitly. */
typedef enum {
    SUCCESS     = 0,
    FAIL        = 1,
    BAD_HASHLEN = 2
} HashReturn;
|
||||
188
src/crypto/cn/r/CryptonightR_gen.cpp
Normal file
188
src/crypto/cn/r/CryptonightR_gen.cpp
Normal file
@@ -0,0 +1,188 @@
|
||||
/* XMRig
|
||||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
||||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <cstring>
|
||||
#include "crypto/cn/CryptoNight_monero.h"
|
||||
|
||||
typedef void(*void_func)();
|
||||
|
||||
#include "crypto/cn/asm/CryptonightR_template.h"
|
||||
#include "crypto/common/VirtualMemory.h"
|
||||
#include "Mem.h"
|
||||
|
||||
|
||||
// Copies the bytes in [p1, p2) to p and advances p past them.
// Nothing is copied and p is left untouched when p2 does not lie after p1.
static inline void add_code(uint8_t* &p, void (*p1)(), void (*p2)())
{
    const uint8_t* const first = reinterpret_cast<const uint8_t*>(p1);
    const uint8_t* const last  = reinterpret_cast<const uint8_t*>(p2);
    const ptrdiff_t length = last - first;

    if (length <= 0) {
        return;
    }

    memcpy(p, first, length);
    p += length;
}
|
||||
|
||||
static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, xmrig::Assembly ASM)
|
||||
{
|
||||
uint32_t prev_rot_src = (uint32_t)(-1);
|
||||
|
||||
for (int i = 0;; ++i) {
|
||||
const V4_Instruction inst = code[i];
|
||||
if (inst.opcode == RET) {
|
||||
break;
|
||||
}
|
||||
|
||||
uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
|
||||
uint8_t dst_index = inst.dst_index;
|
||||
uint8_t src_index = inst.src_index;
|
||||
|
||||
const uint32_t a = inst.dst_index;
|
||||
const uint32_t b = inst.src_index;
|
||||
const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
|
||||
|
||||
switch (inst.opcode) {
|
||||
case ROR:
|
||||
case ROL:
|
||||
if (b != prev_rot_src) {
|
||||
prev_rot_src = b;
|
||||
add_code(p, instructions_mov[c], instructions_mov[c + 1]);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (a == prev_rot_src) {
|
||||
prev_rot_src = (uint32_t)(-1);
|
||||
}
|
||||
|
||||
void_func begin = instructions[c];
|
||||
|
||||
if ((ASM = xmrig::ASM_BULLDOZER) && (inst.opcode == MUL) && !is_64_bit) {
|
||||
// AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
|
||||
// Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
|
||||
uint8_t* prefix = reinterpret_cast<uint8_t*>(begin);
|
||||
|
||||
if (*prefix == 0x49) {
|
||||
*(p++) = 0x41;
|
||||
}
|
||||
|
||||
begin = reinterpret_cast<void_func>(prefix + 1);
|
||||
}
|
||||
|
||||
add_code(p, begin, instructions[c + 1]);
|
||||
|
||||
if (inst.opcode == ADD) {
|
||||
*(uint32_t*)(p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C;
|
||||
if (is_64_bit) {
|
||||
prev_rot_src = (uint32_t)(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void wow_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
|
||||
{
|
||||
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
|
||||
uint8_t* p = p0;
|
||||
|
||||
add_code(p, CryptonightWOW_template_part1, CryptonightWOW_template_part2);
|
||||
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
|
||||
add_code(p, CryptonightWOW_template_part2, CryptonightWOW_template_part3);
|
||||
*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightWOW_template_mainloop) - ((const uint8_t*)CryptonightWOW_template_part1)) - (p - p0));
|
||||
add_code(p, CryptonightWOW_template_part3, CryptonightWOW_template_end);
|
||||
|
||||
xmrig::VirtualMemory::flushInstructionCache(machine_code, p - p0);
|
||||
}
|
||||
|
||||
void v4_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
|
||||
{
|
||||
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
|
||||
uint8_t* p = p0;
|
||||
|
||||
add_code(p, CryptonightR_template_part1, CryptonightR_template_part2);
|
||||
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
|
||||
add_code(p, CryptonightR_template_part2, CryptonightR_template_part3);
|
||||
*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightR_template_mainloop) - ((const uint8_t*)CryptonightR_template_part1)) - (p - p0));
|
||||
add_code(p, CryptonightR_template_part3, CryptonightR_template_end);
|
||||
|
||||
xmrig::VirtualMemory::flushInstructionCache(machine_code, p - p0);
|
||||
}
|
||||
|
||||
void wow_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
|
||||
{
|
||||
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
|
||||
uint8_t* p = p0;
|
||||
|
||||
add_code(p, CryptonightWOW_template_double_part1, CryptonightWOW_template_double_part2);
|
||||
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
|
||||
add_code(p, CryptonightWOW_template_double_part2, CryptonightWOW_template_double_part3);
|
||||
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
|
||||
add_code(p, CryptonightWOW_template_double_part3, CryptonightWOW_template_double_part4);
|
||||
*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightWOW_template_double_mainloop) - ((const uint8_t*)CryptonightWOW_template_double_part1)) - (p - p0));
|
||||
add_code(p, CryptonightWOW_template_double_part4, CryptonightWOW_template_double_end);
|
||||
|
||||
xmrig::VirtualMemory::flushInstructionCache(machine_code, p - p0);
|
||||
}
|
||||
|
||||
void v4_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
|
||||
{
|
||||
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
|
||||
uint8_t* p = p0;
|
||||
|
||||
add_code(p, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
|
||||
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
|
||||
add_code(p, CryptonightR_template_double_part2, CryptonightR_template_double_part3);
|
||||
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
|
||||
add_code(p, CryptonightR_template_double_part3, CryptonightR_template_double_part4);
|
||||
*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightR_template_double_mainloop) - ((const uint8_t*)CryptonightR_template_double_part1)) - (p - p0));
|
||||
add_code(p, CryptonightR_template_double_part4, CryptonightR_template_double_end);
|
||||
|
||||
xmrig::VirtualMemory::flushInstructionCache(machine_code, p - p0);
|
||||
}
|
||||
|
||||
void wow_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
|
||||
{
|
||||
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
|
||||
uint8_t* p = p0;
|
||||
|
||||
add_code(p, CryptonightWOW_soft_aes_template_part1, CryptonightWOW_soft_aes_template_part2);
|
||||
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
|
||||
add_code(p, CryptonightWOW_soft_aes_template_part2, CryptonightWOW_soft_aes_template_part3);
|
||||
*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightWOW_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightWOW_soft_aes_template_part1)) - (p - p0));
|
||||
add_code(p, CryptonightWOW_soft_aes_template_part3, CryptonightWOW_soft_aes_template_end);
|
||||
|
||||
xmrig::VirtualMemory::flushInstructionCache(machine_code, p - p0);
|
||||
}
|
||||
|
||||
void v4_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
|
||||
{
|
||||
uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
|
||||
uint8_t* p = p0;
|
||||
|
||||
add_code(p, CryptonightR_soft_aes_template_part1, CryptonightR_soft_aes_template_part2);
|
||||
add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
|
||||
add_code(p, CryptonightR_soft_aes_template_part2, CryptonightR_soft_aes_template_part3);
|
||||
*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightR_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightR_soft_aes_template_part1)) - (p - p0));
|
||||
add_code(p, CryptonightR_soft_aes_template_part3, CryptonightR_soft_aes_template_end);
|
||||
|
||||
xmrig::VirtualMemory::flushInstructionCache(machine_code, p - p0);
|
||||
}
|
||||
448
src/crypto/cn/r/variant4_random_math.h
Normal file
448
src/crypto/cn/r/variant4_random_math.h
Normal file
@@ -0,0 +1,448 @@
|
||||
#ifndef VARIANT4_RANDOM_MATH_H
|
||||
#define VARIANT4_RANDOM_MATH_H
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "crypto/cn/c_blake256.h"
|
||||
}
|
||||
|
||||
// Limits that steer the random math program generator.
enum V4_Settings
{
    // Target minimal theoretical latency of a generated program, in cycles:
    // the equivalent of 15 multiplications at 3 cycles each (45 cycles)
    TOTAL_LATENCY = 3 * 15,

    // Lower bound on the number of generated instructions
    NUM_INSTRUCTIONS_MIN = 60,

    // Upper bound on the number of generated instructions
    // (the terminating RET instruction is not counted)
    NUM_INSTRUCTIONS_MAX = 70,

    // Number of ALUs able to execute MUL
    // (modern CPUs typically have a single multiplier ALU)
    ALU_COUNT_MUL = 1,

    // Number of ALUs available in total. Modern CPUs have 4, but only 3 are assumed
    // because random math runs together with the rest of the main loop code
    ALU_COUNT = 3,
};
|
||||
|
||||
// Opcodes understood by the random math interpreter/compiler.
// "a" is the destination register, "b" is the source register.
enum V4_InstructionList
{
    MUL = 0, // a * b
    ADD = 1, // a + b + C, where C is an unsigned 32-bit constant
    SUB = 2, // a - b
    ROR = 3, // rotate "a" right by "b & 31" bits
    ROL = 4, // rotate "a" left by "b & 31" bits
    XOR = 5, // a ^ b
    RET = 6, // stop execution

    // Number of real (non-RET) opcodes; also used as the size of per-opcode tables
    V4_INSTRUCTION_COUNT = RET,
};
|
||||
|
||||
// Bit layout used to decode one instruction from one byte of random data.
// Any random byte sequence decodes to valid code.
//
// The register file has 9 registers:
// - 4 variable registers (the only valid destinations, hence a 2-bit dst_index)
// - 5 constant registers initialized from loop variables
enum V4_InstructionDefinition
{
    V4_OPCODE_BITS    = 3, // low bits of the byte: raw opcode
    V4_DST_INDEX_BITS = 2, // next bits: destination register
    V4_SRC_INDEX_BITS = 3, // top bits: source register
};
|
||||
|
||||
// One decoded random math instruction.
struct V4_Instruction
{
    uint8_t  opcode;     // one of V4_InstructionList
    uint8_t  dst_index;  // destination register index
    uint8_t  src_index;  // source register index
    uint32_t C;          // 32-bit constant; the generator sets it for ADD and leaves it 0 otherwise
};
|
||||
|
||||
// FORCEINLINE: strongest "always inline" hint the compiler offers, unless the
// including code has already provided its own definition.
#ifndef FORCEINLINE
#  if defined(__GNUC__)
#    define FORCEINLINE __attribute__((always_inline)) inline
#  elif defined(_MSC_VER) /* defined() avoids evaluating an undefined macro (-Wundef) */
#    define FORCEINLINE __forceinline
#  else
#    define FORCEINLINE inline
#  endif
#endif
|
||||
|
||||
// UNREACHABLE_CODE: tells the optimizer that control flow never reaches this point.
// Expands to nothing on compilers without a known intrinsic.
#ifndef UNREACHABLE_CODE
#  if defined(__GNUC__)
#    define UNREACHABLE_CODE __builtin_unreachable()
#  elif defined(_MSC_VER) /* defined() avoids evaluating an undefined macro (-Wundef) */
#    define UNREACHABLE_CODE __assume(false)
#  else
#    define UNREACHABLE_CODE
#  endif
#endif
|
||||
|
||||
// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU:
|
||||
// every switch-case will point to the same destination on every iteration of Cryptonight main loop
|
||||
//
|
||||
// This is about as fast as it can get without using low-level machine code generation
|
||||
template<typename v4_reg>
|
||||
static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
|
||||
{
|
||||
enum
|
||||
{
|
||||
REG_BITS = sizeof(v4_reg) * 8,
|
||||
};
|
||||
|
||||
#define V4_EXEC(i) \
|
||||
{ \
|
||||
const struct V4_Instruction* op = code + i; \
|
||||
const v4_reg src = r[op->src_index]; \
|
||||
v4_reg* dst = r + op->dst_index; \
|
||||
switch (op->opcode) \
|
||||
{ \
|
||||
case MUL: \
|
||||
*dst *= src; \
|
||||
break; \
|
||||
case ADD: \
|
||||
*dst += src + op->C; \
|
||||
break; \
|
||||
case SUB: \
|
||||
*dst -= src; \
|
||||
break; \
|
||||
case ROR: \
|
||||
{ \
|
||||
const uint32_t shift = src % REG_BITS; \
|
||||
*dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \
|
||||
} \
|
||||
break; \
|
||||
case ROL: \
|
||||
{ \
|
||||
const uint32_t shift = src % REG_BITS; \
|
||||
*dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \
|
||||
} \
|
||||
break; \
|
||||
case XOR: \
|
||||
*dst ^= src; \
|
||||
break; \
|
||||
case RET: \
|
||||
return; \
|
||||
default: \
|
||||
UNREACHABLE_CODE; \
|
||||
break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define V4_EXEC_10(j) \
|
||||
V4_EXEC(j + 0) \
|
||||
V4_EXEC(j + 1) \
|
||||
V4_EXEC(j + 2) \
|
||||
V4_EXEC(j + 3) \
|
||||
V4_EXEC(j + 4) \
|
||||
V4_EXEC(j + 5) \
|
||||
V4_EXEC(j + 6) \
|
||||
V4_EXEC(j + 7) \
|
||||
V4_EXEC(j + 8) \
|
||||
V4_EXEC(j + 9)
|
||||
|
||||
// Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency
|
||||
// I've checked all block heights < 10,000,000 and here is the distribution of program sizes:
|
||||
//
|
||||
// 60 27960
|
||||
// 61 105054
|
||||
// 62 2452759
|
||||
// 63 5115997
|
||||
// 64 1022269
|
||||
// 65 1109635
|
||||
// 66 153145
|
||||
// 67 8550
|
||||
// 68 4529
|
||||
// 69 102
|
||||
|
||||
// Unroll 70 instructions here
|
||||
V4_EXEC_10(0); // instructions 0-9
|
||||
V4_EXEC_10(10); // instructions 10-19
|
||||
V4_EXEC_10(20); // instructions 20-29
|
||||
V4_EXEC_10(30); // instructions 30-39
|
||||
V4_EXEC_10(40); // instructions 40-49
|
||||
V4_EXEC_10(50); // instructions 50-59
|
||||
V4_EXEC_10(60); // instructions 60-69
|
||||
|
||||
#undef V4_EXEC_10
|
||||
#undef V4_EXEC
|
||||
}
|
||||
|
||||
// If we don't have enough data available, generate more
|
||||
static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size)
|
||||
{
|
||||
if (*data_index + bytes_needed > data_size)
|
||||
{
|
||||
hash_extra_blake(data, data_size, (char*) data);
|
||||
*data_index = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Generates as many random math operations as possible with given latency and ALU restrictions
|
||||
// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions
|
||||
template<xmrig::Variant VARIANT>
|
||||
static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height)
|
||||
{
|
||||
// MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle
|
||||
// These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake
|
||||
//
|
||||
// AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors
|
||||
// Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors
|
||||
// AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same
|
||||
// Source: https://www.agner.org/optimize/instruction_tables.pdf
|
||||
const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 };
|
||||
|
||||
// Instruction latencies for theoretical ASIC implementation
|
||||
const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 };
|
||||
|
||||
// Available ALUs for each instruction
|
||||
const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT };
|
||||
|
||||
int8_t data[32];
|
||||
memset(data, 0, sizeof(data));
|
||||
uint64_t tmp = SWAP64LE(height);
|
||||
memcpy(data, &tmp, sizeof(uint64_t));
|
||||
if (VARIANT == xmrig::VARIANT_4)
|
||||
{
|
||||
data[20] = -38;
|
||||
}
|
||||
|
||||
// Set data_index past the last byte in data
|
||||
// to trigger full data update with blake hash
|
||||
// before we start using it
|
||||
size_t data_index = sizeof(data);
|
||||
|
||||
int code_size;
|
||||
|
||||
// There is a small chance (1.8%) that register R8 won't be used in the generated program
|
||||
// So we keep track of it and try again if it's not used
|
||||
bool r8_used;
|
||||
do {
|
||||
int latency[9];
|
||||
int asic_latency[9];
|
||||
|
||||
// Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution
|
||||
// byte 0: current value of the destination register
|
||||
// byte 1: instruction opcode
|
||||
// byte 2: current value of the source register
|
||||
//
|
||||
// Registers R4-R8 are constant and are treated as having the same value because when we do
|
||||
// the same operation twice with two constant source registers, it can be optimized into a single operation
|
||||
uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
|
||||
|
||||
bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT];
|
||||
bool is_rotation[V4_INSTRUCTION_COUNT];
|
||||
bool rotated[4];
|
||||
int rotate_count = 0;
|
||||
|
||||
memset(latency, 0, sizeof(latency));
|
||||
memset(asic_latency, 0, sizeof(asic_latency));
|
||||
memset(alu_busy, 0, sizeof(alu_busy));
|
||||
memset(is_rotation, 0, sizeof(is_rotation));
|
||||
memset(rotated, 0, sizeof(rotated));
|
||||
is_rotation[ROR] = true;
|
||||
is_rotation[ROL] = true;
|
||||
|
||||
int num_retries = 0;
|
||||
code_size = 0;
|
||||
|
||||
int total_iterations = 0;
|
||||
r8_used = (VARIANT == xmrig::VARIANT_WOW);
|
||||
|
||||
// Generate random code to achieve minimal required latency for our abstract CPU
|
||||
// Try to get this latency for all 4 registers
|
||||
while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64))
|
||||
{
|
||||
// Fail-safe to guarantee loop termination
|
||||
++total_iterations;
|
||||
if (total_iterations > 256)
|
||||
break;
|
||||
|
||||
check_data(&data_index, 1, data, sizeof(data));
|
||||
|
||||
const uint8_t c = ((uint8_t*)data)[data_index++];
|
||||
|
||||
// MUL = opcodes 0-2
|
||||
// ADD = opcode 3
|
||||
// SUB = opcode 4
|
||||
// ROR/ROL = opcode 5, shift direction is selected randomly
|
||||
// XOR = opcodes 6-7
|
||||
uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1);
|
||||
if (opcode == 5)
|
||||
{
|
||||
check_data(&data_index, 1, data, sizeof(data));
|
||||
opcode = (data[data_index++] >= 0) ? ROR : ROL;
|
||||
}
|
||||
else if (opcode >= 6)
|
||||
{
|
||||
opcode = XOR;
|
||||
}
|
||||
else
|
||||
{
|
||||
opcode = (opcode <= 2) ? MUL : (opcode - 2);
|
||||
}
|
||||
|
||||
uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1);
|
||||
uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1);
|
||||
|
||||
const int a = dst_index;
|
||||
int b = src_index;
|
||||
|
||||
// Don't do ADD/SUB/XOR with the same register
|
||||
if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
|
||||
{
|
||||
// a is always < 4, so we don't need to check bounds here
|
||||
b = (VARIANT == xmrig::VARIANT_WOW) ? (a + 4) : 8;
|
||||
src_index = b;
|
||||
}
|
||||
|
||||
// Don't do rotation with the same destination twice because it's equal to a single rotation
|
||||
if (is_rotation[opcode] && rotated[a])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized:
|
||||
// 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations
|
||||
// 2xXOR(a, b) = NOP
|
||||
if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16)))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Find which ALU is available (and when) for this instruction
|
||||
int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b];
|
||||
int alu_index = -1;
|
||||
while (next_latency < TOTAL_LATENCY)
|
||||
{
|
||||
for (int i = op_ALUs[opcode] - 1; i >= 0; --i)
|
||||
{
|
||||
if (!alu_busy[next_latency][i])
|
||||
{
|
||||
// ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check
|
||||
if ((opcode == ADD) && alu_busy[next_latency + 1][i])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rotation can only start when previous rotation is finished, so do an additional availability check
|
||||
if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode]))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
alu_index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (alu_index >= 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
++next_latency;
|
||||
}
|
||||
|
||||
// Don't generate instructions that leave some register unchanged for more than 7 cycles
|
||||
if (next_latency > latency[a] + 7)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
next_latency += op_latency[opcode];
|
||||
|
||||
if (next_latency <= TOTAL_LATENCY)
|
||||
{
|
||||
if (is_rotation[opcode])
|
||||
{
|
||||
++rotate_count;
|
||||
}
|
||||
|
||||
// Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined
|
||||
alu_busy[next_latency - op_latency[opcode]][alu_index] = true;
|
||||
latency[a] = next_latency;
|
||||
|
||||
// ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple
|
||||
asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode];
|
||||
|
||||
rotated[a] = is_rotation[opcode];
|
||||
|
||||
inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16);
|
||||
|
||||
code[code_size].opcode = opcode;
|
||||
code[code_size].dst_index = dst_index;
|
||||
code[code_size].src_index = src_index;
|
||||
code[code_size].C = 0;
|
||||
|
||||
if (src_index == 8)
|
||||
{
|
||||
r8_used = true;
|
||||
}
|
||||
|
||||
if (opcode == ADD)
|
||||
{
|
||||
// ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
|
||||
alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true;
|
||||
|
||||
// ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C"
|
||||
check_data(&data_index, sizeof(uint32_t), data, sizeof(data));
|
||||
uint32_t t;
|
||||
memcpy(&t, data + data_index, sizeof(uint32_t));
|
||||
code[code_size].C = SWAP32LE(t);
|
||||
data_index += sizeof(uint32_t);
|
||||
}
|
||||
|
||||
++code_size;
|
||||
if (code_size >= NUM_INSTRUCTIONS_MIN)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
++num_retries;
|
||||
}
|
||||
}
|
||||
|
||||
// ASIC has more execution resources and can extract as much parallelism from the code as possible
|
||||
// We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
|
||||
// Get this latency for at least 1 of the 4 registers
|
||||
const int prev_code_size = code_size;
|
||||
while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
|
||||
{
|
||||
int min_idx = 0;
|
||||
int max_idx = 0;
|
||||
for (int i = 1; i < 4; ++i)
|
||||
{
|
||||
if (asic_latency[i] < asic_latency[min_idx]) min_idx = i;
|
||||
if (asic_latency[i] > asic_latency[max_idx]) max_idx = i;
|
||||
}
|
||||
|
||||
const uint8_t pattern[3] = { ROR, MUL, MUL };
|
||||
const uint8_t opcode = pattern[(code_size - prev_code_size) % 3];
|
||||
latency[min_idx] = latency[max_idx] + op_latency[opcode];
|
||||
asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode];
|
||||
|
||||
code[code_size].opcode = opcode;
|
||||
code[code_size].dst_index = min_idx;
|
||||
code[code_size].src_index = max_idx;
|
||||
code[code_size].C = 0;
|
||||
++code_size;
|
||||
}
|
||||
|
||||
// There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
|
||||
// It never does more than 4 iterations for all block heights < 10,000,000
|
||||
} while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
|
||||
|
||||
// It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here
|
||||
// Add final instruction to stop the interpreter
|
||||
code[code_size].opcode = RET;
|
||||
code[code_size].dst_index = 0;
|
||||
code[code_size].src_index = 0;
|
||||
code[code_size].C = 0;
|
||||
|
||||
return code_size;
|
||||
}
|
||||
|
||||
#endif
|
||||
187
src/crypto/cn/skein_port.h
Normal file
187
src/crypto/cn/skein_port.h
Normal file
@@ -0,0 +1,187 @@
|
||||
#ifndef _SKEIN_PORT_H_
|
||||
#define _SKEIN_PORT_H_
|
||||
|
||||
#include <limits.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifndef RETURN_VALUES
|
||||
# define RETURN_VALUES
|
||||
# if defined( DLL_EXPORT )
|
||||
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
|
||||
# define VOID_RETURN __declspec( dllexport ) void __stdcall
|
||||
# define INT_RETURN __declspec( dllexport ) int __stdcall
|
||||
# elif defined( __GNUC__ )
|
||||
# define VOID_RETURN __declspec( __dllexport__ ) void
|
||||
# define INT_RETURN __declspec( __dllexport__ ) int
|
||||
# else
|
||||
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
|
||||
# endif
|
||||
# elif defined( DLL_IMPORT )
|
||||
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
|
||||
# define VOID_RETURN __declspec( dllimport ) void __stdcall
|
||||
# define INT_RETURN __declspec( dllimport ) int __stdcall
|
||||
# elif defined( __GNUC__ )
|
||||
# define VOID_RETURN __declspec( __dllimport__ ) void
|
||||
# define INT_RETURN __declspec( __dllimport__ ) int
|
||||
# else
|
||||
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
|
||||
# endif
|
||||
# elif defined( __WATCOMC__ )
|
||||
# define VOID_RETURN void __cdecl
|
||||
# define INT_RETURN int __cdecl
|
||||
# else
|
||||
# define VOID_RETURN void
|
||||
# define INT_RETURN int
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* These defines are used to declare buffers in a way that allows
|
||||
faster operations on longer variables to be used. In all these
|
||||
defines 'size' must be a power of 2 and >= 8
|
||||
|
||||
dec_unit_type(size,x) declares a variable 'x' of length
|
||||
'size' bits
|
||||
|
||||
dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize'
|
||||
bytes defined as an array of variables
|
||||
each of 'size' bits (bsize must be a
|
||||
multiple of size / 8)
|
||||
|
||||
ptr_cast(x,size) casts a pointer to a pointer to a
|
||||
variable of length 'size' bits
|
||||
*/
|
||||
|
||||
/* Helpers for declaring fixed-width units and buffers. 'size' is a width in bits
   and must be a literal (8, 16, 32 or 64) because it is pasted into the type name.
   The remaining arguments are parenthesized so that expressions such as
   dec_bufr_type(64, A + B, x) compute the intended element count. */
#define ui_type(size)               uint##size##_t
#define dec_unit_type(size,x)       typedef ui_type(size) x
#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[(bsize) / ((size) >> 3)]
#define ptr_cast(x,size)            ((ui_type(size)*)(x))

typedef unsigned int    uint_t;             /* native unsigned integer */
typedef uint8_t         u08b_t;             /*  8-bit unsigned integer */
typedef uint64_t        u64b_t;             /* 64-bit unsigned integer */
|
||||
|
||||
/* Rotates the 64-bit value x left by N bits.
   Both shift counts are reduced modulo 64 so that N == 0 (or a multiple of 64)
   does not produce a shift by 64, which is undefined behaviour. Results for
   N in 1..63 are unchanged. Note that x and N are evaluated more than once. */
#ifndef RotL_64
#define RotL_64(x,N)    (((x) << ((N) & 63)) | ((x) >> ((64 - (N)) & 63)))
#endif
|
||||
|
||||
/*
|
||||
* Skein is "natively" little-endian (unlike SHA-xxx), for optimal
|
||||
* performance on x86 CPUs. The Skein code requires the following
|
||||
* definitions for dealing with endianness:
|
||||
*
|
||||
* SKEIN_NEED_SWAP: 0 for little-endian, 1 for big-endian
|
||||
* Skein_Put64_LSB_First
|
||||
* Skein_Get64_LSB_First
|
||||
* Skein_Swap64
|
||||
*
|
||||
* If SKEIN_NEED_SWAP is defined at compile time, it is used here
|
||||
* along with the portable versions of Put64/Get64/Swap64, which
|
||||
* are slow in general.
|
||||
*
|
||||
* Otherwise, an "auto-detect" of endianness is attempted below.
|
||||
* If the default handling doesn't work well, the user may insert
|
||||
* platform-specific code instead (e.g., for big-endian CPUs).
|
||||
*
|
||||
*/
|
||||
#if !defined(SKEIN_NEED_SWAP) /* no compile-time "override" for endianness */

#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */

/* Derive PLATFORM_BYTE_ORDER from BYTE_ORDER unless it was supplied externally.
   The little-endian test comes first, so it also wins when neither BYTE_ORDER nor
   LITTLE_ENDIAN is defined (both then evaluate to 0 in #if). */
#if !defined(PLATFORM_BYTE_ORDER)
#  if BYTE_ORDER == LITTLE_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  elif BYTE_ORDER == BIG_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  endif
#endif

/* special handler for IA64, which may be either endianness (?) */
/* here we assume little-endian, but this may need to be changed */
#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#  define PLATFORM_MUST_ALIGN (1)
#  if !defined(PLATFORM_BYTE_ORDER)
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#endif

#if !defined(PLATFORM_MUST_ALIGN)
#  define PLATFORM_MUST_ALIGN (0)
#endif

#if   PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
    /* big-endian CPUs: words have to be byte-swapped */
#  define SKEIN_NEED_SWAP   (1)
#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
    /* x86 and x86-64 CPUs (and other detected little-endian CPUs) */
#  define SKEIN_NEED_SWAP   (0)
#  if PLATFORM_MUST_ALIGN == 0 /* unaligned access is fine: use the "fast" versions */
#    define Skein_Put64_LSB_First(dst08,src64,bCnt) memcpy(dst08,src64,bCnt)
#    define Skein_Get64_LSB_First(dst64,src08,wCnt) memcpy(dst64,src08,8*(wCnt))
#  endif
#else
#  error "Skein needs endianness setting!"
#endif

#endif /* !defined(SKEIN_NEED_SWAP) */
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
* Provide any definitions still needed.
|
||||
******************************************************************
|
||||
*/
|
||||
/* Skein_Swap64(w64): reverses the byte order of a 64-bit word when SKEIN_NEED_SWAP
   is non-zero (big-endian), and is a no-op otherwise. The argument is evaluated
   several times in the swapping variant. */
#ifndef Skein_Swap64
#if     SKEIN_NEED_SWAP
#define Skein_Swap64(w64)                                        \
  ( ((((u64b_t)(w64))        & 0xFF) << 56) |   /* byte 0 -> 7 */ \
    ((((u64b_t)(w64)) >>  8  & 0xFF) << 48) |   /* byte 1 -> 6 */ \
    ((((u64b_t)(w64)) >> 16  & 0xFF) << 40) |   /* byte 2 -> 5 */ \
    ((((u64b_t)(w64)) >> 24  & 0xFF) << 32) |   /* byte 3 -> 4 */ \
    ((((u64b_t)(w64)) >> 32  & 0xFF) << 24) |   /* byte 4 -> 3 */ \
    ((((u64b_t)(w64)) >> 40  & 0xFF) << 16) |   /* byte 5 -> 2 */ \
    ((((u64b_t)(w64)) >> 48  & 0xFF) <<  8) |   /* byte 6 -> 1 */ \
    ((((u64b_t)(w64)) >> 56  & 0xFF)      ) )   /* byte 7 -> 0 */
#else
#define Skein_Swap64(w64)  (w64)
#endif
#endif  /* ifndef Skein_Swap64 */
|
||||
|
||||
|
||||
#ifndef Skein_Put64_LSB_First
|
||||
void Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt)
|
||||
#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */
|
||||
{ /* this version is fully portable (big-endian or little-endian), but slow */
|
||||
size_t n;
|
||||
|
||||
for (n=0;n<bCnt;n++)
|
||||
dst[n] = (u08b_t) (src[n>>3] >> (8*(n&7)));
|
||||
}
|
||||
#else
|
||||
; /* output only the function prototype */
|
||||
#endif
|
||||
#endif /* ifndef Skein_Put64_LSB_First */
|
||||
|
||||
|
||||
#ifndef Skein_Get64_LSB_First
|
||||
void Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt)
|
||||
#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */
|
||||
{ /* this version is fully portable (big-endian or little-endian), but slow */
|
||||
size_t n;
|
||||
|
||||
for (n=0;n<8*wCnt;n+=8)
|
||||
dst[n/8] = (((u64b_t) src[n ]) ) +
|
||||
(((u64b_t) src[n+1]) << 8) +
|
||||
(((u64b_t) src[n+2]) << 16) +
|
||||
(((u64b_t) src[n+3]) << 24) +
|
||||
(((u64b_t) src[n+4]) << 32) +
|
||||
(((u64b_t) src[n+5]) << 40) +
|
||||
(((u64b_t) src[n+6]) << 48) +
|
||||
(((u64b_t) src[n+7]) << 56) ;
|
||||
}
|
||||
#else
|
||||
; /* output only the function prototype */
|
||||
#endif
|
||||
#endif /* ifndef Skein_Get64_LSB_First */
|
||||
|
||||
#endif /* ifndef _SKEIN_PORT_H_ */
|
||||
146
src/crypto/cn/soft_aes.h
Normal file
146
src/crypto/cn/soft_aes.h
Normal file
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Additional permission under GNU GPL version 3 section 7
|
||||
*
|
||||
* If you modify this Program, or any covered work, by linking or combining
|
||||
* it with OpenSSL (or a modified version of that library), containing parts
|
||||
* covered by the terms of OpenSSL License and SSLeay License, the licensors
|
||||
* of this Program grant you additional permission to convey the resulting work.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* Parts of this file are originally copyright (c) 2014-2017, The Monero Project
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
|
||||
#if defined(XMRIG_ARM)
|
||||
# include "crypto/SSE2NEON.h"
|
||||
#elif defined(__GNUC__)
|
||||
# include <x86intrin.h>
|
||||
#else
|
||||
# include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include <inttypes.h>
|
||||
|
||||
|
||||
#define saes_data(w) {\
|
||||
w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
|
||||
w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
|
||||
w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
|
||||
w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
|
||||
w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
|
||||
w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
|
||||
w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
|
||||
w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
|
||||
w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
|
||||
w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
|
||||
w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
|
||||
w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
|
||||
w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
|
||||
w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
|
||||
w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
|
||||
w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
|
||||
w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
|
||||
w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
|
||||
w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
|
||||
w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
|
||||
w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
|
||||
w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
|
||||
w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
|
||||
w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
|
||||
w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
|
||||
w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
|
||||
w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
|
||||
w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
|
||||
w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
|
||||
w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
|
||||
w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
|
||||
w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
|
||||
|
||||
// Building blocks for the lookup tables generated from the S-box data.
// Every macro argument is parenthesized so that expression arguments
// (e.g. saes_f2(a | b)) are evaluated as a whole.

// Reduction polynomial applied when doubling overflows 8 bits
#define SAES_WPOLY           0x011b

// Packs four bytes into a 32-bit word, b0 being the least significant byte
#define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
    ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (uint32_t)(b0))

// x * 2 and x * 3 with reduction by SAES_WPOLY (x is expected to fit in 8 bits)
#define saes_f2(x)   (((x) << 1) ^ ((((x) >> 7) & 1) * SAES_WPOLY))
#define saes_f3(x)   (saes_f2(x) ^ (x))
// Identity, used to instantiate the plain S-box
#define saes_h0(x)   (x)

// Table words: the bytes { 2p, p, p, 3p } rotated by 0..3 byte positions
#define saes_u0(p)   saes_b2w(saes_f2(p),          (p),          (p), saes_f3(p))
#define saes_u1(p)   saes_b2w(saes_f3(p), saes_f2(p),          (p),          (p))
#define saes_u2(p)   saes_b2w(         (p), saes_f3(p), saes_f2(p),          (p))
#define saes_u3(p)   saes_b2w(         (p),          (p), saes_f3(p), saes_f2(p))
|
||||
|
||||
alignas(16) const uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) };
|
||||
alignas(16) const uint8_t saes_sbox[256] = saes_data(saes_h0);
|
||||
|
||||
// Table-driven software round function: combines four saes_table lookups per
// output word and XORs the result with "key".
// NOTE(review): by its name this presumably mirrors the AESENC instruction - confirm.
//
// in  - four 32-bit input words
// key - value XORed into the result
static inline __m128i soft_aesenc(const uint32_t* in, __m128i key)
{
    const uint32_t x0 = in[0];
    const uint32_t x1 = in[1];
    const uint32_t x2 = in[2];
    const uint32_t x3 = in[3];

    // Output word i takes byte 0 of x[i], byte 1 of x[i+1], byte 2 of x[i+2]
    // and byte 3 of x[i+3] (indices modulo 4)
    const uint32_t w0 = saes_table[0][x0 & 0xff] ^ saes_table[1][(x1 >> 8) & 0xff] ^ saes_table[2][(x2 >> 16) & 0xff] ^ saes_table[3][x3 >> 24];
    const uint32_t w1 = saes_table[0][x1 & 0xff] ^ saes_table[1][(x2 >> 8) & 0xff] ^ saes_table[2][(x3 >> 16) & 0xff] ^ saes_table[3][x0 >> 24];
    const uint32_t w2 = saes_table[0][x2 & 0xff] ^ saes_table[1][(x3 >> 8) & 0xff] ^ saes_table[2][(x0 >> 16) & 0xff] ^ saes_table[3][x1 >> 24];
    const uint32_t w3 = saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24];

    // _mm_set_epi32 takes the most significant lane first
    const __m128i mixed = _mm_set_epi32(w3, w2, w1, w0);

    return _mm_xor_si128(mixed, key);
}
|
||||
|
||||
// Same table-driven round function as the pointer overload, taking the four
// input words from the 32-bit lanes of "in".
//
// in  - input block
// key - value XORed into the result
static inline __m128i soft_aesenc(__m128i in, __m128i key)
{
    // Extract the four 32-bit lanes (0x55/0xAA/0xFF broadcast lane 1/2/3 to lane 0)
    const uint32_t x0 = _mm_cvtsi128_si32(in);
    const uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
    const uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA));
    const uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF));

    // Output word i takes byte 0 of x[i], byte 1 of x[i+1], byte 2 of x[i+2]
    // and byte 3 of x[i+3] (indices modulo 4)
    const uint32_t w0 = saes_table[0][x0 & 0xff] ^ saes_table[1][(x1 >> 8) & 0xff] ^ saes_table[2][(x2 >> 16) & 0xff] ^ saes_table[3][x3 >> 24];
    const uint32_t w1 = saes_table[0][x1 & 0xff] ^ saes_table[1][(x2 >> 8) & 0xff] ^ saes_table[2][(x3 >> 16) & 0xff] ^ saes_table[3][x0 >> 24];
    const uint32_t w2 = saes_table[0][x2 & 0xff] ^ saes_table[1][(x3 >> 8) & 0xff] ^ saes_table[2][(x0 >> 16) & 0xff] ^ saes_table[3][x1 >> 24];
    const uint32_t w3 = saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24];

    // _mm_set_epi32 takes the most significant lane first
    const __m128i mixed = _mm_set_epi32(w3, w2, w1, w0);

    return _mm_xor_si128(mixed, key);
}
|
||||
|
||||
static inline uint32_t sub_word(uint32_t key)
|
||||
{
|
||||
return (saes_sbox[key >> 24 ] << 24) |
|
||||
(saes_sbox[(key >> 16) & 0xff] << 16 ) |
|
||||
(saes_sbox[(key >> 8) & 0xff] << 8 ) |
|
||||
saes_sbox[key & 0xff];
|
||||
}
|
||||
|
||||
#ifndef HAVE_ROTR
|
||||
// Rotates the 32-bit "value" right by "amount" bits.
// The count is reduced modulo 32 first, so amounts of 32 or more are well defined
// instead of shifting by the full register width (undefined behaviour).
static inline uint32_t _rotr(uint32_t value, uint32_t amount)
{
    const uint32_t count = amount & 31;
    return (value >> count) | (value << ((32 - count) & 31));
}
|
||||
#endif
|
||||
|
||||
template<uint8_t rcon>
|
||||
static inline __m128i soft_aeskeygenassist(__m128i key)
|
||||
{
|
||||
const uint32_t X1 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)));
|
||||
const uint32_t X3 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)));
|
||||
return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3, _rotr(X1, 8) ^ rcon, X1);
|
||||
}
|
||||
Reference in New Issue
Block a user