Mirror of https://github.com/xmrig/xmrig.git (synced 2025-12-24 21:32:47 -05:00)
Merge xmrig v6.16.1 into master
@@ -354,7 +354,7 @@ xmrig::cn_hash_fun xmrig::CnHash::fn(const Algorithm &algorithm, AlgoVariant av,
 
 #   ifdef XMRIG_ALGO_CN_HEAVY
     // cn-heavy optimization for Zen3 CPUs
-    if ((av == AV_SINGLE) && (assembly != Assembly::NONE) && (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3)) {
+    if ((av == AV_SINGLE) && (assembly != Assembly::NONE) && (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3) && (Cpu::info()->model() == 0x21)) {
         switch (algorithm.id()) {
         case Algorithm::CN_HEAVY_0:
             return cryptonight_single_hash<Algorithm::CN_HEAVY_0, false, 3>;
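The added `(Cpu::info()->model() == 0x21)` clause narrows the cn-heavy Zen3 path to model 0x21 parts; other Zen3 models fall through to the generic code. As a minimal, hedged sketch of where such family/model numbers come from (this is not XMRig's ICpuInfo implementation), CPUID leaf 1 encodes them as follows:

#include <cpuid.h>
#include <cstdio>

// Standalone sketch: derive the family/model pair checked above from
// CPUID leaf 1. AMD uses the extended fields whenever the base family
// is 0xF, so Zen3 reports family 0x19; model 0x21 is the Vermeer parts.
int main()
{
    unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
    __get_cpuid(1, &eax, &ebx, &ecx, &edx);

    const unsigned base_family = (eax >> 8)  & 0x0F;
    const unsigned base_model  = (eax >> 4)  & 0x0F;
    const unsigned ext_family  = (eax >> 20) & 0xFF;
    const unsigned ext_model   = (eax >> 16) & 0x0F;

    const unsigned family = (base_family == 0x0F) ? base_family + ext_family : base_family;
    const unsigned model  = (base_family == 0x0F || base_family == 0x06)
                                ? (ext_model << 4) | base_model : base_model;

    std::printf("family=0x%X model=0x%X\n", family, model);
    return 0;
}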
@@ -466,6 +466,29 @@ const static uint8_t astrobwt_dero_test_out[256] = {
 #endif
+
+
+#ifdef XMRIG_ALGO_GHOSTRIDER
+// "GhostRider"
+const static uint8_t test_output_gr[256] = {
+    0x42, 0x17, 0x0C, 0xC1, 0x85, 0xE6, 0x76, 0x3C, 0xC7, 0xCB, 0x27, 0xC4, 0x17, 0x39, 0x2D, 0xE2,
+    0x29, 0x6B, 0x40, 0x66, 0x85, 0xA4, 0xE3, 0xD3, 0x8C, 0xE9, 0xA5, 0x8F, 0x10, 0xFC, 0x81, 0xE4,
+    0x90, 0x56, 0xF2, 0x9E, 0x00, 0xD0, 0xF8, 0xA1, 0x88, 0x82, 0x86, 0xC0, 0x86, 0x04, 0x6B, 0x0E,
+    0x9A, 0xDB, 0xDB, 0xFD, 0x23, 0x16, 0x77, 0x94, 0xFE, 0x58, 0x93, 0x05, 0x10, 0x3F, 0x27, 0x75,
+    0x51, 0x44, 0xF3, 0x5F, 0xE2, 0xF9, 0x61, 0xBE, 0xC0, 0x30, 0xB5, 0x8E, 0xB1, 0x1B, 0xA1, 0xF7,
+    0x06, 0x4E, 0xF1, 0x6A, 0xFD, 0xA5, 0x44, 0x8E, 0x64, 0x47, 0x8C, 0x67, 0x51, 0xE2, 0x5C, 0x55,
+    0x3E, 0x39, 0xA6, 0xA5, 0xF7, 0xB8, 0xD0, 0x5E, 0xE2, 0xBF, 0x92, 0x44, 0xD9, 0xAA, 0x76, 0x22,
+    0xE3, 0x3E, 0x15, 0x96, 0xD8, 0x6A, 0x78, 0x2D, 0xA9, 0x77, 0x24, 0x1A, 0x4B, 0xE7, 0x5A, 0x2E,
+    0x89, 0x77, 0xAE, 0x92, 0xE4, 0xA4, 0x2D, 0xAF, 0x0B, 0x27, 0x09, 0xB2, 0x5F, 0x95, 0x61, 0xA9,
+    0xA8, 0xBE, 0x5D, 0x39, 0xBE, 0x41, 0x5F, 0x9C, 0x67, 0x28, 0x48, 0x4F, 0xAE, 0x2A, 0x50, 0x2B,
+    0xB8, 0xC7, 0x42, 0x73, 0x51, 0x60, 0x59, 0xD8, 0x9C, 0xBA, 0x22, 0x2F, 0x8E, 0x34, 0xDE, 0xC8,
+    0x1B, 0xAE, 0x9E, 0xBD, 0xF7, 0xE8, 0xFD, 0x8A, 0x97, 0xBE, 0xF0, 0x47, 0xAC, 0x27, 0xDD, 0x28,
+    0xC9, 0x28, 0xA8, 0x7B, 0x2A, 0xB8, 0x90, 0x3E, 0xCA, 0xB4, 0x78, 0x44, 0xCE, 0xCD, 0x91, 0xEC,
+    0xC2, 0x5A, 0x17, 0x59, 0x7C, 0x14, 0xF8, 0x95, 0x28, 0x14, 0xC3, 0xAD, 0xC4, 0xE1, 0x13, 0x5A,
+    0xC4, 0xA7, 0xC7, 0x77, 0xAD, 0xF8, 0x09, 0x61, 0x16, 0xBB, 0xAA, 0x7E, 0xAB, 0xC3, 0x00, 0x25,
+    0xBA, 0xA8, 0x97, 0xC7, 0x7D, 0x38, 0x46, 0x0E, 0x59, 0xAC, 0xCB, 0xAE, 0xFE, 0x3C, 0x6F, 0x01
+};
+#endif
 
 
 } // namespace xmrig
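test_output_gr is a 256-byte table of expected digests (eight 32-byte entries) for the GhostRider self-test. A typical harness hashes known input and compares each digest against its slice of the table; a hedged sketch with hypothetical names (only test_output_gr comes from this diff):

#include <cstdint>
#include <cstring>

// Hypothetical self-test harness: hash `num_digests` known inputs and
// compare each 32-byte result against the expected table.
static bool verify_test_vector(void (*hash_fn)(const uint8_t*, size_t, uint8_t*),
                               const uint8_t* inputs, size_t input_size,
                               const uint8_t* expected, size_t num_digests)
{
    uint8_t digest[32];
    for (size_t i = 0; i < num_digests; ++i) {
        hash_fn(inputs + i * input_size, input_size, digest);
        if (std::memcmp(digest, expected + i * 32, sizeof(digest)) != 0) {
            return false;   // mismatch: reject this code path
        }
    }
    return true;
}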
@@ -43,6 +43,11 @@
 #include "crypto/cn/soft_aes.h"
 
 
+#ifdef XMRIG_VAES
+#   include "crypto/cn/CryptoNight_x86_vaes.h"
+#endif
+
+
 extern "C"
 {
 #include "crypto/cn/c_groestl.h"
@@ -289,6 +294,13 @@ static NOINLINE void cn_explode_scratchpad(cryptonight_ctx *ctx)
 {
     constexpr CnAlgo<ALGO> props;
 
+#   ifdef XMRIG_VAES
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_explode_scratchpad_vaes<ALGO>(ctx);
+        return;
+    }
+#   endif
+
     constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);
 
     __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
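The new branch is gated on Cpu::info()->hasVAES(). For reference, VAES support is advertised in CPUID leaf 7, sub-leaf 0, ECX bit 9; a minimal standalone check (not XMRig's detection code):

#include <cpuid.h>

// Returns true when the CPU advertises the VAES extension
// (CPUID.(EAX=7,ECX=0):ECX bit 9).
static bool has_vaes()
{
    unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0) {
        return false;   // leaf 7 not supported
    }
    return (ecx & (1u << 9)) != 0;
}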
@@ -341,7 +353,7 @@ static NOINLINE void cn_explode_scratchpad(cryptonight_ctx *ctx)
     constexpr int output_increment = (64 << interleave) / sizeof(__m128i);
     constexpr int prefetch_dist = 2048 / sizeof(__m128i);
 
-    __m128i* e = output + N - prefetch_dist;
+    __m128i* e = output + (N << interleave) - prefetch_dist;
     __m128i* prefetch_ptr = output + prefetch_dist;
 
     for (int i = 0; i < 2; ++i) {
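This one-line change is a bug fix: with `interleave` lanes the output region spans N << interleave slots, so the old bound stopped the fill early. A worked example with illustrative numbers (a 2 MiB scratchpad per lane, two interleaved lanes; not values taken from the source):

#include <cstddef>

// Illustrative only: N = 2 MiB / 16 B = 131072 __m128i slots per lane.
constexpr size_t N = 131072;
constexpr int interleave = 1;                  // two interleaved lanes
constexpr int prefetch_dist = 2048 / 16;       // 128 slots
static_assert(N - prefetch_dist == 130944, "old bound: covers one lane only");
static_assert((N << interleave) - prefetch_dist == 262016, "fixed bound: both lanes");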
@@ -396,6 +408,13 @@ static NOINLINE void cn_implode_scratchpad(cryptonight_ctx *ctx)
 {
     constexpr CnAlgo<ALGO> props;
 
+#   ifdef XMRIG_VAES
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_implode_scratchpad_vaes<ALGO>(ctx);
+        return;
+    }
+#   endif
+
 #   ifdef XMRIG_ALGO_CN_GPU
     constexpr bool IS_HEAVY = props.isHeavy() || ALGO == Algorithm::CN_GPU;
 #   else
@@ -1070,8 +1089,17 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
         ctx[0]->first_half = true;
         ctx[1]->first_half = true;
     }
-    cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
-    cn_explode_scratchpad<ALGO, false, 0>(ctx[1]);
+
+#   ifdef XMRIG_VAES
+    if (!props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_explode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+    }
+    else
+#   endif
+    {
+        cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
+        cn_explode_scratchpad<ALGO, false, 0>(ctx[1]);
+    }
 
     if (ALGO == Algorithm::CN_2) {
         cnv2_double_mainloop_sandybridge_asm(ctx);
@@ -1110,8 +1138,16 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
         ctx[0]->generated_code(ctx);
     }
 
-    cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
-    cn_implode_scratchpad<ALGO, false, 0>(ctx[1]);
+#   ifdef XMRIG_VAES
+    if (!props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_implode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+    }
+    else
+#   endif
+    {
+        cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
+        cn_implode_scratchpad<ALGO, false, 0>(ctx[1]);
+    }
 
     keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
     keccakf(reinterpret_cast<uint64_t*>(ctx[1]->state), 24);
@@ -1166,8 +1202,17 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
         ctx[0]->first_half = true;
         ctx[1]->first_half = true;
     }
-    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
-    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+
+#   ifdef XMRIG_VAES
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_explode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+    }
+    else
+#   endif
+    {
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+    }
 
     uint64_t al0 = h0[0] ^ h0[4];
     uint64_t al1 = h1[0] ^ h1[4];
@@ -1362,8 +1407,16 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
         bx10 = cx1;
     }
 
-    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
-    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+#   ifdef XMRIG_VAES
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_implode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+    }
+    else
+#   endif
+    {
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+    }
 
     keccakf(h0, 24);
     keccakf(h1, 24);
@@ -1424,10 +1477,19 @@ void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, u
         ctx[3]->first_half = true;
     }
 
-    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
-    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
-    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
-    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
+#   ifdef XMRIG_VAES
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_explode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+        cn_explode_scratchpad_vaes_double<ALGO>(ctx[2], ctx[3]);
+    }
+    else
+#   endif
+    {
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
+    }
 
     uint64_t al0 = h0[0] ^ h0[4];
     uint64_t al1 = h1[0] ^ h1[4];
@@ -1548,10 +1610,19 @@ void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, u
         if (!SOFT_AES) cx3 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l3[idx3 & MASK]));
     }
 
-    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
-    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
-    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
-    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
+#   ifdef XMRIG_VAES
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_implode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+        cn_implode_scratchpad_vaes_double<ALGO>(ctx[2], ctx[3]);
+    }
+    else
+#   endif
+    {
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
+    }
 
     keccakf(h0, 24);
     keccakf(h1, 24);
@@ -1788,7 +1859,20 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size
         if (props.half_mem()) {
             ctx[i]->first_half = true;
         }
-        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
     }
 
+#   ifdef XMRIG_VAES
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_explode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+        cn_explode_scratchpad_vaes_double<ALGO>(ctx[2], ctx[3]);
+    }
+    else
+#   endif
+    {
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
+    }
+
     uint8_t* l0 = ctx[0]->memory;
@@ -1840,8 +1924,21 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size
         CN_STEP4(3, ax3, bx30, bx31, cx3, l3, mc3, ptr3, idx3);
     }
 
+#   ifdef XMRIG_VAES
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_implode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+        cn_implode_scratchpad_vaes_double<ALGO>(ctx[2], ctx[3]);
+    }
+    else
+#   endif
+    {
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
+    }
+
     for (size_t i = 0; i < 4; i++) {
-        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
         keccakf(reinterpret_cast<uint64_t*>(ctx[i]->state), 24);
         extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i);
     }

530 src/crypto/cn/CryptoNight_x86_vaes.cpp (Normal file)
@@ -0,0 +1,530 @@
/* XMRig
 * Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
 * Copyright 2012-2014 pooler <pooler@litecoinpool.org>
 * Copyright 2014 Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
 * Copyright 2016 Jay D Dee <jayddee246@gmail.com>
 * Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018 Lee Clagett <https://github.com/vtnerd>
 * Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
 * Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
 *
 * This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *   GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "CryptoNight_x86_vaes.h"
#include "CryptoNight_monero.h"
#include "CryptoNight.h"


#ifdef __GNUC__
#   include <x86intrin.h>
#if !defined(__clang__) && !defined(__ICC) && __GNUC__ < 10
static inline __m256i
__attribute__((__always_inline__))
_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr)
{
    return _mm256_inserti128_si256(
        _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1);
}

static inline void
__attribute__((__always_inline__))
_mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a)
{
    _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
    _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1));
}
#endif
#else
#   include <intrin.h>
#endif
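These fallbacks backfill _mm256_loadu2_m128i / _mm256_storeu2_m128i, which pre-10 GCC lacks. A usage sketch (compile with -mavx2) showing the (hiaddr, loaddr) argument order the rest of the file relies on:

#include <immintrin.h>
#include <cstdint>

// Round-trip two unrelated 128-bit blocks through one 256-bit register:
// the low lane comes from loaddr, the high lane from hiaddr.
int lane_order_demo()
{
    alignas(16) uint64_t lo[2] = { 1, 2 };
    alignas(16) uint64_t hi[2] = { 3, 4 };
    alignas(16) uint64_t out_lo[2];
    alignas(16) uint64_t out_hi[2];

    const __m256i v = _mm256_loadu2_m128i(reinterpret_cast<const __m128i*>(hi),
                                          reinterpret_cast<const __m128i*>(lo));
    _mm256_storeu2_m128i(reinterpret_cast<__m128i*>(out_hi),
                         reinterpret_cast<__m128i*>(out_lo), v);

    return (out_lo[0] == 1 && out_hi[0] == 3) ? 0 : 1;   // 0 on success
}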

// This will shift and xor tmp1 into itself as 4 32-bit vals such as
// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
static FORCEINLINE __m128i sl_xor(__m128i tmp1)
{
    __m128i tmp4;
    tmp4 = _mm_slli_si128(tmp1, 0x04);
    tmp1 = _mm_xor_si128(tmp1, tmp4);
    tmp4 = _mm_slli_si128(tmp4, 0x04);
    tmp1 = _mm_xor_si128(tmp1, tmp4);
    tmp4 = _mm_slli_si128(tmp4, 0x04);
    tmp1 = _mm_xor_si128(tmp1, tmp4);
    return tmp1;
}
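sl_xor is the word-sliding XOR of AES key expansion: each 32-bit word becomes the XOR of itself and all lower words. A scalar restatement (assuming element 0 is the lowest word, matching _mm_slli_si128's byte shift toward higher elements):

#include <cstdint>

// Scalar equivalent of sl_xor() for one 4x32-bit block:
// (a1 a2 a3 a4) -> (a1, a2^a1, a3^a2^a1, a4^a3^a2^a1).
static void sl_xor_scalar(uint32_t w[4])
{
    w[1] ^= w[0];
    w[2] ^= w[1];   // now a3^a2^a1
    w[3] ^= w[2];   // now a4^a3^a2^a1
}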
template<uint8_t rcon>
static FORCEINLINE void aes_genkey_sub(__m128i* xout0, __m128i* xout2)
{
    __m128i xout1 = _mm_aeskeygenassist_si128(*xout2, rcon);
    xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem
    *xout0 = sl_xor(*xout0);
    *xout0 = _mm_xor_si128(*xout0, xout1);
    xout1 = _mm_aeskeygenassist_si128(*xout0, 0x00);
    xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem
    *xout2 = sl_xor(*xout2);
    *xout2 = _mm_xor_si128(*xout2, xout1);
}
static NOINLINE void vaes_genkey(const __m128i* memory, __m256i* k0, __m256i* k1, __m256i* k2, __m256i* k3, __m256i* k4, __m256i* k5, __m256i* k6, __m256i* k7, __m256i* k8, __m256i* k9)
{
    __m128i xout0 = _mm_load_si128(memory);
    __m128i xout2 = _mm_load_si128(memory + 1);
    *k0 = _mm256_set_m128i(xout0, xout0);
    *k1 = _mm256_set_m128i(xout2, xout2);

    aes_genkey_sub<0x01>(&xout0, &xout2);
    *k2 = _mm256_set_m128i(xout0, xout0);
    *k3 = _mm256_set_m128i(xout2, xout2);

    aes_genkey_sub<0x02>(&xout0, &xout2);
    *k4 = _mm256_set_m128i(xout0, xout0);
    *k5 = _mm256_set_m128i(xout2, xout2);

    aes_genkey_sub<0x04>(&xout0, &xout2);
    *k6 = _mm256_set_m128i(xout0, xout0);
    *k7 = _mm256_set_m128i(xout2, xout2);

    aes_genkey_sub<0x08>(&xout0, &xout2);
    *k8 = _mm256_set_m128i(xout0, xout0);
    *k9 = _mm256_set_m128i(xout2, xout2);
}


static NOINLINE void vaes_genkey_double(const __m128i* memory1, const __m128i* memory2, __m256i* k0, __m256i* k1, __m256i* k2, __m256i* k3, __m256i* k4, __m256i* k5, __m256i* k6, __m256i* k7, __m256i* k8, __m256i* k9)
{
    __m128i xout0 = _mm_load_si128(memory1);
    __m128i xout1 = _mm_load_si128(memory1 + 1);
    __m128i xout2 = _mm_load_si128(memory2);
    __m128i xout3 = _mm_load_si128(memory2 + 1);
    *k0 = _mm256_set_m128i(xout2, xout0);
    *k1 = _mm256_set_m128i(xout3, xout1);

    aes_genkey_sub<0x01>(&xout0, &xout1);
    aes_genkey_sub<0x01>(&xout2, &xout3);
    *k2 = _mm256_set_m128i(xout2, xout0);
    *k3 = _mm256_set_m128i(xout3, xout1);

    aes_genkey_sub<0x02>(&xout0, &xout1);
    aes_genkey_sub<0x02>(&xout2, &xout3);
    *k4 = _mm256_set_m128i(xout2, xout0);
    *k5 = _mm256_set_m128i(xout3, xout1);

    aes_genkey_sub<0x04>(&xout0, &xout1);
    aes_genkey_sub<0x04>(&xout2, &xout3);
    *k6 = _mm256_set_m128i(xout2, xout0);
    *k7 = _mm256_set_m128i(xout3, xout1);

    aes_genkey_sub<0x08>(&xout0, &xout1);
    aes_genkey_sub<0x08>(&xout2, &xout3);
    *k8 = _mm256_set_m128i(xout2, xout0);
    *k9 = _mm256_set_m128i(xout3, xout1);
}
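vaes_genkey_double expands two independent AES key schedules at once, packing ctx1's round keys in the low 128-bit lane and ctx2's in the high lane. Note the argument order: _mm256_set_m128i takes (high, low). A small sketch of the lane convention (illustrative values, not part of the diff):

#include <immintrin.h>
#include <cstdint>

// _mm256_set_m128i(hi, lo) puts its FIRST argument in the high lane,
// matching the (hiaddr, loaddr) order of _mm256_loadu2_m128i above.
int lane_packing_demo()
{
    const __m128i lo = _mm_set1_epi32(1);      // stands in for ctx1 key words
    const __m128i hi = _mm_set1_epi32(2);      // stands in for ctx2 key words
    const __m256i k  = _mm256_set_m128i(hi, lo);

    alignas(32) uint32_t w[8];
    _mm256_store_si256(reinterpret_cast<__m256i*>(w), k);
    return (w[0] == 1 && w[7] == 2) ? 0 : 1;   // low lane ctx1, high lane ctx2
}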
static FORCEINLINE void vaes_round(__m256i key, __m256i& x01, __m256i& x23, __m256i& x45, __m256i& x67)
{
    x01 = _mm256_aesenc_epi128(x01, key);
    x23 = _mm256_aesenc_epi128(x23, key);
    x45 = _mm256_aesenc_epi128(x45, key);
    x67 = _mm256_aesenc_epi128(x67, key);
}


static FORCEINLINE void vaes_round(__m256i key, __m256i& x0, __m256i& x1, __m256i& x2, __m256i& x3, __m256i& x4, __m256i& x5, __m256i& x6, __m256i& x7)
{
    x0 = _mm256_aesenc_epi128(x0, key);
    x1 = _mm256_aesenc_epi128(x1, key);
    x2 = _mm256_aesenc_epi128(x2, key);
    x3 = _mm256_aesenc_epi128(x3, key);
    x4 = _mm256_aesenc_epi128(x4, key);
    x5 = _mm256_aesenc_epi128(x5, key);
    x6 = _mm256_aesenc_epi128(x6, key);
    x7 = _mm256_aesenc_epi128(x7, key);
}
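Each _mm256_aesenc_epi128 performs one independent AES round per 128-bit lane, so the four-register overload advances eight 16-byte blocks per call and the eight-register overload sixteen. An equivalence sketch (compile with -mvaes -mavx2; this restates the instruction's semantics, it is not code from the diff):

#include <immintrin.h>

// One VAES round equals two legacy AES-NI rounds, one per lane.
static __m256i round_vaes(__m256i x, __m256i key)
{
    return _mm256_aesenc_epi128(x, key);
}

static __m256i round_aesni_pair(__m256i x, __m256i key)
{
    const __m128i lo = _mm_aesenc_si128(_mm256_castsi256_si128(x),
                                        _mm256_castsi256_si128(key));
    const __m128i hi = _mm_aesenc_si128(_mm256_extracti128_si256(x, 1),
                                        _mm256_extracti128_si256(key, 1));
    return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}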
namespace xmrig {


template<Algorithm::Id ALGO>
NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx)
{
    constexpr CnAlgo<ALGO> props;

    constexpr size_t N = (props.memory() / sizeof(__m256i)) / (props.half_mem() ? 2 : 1);

    __m256i xin01, xin23, xin45, xin67;
    __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;

    const __m128i* input = reinterpret_cast<const __m128i*>(ctx->state);
    __m256i* output = reinterpret_cast<__m256i*>(ctx->memory);

    vaes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);

    if (props.half_mem() && !ctx->first_half) {
        const __m256i* p = reinterpret_cast<const __m256i*>(ctx->save_state);
        xin01 = _mm256_load_si256(p + 0);
        xin23 = _mm256_load_si256(p + 1);
        xin45 = _mm256_load_si256(p + 2);
        xin67 = _mm256_load_si256(p + 3);
    }
    else {
        xin01 = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + 4));
        xin23 = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + 6));
        xin45 = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + 8));
        xin67 = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + 10));
    }

    constexpr int output_increment = 64 / sizeof(__m256i);
    constexpr int prefetch_dist = 2048 / sizeof(__m256i);

    __m256i* e = output + N - prefetch_dist;
    __m256i* prefetch_ptr = output + prefetch_dist;

    for (int i = 0; i < 2; ++i) {
        do {
            _mm_prefetch((const char*)(prefetch_ptr), _MM_HINT_T0);
            _mm_prefetch((const char*)(prefetch_ptr + output_increment), _MM_HINT_T0);

            vaes_round(k0, xin01, xin23, xin45, xin67);
            vaes_round(k1, xin01, xin23, xin45, xin67);
            vaes_round(k2, xin01, xin23, xin45, xin67);
            vaes_round(k3, xin01, xin23, xin45, xin67);
            vaes_round(k4, xin01, xin23, xin45, xin67);
            vaes_round(k5, xin01, xin23, xin45, xin67);
            vaes_round(k6, xin01, xin23, xin45, xin67);
            vaes_round(k7, xin01, xin23, xin45, xin67);
            vaes_round(k8, xin01, xin23, xin45, xin67);
            vaes_round(k9, xin01, xin23, xin45, xin67);

            _mm256_store_si256(output + 0, xin01);
            _mm256_store_si256(output + 1, xin23);

            _mm256_store_si256(output + output_increment + 0, xin45);
            _mm256_store_si256(output + output_increment + 1, xin67);

            output += output_increment * 2;
            prefetch_ptr += output_increment * 2;
        } while (output < e);
        e += prefetch_dist;
        prefetch_ptr = output;
    }

    if (props.half_mem() && ctx->first_half) {
        __m256i* p = reinterpret_cast<__m256i*>(ctx->save_state);
        _mm256_store_si256(p + 0, xin01);
        _mm256_store_si256(p + 1, xin23);
        _mm256_store_si256(p + 2, xin45);
        _mm256_store_si256(p + 3, xin67);
    }

    _mm256_zeroupper();
}
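The loop above uses a two-pass prefetch idiom: the main pass stops prefetch_dist short of the end so _mm_prefetch never targets memory past the scratchpad, then the tail pass points prefetches back at memory already written. A generic restatement (a sketch assuming n > 2 * dist, not the AES code itself):

#include <xmmintrin.h>
#include <cstddef>

static void fill_with_prefetch(float* buf, size_t n, size_t dist)
{
    float* end      = buf + n - dist;   // stop short of the true end
    float* prefetch = buf + dist;

    for (int pass = 0; pass < 2; ++pass) {
        while (buf < end) {
            _mm_prefetch(reinterpret_cast<const char*>(prefetch), _MM_HINT_T0);
            *buf++ = 0.0f;              // stand-in for the AES round + store
            ++prefetch;
        }
        end += dist;                    // extend to the true end for the tail
        prefetch = buf;                 // keep prefetches inside the buffer
    }
}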

template<Algorithm::Id ALGO>
NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2)
{
    constexpr CnAlgo<ALGO> props;

    constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);

    __m256i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
    __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;

    const __m128i* input1 = reinterpret_cast<const __m128i*>(ctx1->state);
    const __m128i* input2 = reinterpret_cast<const __m128i*>(ctx2->state);

    __m128i* output1 = reinterpret_cast<__m128i*>(ctx1->memory);
    __m128i* output2 = reinterpret_cast<__m128i*>(ctx2->memory);

    vaes_genkey_double(input1, input2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);

    {
        const bool b = props.half_mem() && !ctx1->first_half && !ctx2->first_half;
        const __m128i* p1 = b ? reinterpret_cast<const __m128i*>(ctx1->save_state) : (input1 + 4);
        const __m128i* p2 = b ? reinterpret_cast<const __m128i*>(ctx2->save_state) : (input2 + 4);
        xin0 = _mm256_loadu2_m128i(p2 + 0, p1 + 0);
        xin1 = _mm256_loadu2_m128i(p2 + 1, p1 + 1);
        xin2 = _mm256_loadu2_m128i(p2 + 2, p1 + 2);
        xin3 = _mm256_loadu2_m128i(p2 + 3, p1 + 3);
        xin4 = _mm256_loadu2_m128i(p2 + 4, p1 + 4);
        xin5 = _mm256_loadu2_m128i(p2 + 5, p1 + 5);
        xin6 = _mm256_loadu2_m128i(p2 + 6, p1 + 6);
        xin7 = _mm256_loadu2_m128i(p2 + 7, p1 + 7);
    }

    constexpr int output_increment = 64 / sizeof(__m128i);
    constexpr int prefetch_dist = 2048 / sizeof(__m128i);

    __m128i* e = output1 + N - prefetch_dist;
    __m128i* prefetch_ptr1 = output1 + prefetch_dist;
    __m128i* prefetch_ptr2 = output2 + prefetch_dist;

    for (int i = 0; i < 2; ++i) {
        do {
            _mm_prefetch((const char*)(prefetch_ptr1), _MM_HINT_T0);
            _mm_prefetch((const char*)(prefetch_ptr1 + output_increment), _MM_HINT_T0);
            _mm_prefetch((const char*)(prefetch_ptr2), _MM_HINT_T0);
            _mm_prefetch((const char*)(prefetch_ptr2 + output_increment), _MM_HINT_T0);

            vaes_round(k0, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
            vaes_round(k1, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
            vaes_round(k2, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
            vaes_round(k3, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
            vaes_round(k4, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
            vaes_round(k5, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
            vaes_round(k6, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
            vaes_round(k7, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
            vaes_round(k8, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
            vaes_round(k9, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);

            _mm256_storeu2_m128i(output2 + 0, output1 + 0, xin0);
            _mm256_storeu2_m128i(output2 + 1, output1 + 1, xin1);
            _mm256_storeu2_m128i(output2 + 2, output1 + 2, xin2);
            _mm256_storeu2_m128i(output2 + 3, output1 + 3, xin3);

            _mm256_storeu2_m128i(output2 + output_increment + 0, output1 + output_increment + 0, xin4);
            _mm256_storeu2_m128i(output2 + output_increment + 1, output1 + output_increment + 1, xin5);
            _mm256_storeu2_m128i(output2 + output_increment + 2, output1 + output_increment + 2, xin6);
            _mm256_storeu2_m128i(output2 + output_increment + 3, output1 + output_increment + 3, xin7);

            output1 += output_increment * 2;
            prefetch_ptr1 += output_increment * 2;
            output2 += output_increment * 2;
            prefetch_ptr2 += output_increment * 2;
        } while (output1 < e);
        e += prefetch_dist;
        prefetch_ptr1 = output1;
        prefetch_ptr2 = output2;
    }

    if (props.half_mem() && ctx1->first_half && ctx2->first_half) {
        __m128i* p1 = reinterpret_cast<__m128i*>(ctx1->save_state);
        __m128i* p2 = reinterpret_cast<__m128i*>(ctx2->save_state);
        _mm256_storeu2_m128i(p2 + 0, p1 + 0, xin0);
        _mm256_storeu2_m128i(p2 + 1, p1 + 1, xin1);
        _mm256_storeu2_m128i(p2 + 2, p1 + 2, xin2);
        _mm256_storeu2_m128i(p2 + 3, p1 + 3, xin3);
        _mm256_storeu2_m128i(p2 + 4, p1 + 4, xin4);
        _mm256_storeu2_m128i(p2 + 5, p1 + 5, xin5);
        _mm256_storeu2_m128i(p2 + 6, p1 + 6, xin6);
        _mm256_storeu2_m128i(p2 + 7, p1 + 7, xin7);
    }

    _mm256_zeroupper();
}
template<Algorithm::Id ALGO>
NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx)
{
    constexpr CnAlgo<ALGO> props;

    constexpr size_t N = (props.memory() / sizeof(__m256i)) / (props.half_mem() ? 2 : 1);

    __m256i xout01, xout23, xout45, xout67;
    __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;

    const __m256i* input = reinterpret_cast<const __m256i*>(ctx->memory);
    __m256i* output = reinterpret_cast<__m256i*>(ctx->state);

    vaes_genkey(reinterpret_cast<__m128i*>(output) + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);

    xout01 = _mm256_load_si256(output + 2);
    xout23 = _mm256_load_si256(output + 3);
    xout45 = _mm256_load_si256(output + 4);
    xout67 = _mm256_load_si256(output + 5);

    const __m256i* input_begin = input;
    for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) {
        if (props.half_mem() && (part == 1)) {
            input = input_begin;
            ctx->first_half = false;
            cn_explode_scratchpad_vaes<ALGO>(ctx);
        }

        for (size_t i = 0; i < N;) {
            xout01 = _mm256_xor_si256(xout01, input[0]);
            xout23 = _mm256_xor_si256(xout23, input[1]);

            constexpr int input_increment = 64 / sizeof(__m256i);

            xout45 = _mm256_xor_si256(xout45, input[input_increment]);
            xout67 = _mm256_xor_si256(xout67, input[input_increment + 1]);

            input += input_increment * 2;
            i += 4;

            if (i < N) {
                _mm_prefetch((const char*)(input), _MM_HINT_T0);
                _mm_prefetch((const char*)(input + input_increment), _MM_HINT_T0);
            }

            vaes_round(k0, xout01, xout23, xout45, xout67);
            vaes_round(k1, xout01, xout23, xout45, xout67);
            vaes_round(k2, xout01, xout23, xout45, xout67);
            vaes_round(k3, xout01, xout23, xout45, xout67);
            vaes_round(k4, xout01, xout23, xout45, xout67);
            vaes_round(k5, xout01, xout23, xout45, xout67);
            vaes_round(k6, xout01, xout23, xout45, xout67);
            vaes_round(k7, xout01, xout23, xout45, xout67);
            vaes_round(k8, xout01, xout23, xout45, xout67);
            vaes_round(k9, xout01, xout23, xout45, xout67);
        }
    }

    _mm256_store_si256(output + 2, xout01);
    _mm256_store_si256(output + 3, xout23);
    _mm256_store_si256(output + 4, xout45);
    _mm256_store_si256(output + 5, xout67);

    _mm256_zeroupper();
}
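Two details of this implode path are worth noting: for half-memory algorithms the scratchpad is consumed in two passes, with the second half regenerated in place by re-running cn_explode_scratchpad_vaes with first_half cleared, and N counts __m256i vectors per pass. A worked example of the N arithmetic, assuming a hypothetical 2 MiB scratchpad with half_mem() enabled (illustrative, not tied to a specific algorithm):

#include <immintrin.h>

// (2 MiB / 32 bytes per __m256i) / 2 halves = 32768 vectors per pass.
static_assert(((2u * 1024 * 1024) / sizeof(__m256i)) / 2 == 32768,
              "vectors per half-memory implode pass");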

template<Algorithm::Id ALGO>
NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2)
{
    constexpr CnAlgo<ALGO> props;

    constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);

    __m256i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
    __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;

    const __m128i* input1 = reinterpret_cast<const __m128i*>(ctx1->memory);
    const __m128i* input2 = reinterpret_cast<const __m128i*>(ctx2->memory);

    __m128i* output1 = reinterpret_cast<__m128i*>(ctx1->state);
    __m128i* output2 = reinterpret_cast<__m128i*>(ctx2->state);

    vaes_genkey_double(output1 + 2, output2 + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);

    xout0 = _mm256_loadu2_m128i(output2 + 4, output1 + 4);
    xout1 = _mm256_loadu2_m128i(output2 + 5, output1 + 5);
    xout2 = _mm256_loadu2_m128i(output2 + 6, output1 + 6);
    xout3 = _mm256_loadu2_m128i(output2 + 7, output1 + 7);
    xout4 = _mm256_loadu2_m128i(output2 + 8, output1 + 8);
    xout5 = _mm256_loadu2_m128i(output2 + 9, output1 + 9);
    xout6 = _mm256_loadu2_m128i(output2 + 10, output1 + 10);
    xout7 = _mm256_loadu2_m128i(output2 + 11, output1 + 11);

    const __m128i* input_begin1 = input1;
    const __m128i* input_begin2 = input2;
    for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) {
        if (props.half_mem() && (part == 1)) {
            input1 = input_begin1;
            input2 = input_begin2;
            ctx1->first_half = false;
            ctx2->first_half = false;
            cn_explode_scratchpad_vaes_double<ALGO>(ctx1, ctx2);
        }

        for (size_t i = 0; i < N;) {
            xout0 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + 0, input1 + 0), xout0);
            xout1 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + 1, input1 + 1), xout1);
            xout2 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + 2, input1 + 2), xout2);
            xout3 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + 3, input1 + 3), xout3);

            constexpr int input_increment = 64 / sizeof(__m128i);

            xout4 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + input_increment + 0, input1 + input_increment + 0), xout4);
            xout5 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + input_increment + 1, input1 + input_increment + 1), xout5);
            xout6 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + input_increment + 2, input1 + input_increment + 2), xout6);
            xout7 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + input_increment + 3, input1 + input_increment + 3), xout7);

            input1 += input_increment * 2;
            input2 += input_increment * 2;
            i += 8;

            if (i < N) {
                _mm_prefetch((const char*)(input1), _MM_HINT_T0);
                _mm_prefetch((const char*)(input1 + input_increment), _MM_HINT_T0);
                _mm_prefetch((const char*)(input2), _MM_HINT_T0);
                _mm_prefetch((const char*)(input2 + input_increment), _MM_HINT_T0);
            }

            vaes_round(k0, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
            vaes_round(k1, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
            vaes_round(k2, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
            vaes_round(k3, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
            vaes_round(k4, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
            vaes_round(k5, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
            vaes_round(k6, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
            vaes_round(k7, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
            vaes_round(k8, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
            vaes_round(k9, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
        }
    }

    _mm256_storeu2_m128i(output2 + 4, output1 + 4, xout0);
    _mm256_storeu2_m128i(output2 + 5, output1 + 5, xout1);
    _mm256_storeu2_m128i(output2 + 6, output1 + 6, xout2);
    _mm256_storeu2_m128i(output2 + 7, output1 + 7, xout3);
    _mm256_storeu2_m128i(output2 + 8, output1 + 8, xout4);
    _mm256_storeu2_m128i(output2 + 9, output1 + 9, xout5);
    _mm256_storeu2_m128i(output2 + 10, output1 + 10, xout6);
    _mm256_storeu2_m128i(output2 + 11, output1 + 11, xout7);

    _mm256_zeroupper();
}
template<Algorithm::Id ALGO>
void VAES_Instance()
{
    cn_explode_scratchpad_vaes<ALGO>(nullptr);
    cn_explode_scratchpad_vaes_double<ALGO>(nullptr, nullptr);
    cn_implode_scratchpad_vaes<ALGO>(nullptr);
    cn_implode_scratchpad_vaes_double<ALGO>(nullptr, nullptr);
}


void (*vaes_instances[])() = {
    VAES_Instance<Algorithm::CN_0>,
    VAES_Instance<Algorithm::CN_1>,
    VAES_Instance<Algorithm::CN_2>,
    VAES_Instance<Algorithm::CN_R>,
    VAES_Instance<Algorithm::CN_FAST>,
    VAES_Instance<Algorithm::CN_HALF>,
    VAES_Instance<Algorithm::CN_XAO>,
    VAES_Instance<Algorithm::CN_RTO>,
    VAES_Instance<Algorithm::CN_RWZ>,
    VAES_Instance<Algorithm::CN_ZLS>,
    VAES_Instance<Algorithm::CN_DOUBLE>,
    VAES_Instance<Algorithm::CN_CCX>,
    VAES_Instance<Algorithm::CN_LITE_0>,
    VAES_Instance<Algorithm::CN_LITE_1>,
    VAES_Instance<Algorithm::CN_HEAVY_0>,
    VAES_Instance<Algorithm::CN_HEAVY_TUBE>,
    VAES_Instance<Algorithm::CN_HEAVY_XHV>,
    VAES_Instance<Algorithm::CN_PICO_0>,
    VAES_Instance<Algorithm::CN_PICO_TLO>,
    VAES_Instance<Algorithm::CN_UPX2>,
    VAES_Instance<Algorithm::CN_GR_0>,
    VAES_Instance<Algorithm::CN_GR_1>,
    VAES_Instance<Algorithm::CN_GR_2>,
    VAES_Instance<Algorithm::CN_GR_3>,
    VAES_Instance<Algorithm::CN_GR_4>,
    VAES_Instance<Algorithm::CN_GR_5>,
};


} // xmrig
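vaes_instances is never read at runtime. Taking the address of VAES_Instance<ALGO> for every algorithm forces the compiler to emit each template instantiation in this translation unit, so the templates declared (but not defined) in CryptoNight_x86_vaes.h resolve at link time. The idiom in miniature (illustrative names):

// Referencing a function template instantiates it, even if never called.
template<int N> void emit_me() { /* body that must be code-generated */ }

void (*force_instantiation[])() = { emit_me<0>, emit_me<1>, emit_me<2> };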
48 src/crypto/cn/CryptoNight_x86_vaes.h (Normal file)
@@ -0,0 +1,48 @@
/* XMRig
 * Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
 * Copyright 2012-2014 pooler <pooler@litecoinpool.org>
 * Copyright 2014 Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
 * Copyright 2016 Jay D Dee <jayddee246@gmail.com>
 * Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018 Lee Clagett <https://github.com/vtnerd>
 * Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
 * Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
 *
 * This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *   GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef XMRIG_CRYPTONIGHT_X86_VAES_H
#define XMRIG_CRYPTONIGHT_X86_VAES_H


#include "crypto/cn/CnAlgo.h"


struct cryptonight_ctx;


namespace xmrig {


template<Algorithm::Id ALGO> void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx);
template<Algorithm::Id ALGO> void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2);
template<Algorithm::Id ALGO> void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx);
template<Algorithm::Id ALGO> void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2);


} // xmrig


#endif /* XMRIG_CRYPTONIGHT_X86_VAES_H */
@@ -4,7 +4,7 @@
 *
 * This work is based on the implementation of
 * Soeren S. Thomsen and Krystian Matusiewicz
 *
-*
+*
 */
@@ -22,7 +22,7 @@ const uint8_t indices_cyclic[15] = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6};
 #define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) {temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \
     v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \
     v1 = temp_var;}
 
-
+
 #define COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t) \
     tu = T[2*(uint32_t)x[4*c0+0]]; \
@@ -161,11 +161,11 @@ static void F512(uint32_t *h, const uint32_t *m) {
 
 /* digest up to msglen bytes of input (full blocks only) */
 static void Transform(groestlHashState *ctx,
-                      const uint8_t *input,
+                      const uint8_t *input,
                       int msglen) {
 
     /* digest message, one block at a time */
-    for (; msglen >= SIZE512;
+    for (; msglen >= SIZE512;
          msglen -= SIZE512, input += SIZE512) {
         F512(ctx->chaining,(uint32_t*)input);
 
@@ -199,7 +199,7 @@ static void OutputTransformation(groestlHashState *ctx) {
     RND512P((uint8_t*)y, temp, 0x00000009);
     for (j = 0; j < 2*COLS512; j++) {
         ctx->chaining[j] ^= temp[j];
     }
-}
+}
 
 /* initialise context */
@@ -313,7 +313,7 @@ static void Final(groestlHashState* ctx,
         ctx->block_counter2 >>= 8;
     }
     /* digest final padding block */
-    Transform(ctx, ctx->buffer, SIZE512);
+    Transform(ctx, ctx->buffer, SIZE512);
     /* perform output transformation */
     OutputTransformation(ctx);
 
@@ -332,7 +332,7 @@ static void Final(groestlHashState* ctx,
 }
 
 /* hash bit sequence */
-void groestl(const BitSequence* data,
+void groestl(const BitSequence* data,
              DataLength databitlen,
              BitSequence* hashval) {
 
@@ -4,10 +4,10 @@
 #include "crypto_uint8.h"
 #include "crypto_uint32.h"
 #include "crypto_uint64.h"
-#include "crypto_hash.h"
+#include "crypto_hash.h"
 
-typedef crypto_uint8 uint8_t;
-typedef crypto_uint32 uint32_t;
+typedef crypto_uint8 uint8_t;
+typedef crypto_uint32 uint32_t;
 typedef crypto_uint64 uint64_t;
 */
 #include <stdint.h>
@@ -5,7 +5,7 @@
 ** Source code author: Doug Whiting, 2008.
 **
 ** This algorithm and source code is released to the public domain.
-**
+**
 ************************************************************************/
 
 #define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */
@@ -57,7 +57,7 @@ static int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
 
 /*****************************************************************
 ** "Internal" Skein definitions
-**    -- not needed for sequential hashing API, but will be
+**    -- not needed for sequential hashing API, but will be
 ** helpful for other uses of Skein (e.g., tree hash mode).
 ** -- included here so that they can be shared between
 ** reference and optimized code.
@@ -179,11 +179,11 @@ static int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
 #define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */
 #define Skein_assert(x)
 #elif defined(SKEIN_ASSERT)
-#include <assert.h>
-#define Skein_Assert(x,retCode) assert(x)
-#define Skein_assert(x) assert(x)
+#include <assert.h>
+#define Skein_Assert(x,retCode) assert(x)
+#define Skein_assert(x) assert(x)
 #else
-#include <assert.h>
+#include <assert.h>
 #define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /* caller error */
 #define Skein_assert(x) assert(x) /* internal error */
 #endif
@@ -191,8 +191,8 @@ static int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
 /*****************************************************************
 ** Skein block function constants (shared across Ref and Opt code)
 ******************************************************************/
-enum
-    {
+enum
+    {
     /* Skein_512 round rotation constants */
     R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
     R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
@@ -251,7 +251,7 @@ const u64b_t SKEIN_512_IV_256[] =
 #define BLK_BITS (WCNT*64) /* some useful definitions for code here */
 #define KW_TWK_BASE (0)
 #define KW_KEY_BASE (3)
-#define ks (kw + KW_KEY_BASE)
+#define ks (kw + KW_KEY_BASE)
 #define ts (kw + KW_TWK_BASE)
 
 #ifdef SKEIN_DEBUG
@@ -310,7 +310,7 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s
     ks[5] = ctx->X[5];
     ks[6] = ctx->X[6];
     ks[7] = ctx->X[7];
-    ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+    ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
             ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
 
     ts[2] = ts[0] ^ ts[1];
@@ -338,7 +338,7 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s
     X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
     X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
 
-#if SKEIN_UNROLL_512 == 0
+#if SKEIN_UNROLL_512 == 0
 #define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* unrolled */ \
     Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \
     Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
@@ -469,7 +469,7 @@ static int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
         u08b_t b[SKEIN_512_STATE_BYTES];
         u64b_t w[SKEIN_512_STATE_WORDS];
     } cfg; /* config block */
-
+
     Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
     ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
 
@@ -548,7 +548,7 @@ static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msg
 
     return SKEIN_SUCCESS;
 }
-
+
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* finalize the hash computation and output the result */
 static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
@@ -562,7 +562,7 @@ static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
     memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
 
     Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */
-
+
     /* now output the result */
     byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */
 
@@ -9,7 +9,7 @@
 ** This algorithm and source code is released to the public domain.
 **
 ***************************************************************************
-**
+**
 ** The following compile-time switches may be defined to control some
 ** tradeoffs between speed, code size, error checking, and security.
 **
@@ -20,8 +20,8 @@
 ** [default: no callouts (no overhead)]
 **
 ** SKEIN_ERR_CHECK -- how error checking is handled inside Skein
-** code. If not defined, most error checking
-** is disabled (for performance). Otherwise,
+** code. If not defined, most error checking
+** is disabled (for performance). Otherwise,
 ** the switch value is interpreted as:
 ** 0: use assert() to flag errors
 ** 1: return SKEIN_FAIL to flag errors
@@ -124,9 +124,9 @@ static inline __m128i soft_aesenc(__m128i in, __m128i key)
 
 static inline uint32_t sub_word(uint32_t key)
 {
-    return (saes_sbox[key >> 24 ] << 24) |
-           (saes_sbox[(key >> 16) & 0xff] << 16 ) |
-           (saes_sbox[(key >> 8) & 0xff] << 8 ) |
+    return (saes_sbox[key >> 24 ] << 24) |
+           (saes_sbox[(key >> 16) & 0xff] << 16 ) |
+           (saes_sbox[(key >> 8) & 0xff] << 8 ) |
            saes_sbox[key & 0xff];
 }
 
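sub_word applies the AES S-box to each byte of a 32-bit word: the SubWord step of AES key expansion, used by the soft-AES path (the change above is whitespace-only). For context, a hedged sketch of where it slots in; rot_word and next_w0 are illustrative helpers, only sub_word's role comes from the source:

#include <cstdint>

// AES-128 key expansion, first word of each new round key:
// w[i] = w[i-4] ^ SubWord(RotWord(w[i-1])) ^ Rcon[i/4].
static uint32_t rot_word(uint32_t w) { return (w << 8) | (w >> 24); }

static uint32_t next_w0(uint32_t prev_w0, uint32_t prev_w3, uint32_t rcon,
                        uint32_t (*sub_word_fn)(uint32_t))
{
    return prev_w0 ^ sub_word_fn(rot_word(prev_w3)) ^ rcon;
}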