Move files.

2026-04-18 13:22:43 -04:00 · 2019-06-04 19:20:33 +07:00
parent ac43cd4f9c
commit d587eebaf2
70 changed files with 69 additions and 70 deletions
--- a/src/crypto/cn/Asm.cpp
+++ b/src/crypto/cn/Asm.cpp
@@ -0,0 +1,102 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include <assert.h>
+#include <string.h>
+
+
+#ifdef _MSC_VER
+#   define strncasecmp _strnicmp
+#   define strcasecmp  _stricmp
+#endif
+
+
+#include "crypto/cn/Asm.h"
+#include "rapidjson/document.h"
+
+
+// Textual names of the xmrig::Assembly values. Asm::parse() casts an index of
+// this table to Assembly and Asm::toString() indexes it by Assembly, so the
+// entries must stay in the same order as the enumerators.
+static const char *asmNames[] = {
+    "none",
+    "auto",
+    "intel",
+    "ryzen",
+    "bulldozer"
+};
+
+
+/**
+ * Look up an assembly mode by its textual name (case-insensitive).
+ *
+ * @param assembly      NUL-terminated name to look up; a null pointer trips the
+ *                      debug assert and yields defaultValue in release builds.
+ * @param defaultValue  value returned when the name is null or not recognised.
+ * @return the Assembly whose position in asmNames matches, else defaultValue.
+ */
+xmrig::Assembly xmrig::Asm::parse(const char *assembly, Assembly defaultValue)
+{
+    constexpr size_t const size = sizeof(asmNames) / sizeof((asmNames)[0]);
+
+    // The name table must have exactly one entry per enumerator. Checking this
+    // at compile time keeps the guarantee in NDEBUG builds, where a runtime
+    // assert would be compiled out.
+    static_assert(static_cast<size_t>(ASM_MAX) == size, "asmNames must have one entry per Assembly value");
+    assert(assembly != nullptr);
+
+    if (assembly == nullptr) {
+        return defaultValue;
+    }
+
+    for (size_t i = 0; i < size; i++) {
+        if (strcasecmp(assembly, asmNames[i]) == 0) {
+            return static_cast<Assembly>(i);
+        }
+    }
+
+    return defaultValue;
+}
+
+
+// Convert a JSON value to an Assembly: booleans go through parse(bool),
+// strings through the name lookup, and any other JSON type yields defaultValue.
+xmrig::Assembly xmrig::Asm::parse(const rapidjson::Value &value, Assembly defaultValue)
+{
+    if (value.IsBool()) {
+        return parse(value.GetBool());
+    }
+
+    return value.IsString() ? parse(value.GetString(), defaultValue) : defaultValue;
+}
+
+
+/**
+ * Return the textual name of an assembly mode.
+ *
+ * @param assembly  value used as an index into asmNames; it must be a real
+ *                  enumerator below ASM_MAX (checked in debug builds only).
+ * @return pointer to a static string owned by asmNames.
+ */
+const char *xmrig::Asm::toString(Assembly assembly)
+{
+    const size_t index = static_cast<size_t>(assembly);
+
+    // Guard the table lookup: an out-of-range value would read past asmNames.
+    assert(index < sizeof(asmNames) / sizeof(asmNames[0]));
+
+    return asmNames[index];
+}
+
+
+// Serialise an Assembly for JSON output: ASM_NONE becomes false, ASM_AUTO
+// becomes true, and every other value becomes its name from toString().
+rapidjson::Value xmrig::Asm::toJSON(Assembly assembly)
+{
+    switch (assembly) {
+    case ASM_NONE:
+        return rapidjson::Value(false);
+
+    case ASM_AUTO:
+        return rapidjson::Value(true);
+
+    default:
+        break;
+    }
+
+    // StringRef does not copy; the name lives in the static asmNames table.
+    return rapidjson::Value(rapidjson::StringRef(toString(assembly)));
+}
--- a/src/crypto/cn/Asm.h
+++ b/src/crypto/cn/Asm.h
@@ -0,0 +1,50 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_ASM_H
+#define XMRIG_ASM_H
+
+
+#include "common/xmrig.h"
+#include "rapidjson/fwd.h"
+
+
+namespace xmrig {
+
+
+// Static helpers converting xmrig::Assembly to and from text, bool and JSON.
+class Asm
+{
+public:
+    // Case-insensitive name lookup; returns defaultValue for null or unknown names.
+    static Assembly parse(const char *assembly, Assembly defaultValue = ASM_AUTO);
+    // Accepts a JSON bool or string; any other JSON type returns defaultValue.
+    static Assembly parse(const rapidjson::Value &value, Assembly defaultValue = ASM_AUTO);
+    // Name of the given value, taken from a static table (do not free).
+    static const char *toString(Assembly assembly);
+    // false for ASM_NONE, true for ASM_AUTO, otherwise the name as a JSON string.
+    static rapidjson::Value toJSON(Assembly assembly);
+
+    // Boolean shorthand: true selects ASM_AUTO, false selects ASM_NONE.
+    inline static Assembly parse(bool enable) { return enable ? ASM_AUTO : ASM_NONE; }
+};
+
+
+} /* namespace xmrig */
+
+
+#endif /* XMRIG_ASM_H */
--- a/src/crypto/cn/CryptoNight.h
+++ b/src/crypto/cn/CryptoNight.h
@@ -0,0 +1,64 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_H
+#define XMRIG_CRYPTONIGHT_H
+
+
+#include <stddef.h>
+#include <stdint.h>
+
+// ABI_ATTRIBUTE expands to nothing under MSVC or when XMRIG_ARM is defined, and
+// to the GCC/Clang ms_abi calling-convention attribute everywhere else.
+#if defined _MSC_VER || defined XMRIG_ARM
+#   define ABI_ATTRIBUTE
+#else
+#   define ABI_ATTRIBUTE __attribute__((ms_abi))
+#endif
+
+
+struct cryptonight_ctx;
+// Pointer to a main-loop routine that takes the context array and is called
+// with the convention selected by ABI_ATTRIBUTE.
+typedef void(*cn_mainloop_fun_ms_abi)(cryptonight_ctx**) ABI_ATTRIBUTE;
+
+
+// Pair of (variant, height) values with an equality helper.
+struct cryptonight_r_data {
+    int variant;
+    uint64_t height;
+
+    // True only when both the stored variant and the stored height equal the
+    // given pair.
+    bool match(const int v, const uint64_t h) const
+    {
+        if (v != variant) {
+            return false;
+        }
+
+        return h == height;
+    }
+};
+
+
+// Per-hash working context.
+// NOTE(review): member order, sizes and alignment may be relied on by code
+// outside this header (e.g. generated main loops) - confirm before reordering.
+struct cryptonight_ctx {
+    alignas(16) uint8_t state[224];   // hash state buffer, 16-byte aligned
+    alignas(16) uint8_t *memory;      // pointer to the scratchpad
+
+    uint8_t unused[40];               // padding; purpose not visible here
+    const uint32_t *saes_table;       // presumably the soft-AES lookup table - TODO confirm
+
+    cn_mainloop_fun_ms_abi generated_code;     // main-loop function pointer
+    cryptonight_r_data generated_code_data;    // variant/height pair kept alongside generated_code
+};
+
+
+#endif /* XMRIG_CRYPTONIGHT_H */
--- a/src/crypto/cn/CryptoNight_arm.h
+++ b/src/crypto/cn/CryptoNight_arm.h
@@ -0,0 +1,844 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik  <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler       <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones  <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466     <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee    <jayddee246@gmail.com>
+ * Copyright 2016      Imran Yusuff <https://github.com/imranyusuff>
+ * Copyright 2017-2019 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_ARM_H
+#define XMRIG_CRYPTONIGHT_ARM_H
+
+
+#include "common/crypto/keccak.h"
+#include "crypto/common/portable/mm_malloc.h"
+#include "crypto/cn/CryptoNight_constants.h"
+#include "crypto/cn/CryptoNight_monero.h"
+#include "crypto/cn/CryptoNight.h"
+#include "crypto/cn/soft_aes.h"
+
+
+extern "C"
+{
+#include "crypto/cn/c_groestl.h"
+#include "crypto/cn/c_blake256.h"
+#include "crypto/cn/c_jh.h"
+#include "crypto/cn/c_skein.h"
+}
+
+
+// Adapter giving blake256_hash() the common (input, len, output) signature used
+// by extra_hashes; note blake256_hash() takes the output buffer first.
+static inline void do_blake_hash(const uint8_t *input, size_t len, uint8_t *output) {
+    blake256_hash(output, input, len);
+}
+
+
+// Adapter for groestl() with the common (input, len, output) signature.
+// Passes len * 8 - presumably the length in bits; confirm against c_groestl.h.
+static inline void do_groestl_hash(const uint8_t *input, size_t len, uint8_t *output) {
+    groestl(input, len * 8, output);
+}
+
+
+// Adapter for jh_hash() with the common (input, len, output) signature.
+// The first argument is 32 * 8 and the length is passed as 8 * len - presumably
+// a 256-bit digest size and a bit length; confirm against c_jh.h.
+static inline void do_jh_hash(const uint8_t *input, size_t len, uint8_t *output) {
+    jh_hash(32 * 8, input, 8 * len, output);
+}
+
+
+// Adapter for xmr_skein() with the common (input, len, output) signature.
+// len is ignored: xmr_skein() is called with input and output only.
+static inline void do_skein_hash(const uint8_t *input, size_t len, uint8_t *output) {
+    xmr_skein(input, output);
+}
+
+
+// Final-hash dispatch table (blake, groestl, jh, skein in that order);
+// cryptonight_single_hash() selects an entry with state[0] & 3.
+void (* const extra_hashes[4])(const uint8_t *, size_t, uint8_t *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
+
+
+// NEON stand-in for the SSE2 intrinsic of the same name: combines two 64-bit
+// values into one 128-bit vector, with b passed as the first (low) half and a
+// as the second (high) half of vcombine_u64().
+static inline __attribute__((always_inline)) __m128i _mm_set_epi64x(const uint64_t a, const uint64_t b)
+{
+    return vcombine_u64(vcreate_u64(b), vcreate_u64(a));
+}
+
+
+// NEON stand-in for the AES-NI round intrinsic of the same name.
+// With __ARM_FEATURE_CRYPTO: vaeseq_u8 with an all-zero key, then vaesmcq_u8,
+// then XOR with rkey.
+#if __ARM_FEATURE_CRYPTO
+static inline __attribute__((always_inline)) __m128i _mm_aesenc_si128(__m128i v, __m128i rkey)
+{
+    alignas(16) const __m128i zero = { 0 };
+    return veorq_u8(vaesmcq_u8(vaeseq_u8(v, zero)), rkey );
+}
+#else
+// Without the crypto extension this is a stub: both arguments are ignored and
+// an all-zero vector is returned.
+// NOTE(review): presumably only the SOFT_AES paths are used on such targets - confirm.
+static inline __attribute__((always_inline)) __m128i _mm_aesenc_si128(__m128i v, __m128i rkey)
+{
+    alignas(16) const __m128i zero = { 0 };
+    return zero;
+}
+#endif
+
+
+/* this one was not implemented yet so here it is */
+// Returns 64-bit lane 0 of the vector, read as an unsigned value.
+static inline __attribute__((always_inline)) uint64_t _mm_cvtsi128_si64(__m128i a)
+{
+    return vgetq_lane_u64(a, 0);
+}
+
+
+#if defined (__arm64__) || defined (__aarch64__)
+// 64x64 -> 128-bit unsigned multiply using the compiler's native 128-bit
+// integer. Stores the upper 64 bits in *hi and returns the lower 64 bits.
+static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi)
+{
+    const unsigned __int128 wide = static_cast<unsigned __int128>(a) * static_cast<unsigned __int128>(b);
+
+    *hi = static_cast<uint64_t>(wide >> 64);
+    return static_cast<uint64_t>(wide);
+}
+#else
+// Portable 64x64 -> 128-bit unsigned multiply built from 32-bit halves.
+// Stores the upper 64 bits in *product_hi and returns the lower 64 bits.
+static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *product_hi) {
+    // Split both operands: x = x_hi * 2^32 + x_lo.
+    const uint64_t m_hi = multiplier >> 32;
+    const uint64_t m_lo = multiplier & 0xFFFFFFFF;
+    const uint64_t n_hi = multiplicand >> 32;
+    const uint64_t n_lo = multiplicand & 0xFFFFFFFF;
+
+    // x * y = hi*hi * 2^64 + (hi*lo + lo*hi) * 2^32 + lo*lo
+    const uint64_t low_part   = m_lo * n_lo;
+    const uint64_t cross_head = m_hi * n_lo;
+    const uint64_t cross      = cross_head + (m_lo * n_hi);
+
+    // The cross-term sum can wrap past 64 bits; a wrap is worth 2^32 in the
+    // high word once the 2^32 scale of the cross term is applied.
+    const uint64_t cross_wrapped = (cross < cross_head) ? 1 : 0;
+
+    const uint64_t result_lo  = low_part + (cross << 32);
+    const uint64_t lo_wrapped = (result_lo < low_part) ? 1 : 0;
+
+    *product_hi = (m_hi * n_hi) + (cross >> 32) + (cross_wrapped << 32) + lo_wrapped;
+
+    return result_lo;
+}
+#endif
+
+
+// XORs the input with itself shifted left by 4, 8 and 12 bytes, i.e. a running
+// XOR over its four 32-bit words:
+// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
+static inline __m128i sl_xor(__m128i tmp1)
+{
+    __m128i shifted = _mm_slli_si128(tmp1, 0x04);
+    __m128i acc     = _mm_xor_si128(tmp1, shifted);
+
+    shifted = _mm_slli_si128(shifted, 0x04);
+    acc     = _mm_xor_si128(acc, shifted);
+
+    shifted = _mm_slli_si128(shifted, 0x04);
+    return _mm_xor_si128(acc, shifted);
+}
+
+
+// One key-expansion step, updating both halves of the key pair in place.
+// *xout0 is updated first from *xout2 (using rcon); *xout2 is then updated from
+// the new *xout0 (using rcon 0x00). The order of the two halves matters.
+template<uint8_t rcon>
+static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2)
+{
+    __m128i xout1 = soft_aeskeygenassist<rcon>(*xout2);
+    xout1  = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem
+    *xout0 = sl_xor(*xout0);
+    *xout0 = _mm_xor_si128(*xout0, xout1);
+    xout1  = soft_aeskeygenassist<0x00>(*xout0);
+    xout1  = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem
+    *xout2 = sl_xor(*xout2);
+    *xout2 = _mm_xor_si128(*xout2, xout1);
+}
+
+
+// Derives ten round keys k0..k9 from the 32 bytes at memory[0] and memory[1].
+// k0/k1 are the loaded values; each later pair comes from one
+// soft_aes_genkey_sub step with rcon 0x01, 0x02, 0x04, 0x08.
+// SOFT_AES is not referenced in the body: the software key schedule is always used.
+template<bool SOFT_AES>
+static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
+{
+    __m128i xout0 = _mm_load_si128(memory);
+    __m128i xout2 = _mm_load_si128(memory + 1);
+    *k0 = xout0;
+    *k1 = xout2;
+
+    soft_aes_genkey_sub<0x01>(&xout0, &xout2);
+    *k2 = xout0;
+    *k3 = xout2;
+
+    soft_aes_genkey_sub<0x02>(&xout0, &xout2);
+    *k4 = xout0;
+    *k5 = xout2;
+
+    soft_aes_genkey_sub<0x04>(&xout0, &xout2);
+    *k6 = xout0;
+    *k7 = xout2;
+
+    soft_aes_genkey_sub<0x08>(&xout0, &xout2);
+    *k8 = xout0;
+    *k9 = xout2;
+}
+
+
+// Applies one AES round with the same key to eight 128-bit blocks in place.
+// SOFT_AES selects soft_aesenc(); otherwise _mm_aesenc_si128() is used.
+template<bool SOFT_AES>
+static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
+{
+    if (SOFT_AES) {
+        *x0 = soft_aesenc((uint32_t*)x0, key);
+        *x1 = soft_aesenc((uint32_t*)x1, key);
+        *x2 = soft_aesenc((uint32_t*)x2, key);
+        *x3 = soft_aesenc((uint32_t*)x3, key);
+        *x4 = soft_aesenc((uint32_t*)x4, key);
+        *x5 = soft_aesenc((uint32_t*)x5, key);
+        *x6 = soft_aesenc((uint32_t*)x6, key);
+        *x7 = soft_aesenc((uint32_t*)x7, key);
+    }
+    else {
+        *x0 = _mm_aesenc_si128(*x0, key);
+        *x1 = _mm_aesenc_si128(*x1, key);
+        *x2 = _mm_aesenc_si128(*x2, key);
+        *x3 = _mm_aesenc_si128(*x3, key);
+        *x4 = _mm_aesenc_si128(*x4, key);
+        *x5 = _mm_aesenc_si128(*x5, key);
+        *x6 = _mm_aesenc_si128(*x6, key);
+        *x7 = _mm_aesenc_si128(*x7, key);
+    }
+}
+
+
+// XORs every block with its successor, in place: x[i] ^= x[i+1], with x7 taking
+// the original x0. Each statement reads a successor that has not been modified
+// yet, so the statement order must be kept.
+inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7)
+{
+    __m128i tmp0 = x0; // saved because x0 is overwritten before x7 needs it
+    x0 = _mm_xor_si128(x0, x1);
+    x1 = _mm_xor_si128(x1, x2);
+    x2 = _mm_xor_si128(x2, x3);
+    x3 = _mm_xor_si128(x3, x4);
+    x4 = _mm_xor_si128(x4, x5);
+    x5 = _mm_xor_si128(x5, x6);
+    x6 = _mm_xor_si128(x6, x7);
+    x7 = _mm_xor_si128(x7, tmp0);
+}
+
+
+// Fills MEM bytes at output from the hash state at input.
+// Round keys come from input[0..1]; the eight working blocks start as
+// input[4..11]. Each loop pass runs ten AES rounds over the eight blocks and
+// stores them as the next 128 bytes of output.
+// Both pointers are accessed with aligned 128-bit loads and stores.
+template<xmrig::Algo ALGO, size_t MEM, bool SOFT_AES>
+static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
+{
+    __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
+    __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    aes_genkey<SOFT_AES>(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+
+    xin0 = _mm_load_si128(input + 4);
+    xin1 = _mm_load_si128(input + 5);
+    xin2 = _mm_load_si128(input + 6);
+    xin3 = _mm_load_si128(input + 7);
+    xin4 = _mm_load_si128(input + 8);
+    xin5 = _mm_load_si128(input + 9);
+    xin6 = _mm_load_si128(input + 10);
+    xin7 = _mm_load_si128(input + 11);
+
+    // CRYPTONIGHT_HEAVY only: 16 extra passes of ten rounds plus a cross-block
+    // mix before any output is written.
+    if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
+        for (size_t i = 0; i < 16; i++) {
+            aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+            aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+            aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+            aes_round<SOFT_AES>(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+            aes_round<SOFT_AES>(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+            aes_round<SOFT_AES>(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+            aes_round<SOFT_AES>(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+            aes_round<SOFT_AES>(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+            aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+            aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+
+            mix_and_propagate(xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+        }
+    }
+
+    // i counts 16-byte blocks; eight blocks (128 bytes) are produced per pass.
+    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
+        aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+
+        _mm_store_si128(output + i + 0, xin0);
+        _mm_store_si128(output + i + 1, xin1);
+        _mm_store_si128(output + i + 2, xin2);
+        _mm_store_si128(output + i + 3, xin3);
+        _mm_store_si128(output + i + 4, xin4);
+        _mm_store_si128(output + i + 5, xin5);
+        _mm_store_si128(output + i + 6, xin6);
+        _mm_store_si128(output + i + 7, xin7);
+    }
+}
+
+
+#ifndef XMRIG_NO_CN_GPU
+// Fills MEM bytes at output from the 200-byte state at input using keccakf.
+// Each iteration restarts from a fresh copy of input with the iteration index
+// XORed into word 0, then emits 160 + 176 + 176 = 512 bytes from three
+// successive permutations. ALGO is not referenced in the body.
+template<xmrig::Algo ALGO, size_t MEM>
+void cn_explode_scratchpad_gpu(const uint8_t *input, uint8_t *output)
+{
+    constexpr size_t hash_size = 200; // 25x8 bytes
+    alignas(16) uint64_t hash[25];
+
+    for (uint64_t i = 0; i < MEM / 512; i++)
+    {
+        memcpy(hash, input, hash_size);
+        hash[0] ^= i;
+
+        xmrig::keccakf(hash, 24);
+        memcpy(output, hash, 160);
+        output += 160;
+
+        xmrig::keccakf(hash, 24);
+        memcpy(output, hash, 176);
+        output += 176;
+
+        xmrig::keccakf(hash, 24);
+        memcpy(output, hash, 176);
+        output += 176;
+    }
+}
+#endif
+
+
+// Folds the MEM-byte scratchpad at input back into the hash state at output.
+// Round keys come from output[2..3]; the eight working blocks start as
+// output[4..11] and are written back there at the end. Each pass XORs in the
+// next 128 bytes of the scratchpad and runs ten AES rounds.
+// Despite the parameter names, input is the scratchpad and output is the state.
+template<xmrig::Algo ALGO, size_t MEM, bool SOFT_AES>
+static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
+{
+    __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
+    __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    aes_genkey<SOFT_AES>(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+
+    xout0 = _mm_load_si128(output + 4);
+    xout1 = _mm_load_si128(output + 5);
+    xout2 = _mm_load_si128(output + 6);
+    xout3 = _mm_load_si128(output + 7);
+    xout4 = _mm_load_si128(output + 8);
+    xout5 = _mm_load_si128(output + 9);
+    xout6 = _mm_load_si128(output + 10);
+    xout7 = _mm_load_si128(output + 11);
+
+    // First pass over the whole scratchpad (all algorithms).
+    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+    {
+        xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
+        xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
+        xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
+        xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
+        xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
+        xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
+        xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
+        xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+
+        aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+
+        if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
+            mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+        }
+    }
+
+    // CRYPTONIGHT_HEAVY only: a second full pass over the scratchpad, followed
+    // by 16 further passes that absorb no scratchpad data.
+    if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
+        for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
+            xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
+            xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
+            xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
+            xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
+            xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
+            xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
+            xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
+            xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+
+            aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+
+            mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+        }
+
+        for (size_t i = 0; i < 16; i++) {
+            aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+            aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+
+            mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+        }
+    }
+
+    _mm_store_si128(output + 4, xout0);
+    _mm_store_si128(output + 5, xout1);
+    _mm_store_si128(output + 6, xout2);
+    _mm_store_si128(output + 7, xout3);
+    _mm_store_si128(output + 8, xout4);
+    _mm_store_si128(output + 9, xout5);
+    _mm_store_si128(output + 10, xout6);
+    _mm_store_si128(output + 11, xout7);
+}
+
+
+// Table-driven AES-style round used for VARIANT_TUBE (see the callers).
+// The input is bitwise inverted before the lookups. Unlike a plain round, each
+// freshly computed output word is XORed back into x before the next word is
+// computed, so later lookups see modified input; statement order matters.
+// Returns the four computed words (key XOR table lookups).
+static inline __m128i aes_round_tweak_div(const __m128i &in, const __m128i &key)
+{
+    alignas(16) uint32_t k[4];
+    alignas(16) uint32_t x[4];
+
+    _mm_store_si128((__m128i*) k, key);
+    // XOR with all-ones = bitwise NOT of the input block.
+    _mm_store_si128((__m128i*) x, _mm_xor_si128(in, _mm_set_epi64x(0xffffffffffffffff, 0xffffffffffffffff)));
+
+    // BYTE(p, i): byte i of 32-bit word p of x.
+    #define BYTE(p, i) ((unsigned char*)&x[p])[i]
+    k[0] ^= saes_table[0][BYTE(0, 0)] ^ saes_table[1][BYTE(1, 1)] ^ saes_table[2][BYTE(2, 2)] ^ saes_table[3][BYTE(3, 3)];
+    x[0] ^= k[0];
+    k[1] ^= saes_table[0][BYTE(1, 0)] ^ saes_table[1][BYTE(2, 1)] ^ saes_table[2][BYTE(3, 2)] ^ saes_table[3][BYTE(0, 3)];
+    x[1] ^= k[1];
+    k[2] ^= saes_table[0][BYTE(2, 0)] ^ saes_table[1][BYTE(3, 1)] ^ saes_table[2][BYTE(0, 2)] ^ saes_table[3][BYTE(1, 3)];
+    x[2] ^= k[2];
+    k[3] ^= saes_table[0][BYTE(3, 0)] ^ saes_table[1][BYTE(0, 1)] ^ saes_table[2][BYTE(1, 2)] ^ saes_table[3][BYTE(2, 3)];
+    #undef BYTE
+
+    return _mm_load_si128((__m128i*)k);
+}
+
+
+// Writes bx0 ^ cx to the 16 bytes at l[idx], applying the variant tweak.
+// NOTE: l is declared const but is written through a cast pointer.
+//  - BASE == VARIANT_2: runs VARIANT2_SHUFFLE first, then stores the full
+//    128-bit XOR.
+//  - otherwise: stores the low 64 bits unchanged and flips bits 28-29 of the
+//    high 64 bits according to a 2-bit entry of the packed table 0x7531,
+//    selected from byte 3 of that high word (VARIANT_XTL uses a different shift).
+template<xmrig::Variant VARIANT, xmrig::Variant BASE>
+static inline void cryptonight_monero_tweak(const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i& cx)
+{
+    uint64_t* mem_out = (uint64_t*)&l[idx];
+
+    if (BASE == xmrig::VARIANT_2) {
+        VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0));
+        _mm_store_si128((__m128i *)mem_out, _mm_xor_si128(bx0, cx));
+    } else {
+        __m128i tmp = _mm_xor_si128(bx0, cx);
+        mem_out[0] = _mm_cvtsi128_si64(tmp);
+
+        uint64_t vh = vgetq_lane_u64(tmp, 1);
+
+        uint8_t x = vh >> 24;
+        static const uint16_t table = 0x7531;
+        const uint8_t index = (((x >> (VARIANT == xmrig::VARIANT_XTL ? 4 : 3)) & 6) | (x & 1)) << 1;
+        vh ^= ((table >> index) & 0x3) << 28;
+
+        mem_out[1] = vh;
+    }
+}
+
+
+// Computes one CryptoNight hash of input[0..size) into output using ctx[0].
+//
+// Steps: keccak into ctx[0]->state, expand the state into the scratchpad,
+// run ITERATIONS rounds of the memory-hard loop, fold the scratchpad back into
+// the state, apply keccakf, then run one of the four extra_hashes over the
+// 200-byte state.
+//
+// For BASE == VARIANT_1 an input shorter than 43 bytes is rejected by zeroing
+// 32 bytes of output and returning.
+//
+// NOTE(review): names such as tweak1_2_0 and r0 are not declared in this
+// function; presumably the VARIANT*_INIT macros declare them (and consume
+// height) - confirm in CryptoNight_monero.h.
+template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
+inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
+{
+    constexpr size_t MASK         = xmrig::cn_select_mask<ALGO>();
+    constexpr size_t ITERATIONS   = xmrig::cn_select_iter<ALGO, VARIANT>();
+    constexpr size_t MEM          = xmrig::cn_select_memory<ALGO>();
+    constexpr xmrig::Variant BASE = xmrig::cn_base_variant<VARIANT>();
+
+    if (BASE == xmrig::VARIANT_1 && size < 43) {
+        memset(output, 0, 32);
+        return;
+    }
+
+    xmrig::keccak(input, size, ctx[0]->state);
+
+    cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    // l0: scratchpad bytes (written through casts below); h0: state as 64-bit words.
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx[0]->state);
+
+    VARIANT1_INIT(0);
+    VARIANT2_INIT(0);
+    VARIANT4_RANDOM_MATH_INIT(0);
+
+    // Initial a (al0/ah0) and b (bx0, bx1) registers derived from the state.
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+
+    uint64_t idx0 = al0;
+
+    for (size_t i = 0; i < ITERATIONS; i++) {
+        // First half: AES-transform the block at idx0 with key a.
+        __m128i cx;
+        if (VARIANT == xmrig::VARIANT_TUBE || !SOFT_AES) {
+            cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+        }
+
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+        if (VARIANT == xmrig::VARIANT_TUBE) {
+            cx = aes_round_tweak_div(cx, ax0);
+        }
+        else if (SOFT_AES) {
+            // soft_aesenc reads the block straight from the scratchpad.
+            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0);
+        }
+        else {
+            cx = _mm_aesenc_si128(cx, ax0);
+        }
+
+        // Write b ^ c back to the same location (with the variant tweak if any).
+        if (BASE == xmrig::VARIANT_1 || BASE == xmrig::VARIANT_2) {
+            cryptonight_monero_tweak<VARIANT, BASE>(l0, idx0 & MASK, ax0, bx0, bx1, cx);
+        } else {
+            _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+        }
+
+        // Second half: the low 64 bits of c select the next block.
+        idx0 = _mm_cvtsi128_si64(cx);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
+        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+
+        if (BASE == xmrig::VARIANT_2) {
+            if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) {
+                VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
+                if (VARIANT == xmrig::VARIANT_4) {
+                    al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
+                    ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
+                }
+            } else {
+                VARIANT2_INTEGER_MATH(0, cl, cx);
+            }
+        }
+
+        // 64x64 -> 128-bit multiply of the index word by the loaded low word.
+        lo = __umul128(idx0, cl, &hi);
+
+        if (BASE == xmrig::VARIANT_2) {
+            if (VARIANT == xmrig::VARIANT_4) {
+                VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx0, bx1, cx, 0);
+            } else {
+                VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0));
+            }
+        }
+
+        al0 += hi;
+        ah0 += lo;
+
+        // Store the updated a back to the scratchpad; the high word gets the
+        // variant-1 tweak where applicable.
+        ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
+
+        if (BASE == xmrig::VARIANT_1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) {
+            ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0 ^ al0;
+        } else if (BASE == xmrig::VARIANT_1) {
+            ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0;
+        } else {
+            ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
+        }
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+
+        // CRYPTONIGHT_HEAVY: extra signed division step on the next block.
+        // d | 0x5 can never be zero, so the division cannot divide by zero.
+        if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
+            const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t *>(&l0[idx0 & MASK]));
+            const int64_t n   = vgetq_lane_s64(x, 0);
+            const int32_t d   = vgetq_lane_s32(x, 2);
+            const int64_t q   = n / (d | 0x5);
+
+            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
+
+            if (VARIANT == xmrig::VARIANT_XHV) {
+                idx0 = (~d) ^ q;
+            }
+            else {
+                idx0 = d ^ q;
+            }
+        }
+
+        // Shift the b history: variant 2 keeps the previous b as bx1.
+        if (BASE == xmrig::VARIANT_2) {
+            bx1 = bx0;
+        }
+
+        bx0 = cx;
+    }
+
+    cn_implode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    xmrig::keccakf(h0, 24);
+    // The low two bits of the first state byte pick the final hash function.
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+#ifndef XMRIG_NO_CN_GPU
+// Inner loop of the cn/gpu algorithm; declared here, defined elsewhere.
+// cryptonight_single_hash_gpu() passes the hash state as spad and the
+// scratchpad as lpad.
+template<size_t ITER, uint32_t MASK>
+void cn_gpu_inner_arm(const uint8_t *spad, uint8_t *lpad);
+
+
+// Computes one cn/gpu hash of input[0..size) into 32 bytes of output using ctx[0].
+//
+// Steps: keccak into the state, keccak-based scratchpad expansion, the
+// cn_gpu_inner_arm loop, the CRYPTONIGHT_HEAVY-style implode, a final keccakf,
+// then the first 32 state bytes are copied out.
+// height is not referenced in the body.
+// NOTE(review): fesetround/FE_TONEAREST need <fenv.h>, which this header does
+// not include directly - presumably provided by the including file; confirm.
+template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
+inline void cryptonight_single_hash_gpu(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
+{
+    constexpr size_t MASK         = xmrig::CRYPTONIGHT_GPU_MASK;
+    constexpr size_t ITERATIONS   = xmrig::cn_select_iter<ALGO, VARIANT>();
+    constexpr size_t MEM          = xmrig::cn_select_memory<ALGO>();
+
+    static_assert(MASK > 0 && ITERATIONS > 0 && MEM > 0, "unsupported algorithm/variant");
+
+    xmrig::keccak(input, size, ctx[0]->state);
+    cn_explode_scratchpad_gpu<ALGO, MEM>(ctx[0]->state, ctx[0]->memory);
+
+    // Set round-to-nearest for the floating-point environment before the inner loop.
+    fesetround(FE_TONEAREST);
+
+    cn_gpu_inner_arm<ITERATIONS, MASK>(ctx[0]->state, ctx[0]->memory);
+
+    // The implode step is always instantiated with CRYPTONIGHT_HEAVY here,
+    // regardless of ALGO.
+    cn_implode_scratchpad<xmrig::CRYPTONIGHT_HEAVY, MEM, SOFT_AES>((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    xmrig::keccakf((uint64_t*) ctx[0]->state, 24);
+    memcpy(output, ctx[0]->state, 32);
+}
+#endif
+
+
+template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
+inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx, uint64_t height)
+{   /* Hashes two messages in a single interleaved main loop.
+     *
+     * input  - two messages of `size` bytes each, stored back to back
+     *          (lane 0 reads `input`, lane 1 reads `input + size`).
+     * output - receives lane 0's digest at `output` and lane 1's at `output + 32`.
+     * ctx    - ctx[0] and ctx[1] supply the state and scratchpad memory of each lane.
+     * height - only consumed by VARIANT4_RANDOM_MATH_INIT.
+     *
+     * For VARIANT_1-based variants a message shorter than 43 bytes is not hashed;
+     * 64 bytes of `output` are zeroed instead.
+     */
+    constexpr size_t MASK         = xmrig::cn_select_mask<ALGO>();
+    constexpr size_t ITERATIONS   = xmrig::cn_select_iter<ALGO, VARIANT>();
+    constexpr size_t MEM          = xmrig::cn_select_memory<ALGO>();
+    constexpr xmrig::Variant BASE = xmrig::cn_base_variant<VARIANT>();
+
+    if (BASE == xmrig::VARIANT_1 && size < 43) {
+        memset(output, 0, 64);
+        return;
+    }
+
+    xmrig::keccak(input,        size, ctx[0]->state);
+    xmrig::keccak(input + size, size, ctx[1]->state); // second message starts right after the first
+
+    const uint8_t* l0 = ctx[0]->memory; // scratchpad of lane 0 (written through casts below)
+    const uint8_t* l1 = ctx[1]->memory; // scratchpad of lane 1
+    uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx[0]->state); // state viewed as 64-bit words
+    uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx[1]->state);
+
+    VARIANT1_INIT(0); // declares tweak1_2_0, used by the VARIANT_1 store below
+    VARIANT1_INIT(1); // declares tweak1_2_1
+    VARIANT2_INIT(0); // per-lane division/sqrt carry values seeded from h0[12], h0[13]
+    VARIANT2_INIT(1);
+    VARIANT4_RANDOM_MATH_INIT(0); // declares r0[] and code0[] for the random-math variants
+    VARIANT4_RANDOM_MATH_INIT(1);
+
+    cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4]; // low half of lane 0's `a` register
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5]; // high half of lane 0's `a` register
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+    __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+    __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
+
+    uint64_t idx0 = al0; // scratchpad address is always taken as idx & MASK
+    uint64_t idx1 = al1;
+
+    for (size_t i = 0; i < ITERATIONS; i++) {
+        __m128i cx0, cx1;
+        if (VARIANT == xmrig::VARIANT_TUBE || !SOFT_AES) { // the plain soft-AES path reads the scratchpad itself
+            cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+            cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
+        }
+
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+        const __m128i ax1 = _mm_set_epi64x(ah1, al1);
+        if (VARIANT == xmrig::VARIANT_TUBE) {
+            cx0 = aes_round_tweak_div(cx0, ax0);
+            cx1 = aes_round_tweak_div(cx1, ax1);
+        }
+        else if (SOFT_AES) {
+            cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0);
+            cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1);
+        }
+        else {
+            cx0 = _mm_aesenc_si128(cx0, ax0);
+            cx1 = _mm_aesenc_si128(cx1, ax1);
+        }
+
+        if (BASE == xmrig::VARIANT_1 || (BASE == xmrig::VARIANT_2)) {
+            cryptonight_monero_tweak<VARIANT, BASE>(l0, idx0 & MASK, ax0, bx00, bx01, cx0);
+            cryptonight_monero_tweak<VARIANT, BASE>(l1, idx1 & MASK, ax1, bx10, bx11, cx1);
+        } else {
+            _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
+            _mm_store_si128((__m128i *) &l1[idx1 & MASK], _mm_xor_si128(bx10, cx1));
+        }
+
+        idx0 = _mm_cvtsi128_si64(cx0); // low 64 bits of cx select the next address
+        idx1 = _mm_cvtsi128_si64(cx1);
+
+        // ---- second half of the round, lane 0 ----
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
+        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+
+        if (BASE == xmrig::VARIANT_2) {
+            if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) {
+                VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
+                if (VARIANT == xmrig::VARIANT_4) {
+                    al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
+                    ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
+                }
+            } else {
+                VARIANT2_INTEGER_MATH(0, cl, cx0);
+            }
+        }
+
+        lo = __umul128(idx0, cl, &hi); // hi receives the second result through the pointer
+
+        if (BASE == xmrig::VARIANT_2) {
+            if (VARIANT == xmrig::VARIANT_4) {
+                VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx00, bx01, cx0, 0);
+            } else {
+                VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0));
+            }
+        }
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
+
+        if (BASE == xmrig::VARIANT_1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { // TUBE/RTO also mix al0 into the stored word
+            ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0 ^ al0;
+        } else if (BASE == xmrig::VARIANT_1) {
+            ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0;
+        } else {
+            ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
+        }
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+
+        if (ALGO == xmrig::CRYPTONIGHT_HEAVY) { // extra signed-division step, HEAVY only
+            const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t *>(&l0[idx0 & MASK]));
+            const int64_t n   = vgetq_lane_s64(x, 0);
+            const int32_t d   = vgetq_lane_s32(x, 2);
+            const int64_t q   = n / (d | 0x5); // (d | 0x5) is never zero
+
+            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
+
+            if (VARIANT == xmrig::VARIANT_XHV) {
+                idx0 = (~d) ^ q;
+            }
+            else {
+                idx0 = d ^ q;
+            }
+        }
+
+        // ---- second half of the round, lane 1 (same steps on l1 / *1 variables) ----
+        cl = ((uint64_t*) &l1[idx1 & MASK])[0];
+        ch = ((uint64_t*) &l1[idx1 & MASK])[1];
+
+        if (BASE == xmrig::VARIANT_2) {
+            if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) {
+                VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
+                if (VARIANT == xmrig::VARIANT_4) {
+                    al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32);
+                    ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32);
+                }
+            } else {
+                VARIANT2_INTEGER_MATH(1, cl, cx1);
+            }
+        }
+
+        lo = __umul128(idx1, cl, &hi);
+
+        if (BASE == xmrig::VARIANT_2) {
+            if (VARIANT == xmrig::VARIANT_4) {
+                VARIANT2_SHUFFLE(l1, idx1 & MASK, ax1, bx10, bx11, cx1, 0);
+            } else {
+                VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0));
+            }
+        }
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*)&l1[idx1 & MASK])[0] = al1;
+
+        if (BASE == xmrig::VARIANT_1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) {
+            ((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1 ^ al1;
+        } else if (BASE == xmrig::VARIANT_1) {
+            ((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1;
+        } else {
+            ((uint64_t*)&l1[idx1 & MASK])[1] = ah1;
+        }
+
+        al1 ^= cl;
+        ah1 ^= ch;
+        idx1 = al1;
+
+        if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
+            const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t *>(&l1[idx1 & MASK]));
+            const int64_t n   = vgetq_lane_s64(x, 0);
+            const int32_t d   = vgetq_lane_s32(x, 2);
+            const int64_t q   = n / (d | 0x5);
+
+            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
+
+            if (VARIANT == xmrig::VARIANT_XHV) {
+                idx1 = (~d) ^ q;
+            }
+            else {
+                idx1 = d ^ q;
+            }
+        }
+        if (BASE == xmrig::VARIANT_2) { // VARIANT_2 bases also keep the previous b value
+            bx01 = bx00;
+            bx11 = bx10;
+        }
+        bx00 = cx0;
+        bx10 = cx1;
+    }
+
+    cn_implode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
+
+    xmrig::keccakf(h0, 24);
+    xmrig::keccakf(h1, 24);
+
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); // low two bits of state byte 0 index extra_hashes
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
+
+
+template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
+inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx, uint64_t height)
+{   // Empty body in this header: nothing is hashed and `output` is never written.
+}
+
+
+template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
+inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx, uint64_t height)
+{   // Empty body in this header: nothing is hashed and `output` is never written.
+}
+
+
+template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
+inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx, uint64_t height)
+{   // Empty body in this header: nothing is hashed and `output` is never written.
+}
+
+#endif /* __CRYPTONIGHT_ARM_H__ */
--- a/src/crypto/cn/CryptoNight_constants.h
+++ b/src/crypto/cn/CryptoNight_constants.h
@@ -0,0 +1,251 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2019 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_CONSTANTS_H
+#define XMRIG_CRYPTONIGHT_CONSTANTS_H
+
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+#include "common/xmrig.h"
+
+
+namespace xmrig
+{
+
+constexpr const size_t   CRYPTONIGHT_MEMORY       = 2 * 1024 * 1024; // 2 MiB, in bytes
+constexpr const uint32_t CRYPTONIGHT_MASK         = 0x1FFFF0; // 2 MiB - 16: offsets stay inside the memory and have the low 4 bits cleared
+constexpr const uint32_t CRYPTONIGHT_ITER         = 0x80000; // 524288
+constexpr const uint32_t CRYPTONIGHT_HALF_ITER    = 0x40000; // half of CRYPTONIGHT_ITER
+constexpr const uint32_t CRYPTONIGHT_XAO_ITER     = 0x100000; // twice CRYPTONIGHT_ITER
+constexpr const uint32_t CRYPTONIGHT_DOUBLE_ITER  = 0x100000; // same value as CRYPTONIGHT_XAO_ITER
+constexpr const uint32_t CRYPTONIGHT_WALTZ_ITER   = 0x60000; // three quarters of CRYPTONIGHT_ITER
+constexpr const uint32_t CRYPTONIGHT_ZLS_ITER     = 0x60000; // same value as CRYPTONIGHT_WALTZ_ITER
+
+constexpr const uint32_t CRYPTONIGHT_GPU_ITER     = 0xC000;
+constexpr const uint32_t CRYPTONIGHT_GPU_MASK     = 0x1FFFC0; // 2 MiB - 64: low 6 bits cleared
+
+constexpr const size_t   CRYPTONIGHT_LITE_MEMORY  = 1 * 1024 * 1024; // 1 MiB, in bytes
+constexpr const uint32_t CRYPTONIGHT_LITE_MASK    = 0xFFFF0; // 1 MiB - 16
+constexpr const uint32_t CRYPTONIGHT_LITE_ITER    = 0x40000;
+
+constexpr const size_t   CRYPTONIGHT_HEAVY_MEMORY = 4 * 1024 * 1024; // 4 MiB, in bytes
+constexpr const uint32_t CRYPTONIGHT_HEAVY_MASK   = 0x3FFFF0; // 4 MiB - 16
+constexpr const uint32_t CRYPTONIGHT_HEAVY_ITER   = 0x40000;
+
+constexpr const size_t   CRYPTONIGHT_PICO_MEMORY = 256 * 1024; // 256 KiB, in bytes
+constexpr const uint32_t CRYPTONIGHT_PICO_MASK   = 0x1FFF0; // 128 KiB - 16, i.e. half of CRYPTONIGHT_PICO_MEMORY
+constexpr const uint32_t CRYPTONIGHT_PICO_ITER   = 0x40000; // not returned by any cn_select_iter overload in this header
+constexpr const uint32_t CRYPTONIGHT_TRTL_ITER   = 0x10000;
+
+
+template<Algo ALGO> inline constexpr size_t cn_select_memory()           { return 0; } // primary template: 0 for any ALGO that has no specialisation below
+template<> inline constexpr size_t cn_select_memory<CRYPTONIGHT>()       { return CRYPTONIGHT_MEMORY; }
+template<> inline constexpr size_t cn_select_memory<CRYPTONIGHT_LITE>()  { return CRYPTONIGHT_LITE_MEMORY; }
+template<> inline constexpr size_t cn_select_memory<CRYPTONIGHT_HEAVY>() { return CRYPTONIGHT_HEAVY_MEMORY; }
+template<> inline constexpr size_t cn_select_memory<CRYPTONIGHT_PICO>()  { return CRYPTONIGHT_PICO_MEMORY; }
+
+
+inline size_t cn_select_memory(Algo algorithm)
+{
+    // Run-time lookup of the memory size (in bytes) for `algorithm`.
+    // Any algorithm not listed here yields 0.
+    switch (algorithm) {
+    case CRYPTONIGHT_PICO:  return CRYPTONIGHT_PICO_MEMORY;
+    case CRYPTONIGHT_HEAVY: return CRYPTONIGHT_HEAVY_MEMORY;
+    case CRYPTONIGHT_LITE:  return CRYPTONIGHT_LITE_MEMORY;
+    case CRYPTONIGHT:       return CRYPTONIGHT_MEMORY;
+    default:                break;
+    }
+
+    return 0;
+}
+
+
+template<Algo ALGO> inline constexpr uint32_t cn_select_mask()           { return 0; } // primary template: 0 for any ALGO that has no specialisation below
+template<> inline constexpr uint32_t cn_select_mask<CRYPTONIGHT>()       { return CRYPTONIGHT_MASK; }
+template<> inline constexpr uint32_t cn_select_mask<CRYPTONIGHT_LITE>()  { return CRYPTONIGHT_LITE_MASK; }
+template<> inline constexpr uint32_t cn_select_mask<CRYPTONIGHT_HEAVY>() { return CRYPTONIGHT_HEAVY_MASK; }
+template<> inline constexpr uint32_t cn_select_mask<CRYPTONIGHT_PICO>()  { return CRYPTONIGHT_PICO_MASK; }
+
+
+inline uint32_t cn_select_mask(Algo algorithm)
+{
+    // Run-time lookup of the address mask for `algorithm`.
+    // Any algorithm not listed here yields 0.
+    switch (algorithm) {
+    case CRYPTONIGHT_PICO:  return CRYPTONIGHT_PICO_MASK;
+    case CRYPTONIGHT_HEAVY: return CRYPTONIGHT_HEAVY_MASK;
+    case CRYPTONIGHT_LITE:  return CRYPTONIGHT_LITE_MASK;
+    case CRYPTONIGHT:       return CRYPTONIGHT_MASK;
+    default:                break;
+    }
+
+    return 0;
+}
+
+
+template<Algo ALGO, Variant variant> inline constexpr uint32_t cn_select_iter()        { return 0; } // primary template: 0 marks an ALGO/variant pair with no specialisation below
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_0>()          { return CRYPTONIGHT_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_1>()          { return CRYPTONIGHT_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_2>()          { return CRYPTONIGHT_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_WOW>()        { return CRYPTONIGHT_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_4>()          { return CRYPTONIGHT_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_XTL>()        { return CRYPTONIGHT_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_HALF>()       { return CRYPTONIGHT_HALF_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_MSR>()        { return CRYPTONIGHT_HALF_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_XAO>()        { return CRYPTONIGHT_XAO_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_RTO>()        { return CRYPTONIGHT_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_GPU>()        { return CRYPTONIGHT_GPU_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_RWZ>()        { return CRYPTONIGHT_WALTZ_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_ZLS>()        { return CRYPTONIGHT_ZLS_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT, VARIANT_DOUBLE>()     { return CRYPTONIGHT_DOUBLE_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT_LITE, VARIANT_0>()     { return CRYPTONIGHT_LITE_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT_LITE, VARIANT_1>()     { return CRYPTONIGHT_LITE_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT_HEAVY, VARIANT_0>()    { return CRYPTONIGHT_HEAVY_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT_HEAVY, VARIANT_XHV>()  { return CRYPTONIGHT_HEAVY_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT_HEAVY, VARIANT_TUBE>() { return CRYPTONIGHT_HEAVY_ITER; }
+template<> inline constexpr uint32_t cn_select_iter<CRYPTONIGHT_PICO, VARIANT_TRTL>()  { return CRYPTONIGHT_TRTL_ITER; }
+
+
+inline uint32_t cn_select_iter(Algo algorithm, Variant variant)
+{
+    // Run-time lookup of the main-loop iteration count.
+    //
+    // Variants with their own iteration count are resolved first; every other
+    // variant falls back to the default count of `algorithm`. Returns 0 when
+    // neither the variant nor the algorithm is recognised.
+    //
+    // The variant cases mirror the cn_select_iter<ALGO, VARIANT>() specialisations:
+    // VARIANT_XAO uses CRYPTONIGHT_XAO_ITER, while VARIANT_RTO has no special
+    // count and therefore takes the per-algorithm default.
+    switch (variant) {
+    case VARIANT_MSR:
+    case VARIANT_HALF:
+        return CRYPTONIGHT_HALF_ITER;
+
+    case VARIANT_GPU:
+        return CRYPTONIGHT_GPU_ITER;
+
+    case VARIANT_XAO:
+        return CRYPTONIGHT_XAO_ITER;
+
+    case VARIANT_DOUBLE:
+        return CRYPTONIGHT_DOUBLE_ITER;
+
+    case VARIANT_TRTL:
+        return CRYPTONIGHT_TRTL_ITER;
+
+    case VARIANT_RWZ:
+        return CRYPTONIGHT_WALTZ_ITER;
+
+    case VARIANT_ZLS:
+        return CRYPTONIGHT_ZLS_ITER;
+
+    default:
+        break;
+    }
+
+    switch(algorithm)
+    {
+    case CRYPTONIGHT:
+        return CRYPTONIGHT_ITER;
+
+    case CRYPTONIGHT_LITE:
+        return CRYPTONIGHT_LITE_ITER;
+
+    case CRYPTONIGHT_HEAVY:
+        return CRYPTONIGHT_HEAVY_ITER;
+
+    case CRYPTONIGHT_PICO:
+        return CRYPTONIGHT_TRTL_ITER;
+
+    default:
+        break;
+    }
+
+    return 0;
+}
+
+
+template<Variant variant> inline constexpr Variant cn_base_variant()  { return VARIANT_0; } // primary template: variants without a specialisation below map to VARIANT_0
+template<> inline constexpr Variant cn_base_variant<VARIANT_0>()      { return VARIANT_0; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_1>()      { return VARIANT_1; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_TUBE>()   { return VARIANT_1; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_XTL>()    { return VARIANT_1; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_MSR>()    { return VARIANT_1; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_XHV>()    { return VARIANT_0; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_XAO>()    { return VARIANT_0; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_RTO>()    { return VARIANT_1; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_2>()      { return VARIANT_2; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_HALF>()   { return VARIANT_2; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_TRTL>()   { return VARIANT_2; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_GPU>()    { return VARIANT_GPU; } // GPU is its own base
+template<> inline constexpr Variant cn_base_variant<VARIANT_WOW>()    { return VARIANT_2; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_4>()      { return VARIANT_2; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_RWZ>()    { return VARIANT_2; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_ZLS>()    { return VARIANT_2; }
+template<> inline constexpr Variant cn_base_variant<VARIANT_DOUBLE>() { return VARIANT_2; }
+
+
+inline Variant cn_base_variant(Variant variant)
+{
+    // Run-time mapping of a variant to the base variant it derives from.
+    // Anything not listed below is reported as VARIANT_2.
+    switch (variant) {
+    case VARIANT_GPU:
+        return VARIANT_GPU;
+
+    case VARIANT_RTO:
+    case VARIANT_MSR:
+    case VARIANT_XTL:
+    case VARIANT_TUBE:
+    case VARIANT_1:
+        return VARIANT_1;
+
+    case VARIANT_XAO:
+    case VARIANT_XHV:
+    case VARIANT_0:
+        return VARIANT_0;
+
+    default:
+        break;
+    }
+
+    return VARIANT_2;
+}
+
+
+template<Variant variant> inline constexpr bool cn_is_cryptonight_r() { return false; } // true only for the two specialisations below
+template<> inline constexpr bool cn_is_cryptonight_r<VARIANT_WOW>()   { return true; }
+template<> inline constexpr bool cn_is_cryptonight_r<VARIANT_4>()     { return true; }
+
+} /* namespace xmrig */
+
+
+#endif /* XMRIG_CRYPTONIGHT_CONSTANTS_H */
--- a/src/crypto/cn/CryptoNight_monero.h
+++ b/src/crypto/cn/CryptoNight_monero.h
@@ -0,0 +1,206 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018      SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_MONERO_H
+#define XMRIG_CRYPTONIGHT_MONERO_H
+
+#include <fenv.h>
+#include <math.h>
+
+// VARIANT ALTERATIONS
+#ifndef XMRIG_ARM
+#   define VARIANT1_INIT(part) /* declares tweak1_2_<part>; for VARIANT_1 bases it is the 8 input bytes at offset 35 of message <part> XOR 64-bit state word 24, otherwise 0 */ \
+    uint64_t tweak1_2_##part = 0; \
+    if (BASE == xmrig::VARIANT_1) { \
+        tweak1_2_##part = (*reinterpret_cast<const uint64_t*>(input + 35 + part * size) ^ \
+                          *(reinterpret_cast<const uint64_t*>(ctx[part]->state) + 24)); \
+    }
+#else
+#   define VARIANT1_INIT(part) /* same value as the non-ARM form, but the input bytes are fetched with memcpy instead of a pointer cast */ \
+    uint64_t tweak1_2_##part = 0; \
+    if (BASE == xmrig::VARIANT_1) { \
+        memcpy(&tweak1_2_##part, input + 35 + part * size, sizeof tweak1_2_##part); \
+        tweak1_2_##part ^= *(reinterpret_cast<const uint64_t*>(ctx[part]->state) + 24); \
+    }
+#endif
+
+#define VARIANT1_1(p) /* VARIANT_1 bases only: XORs byte 11 of p with a value in bits 4-5, looked up in the packed table 0x75310 by bits 0, 4 and 5 of the old byte */ \
+    if (BASE == xmrig::VARIANT_1) { \
+        const uint8_t tmp = reinterpret_cast<const uint8_t*>(p)[11]; \
+        static const uint32_t table = 0x75310; \
+        const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \
+        ((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \
+    }
+
+#define VARIANT1_2(p, part) /* VARIANT_1 bases only: XORs p with tweak1_2_<part> (declared by VARIANT1_INIT) */ \
+    if (BASE == xmrig::VARIANT_1) { \
+        (p) ^= tweak1_2_##part; \
+    }
+
+
+#ifndef XMRIG_ARM
+#   define VARIANT2_INIT(part) /* non-ARM: declares the division/sqrt carry values as __m128i, seeded from state words h<part>[12] and h<part>[13] */ \
+    __m128i division_result_xmm_##part = _mm_cvtsi64_si128(h##part[12]); \
+    __m128i sqrt_result_xmm_##part = _mm_cvtsi64_si128(h##part[13]);
+
+#ifdef _MSC_VER
+#   define VARIANT2_SET_ROUNDING_MODE() if (BASE == xmrig::VARIANT_2) { _control87(RC_DOWN, MCW_RC); } // MSVC: round-down mode via _control87, VARIANT_2 bases only
+#else
+#   define VARIANT2_SET_ROUNDING_MODE() if (BASE == xmrig::VARIANT_2) { fesetround(FE_DOWNWARD); } // other compilers: round-down mode via fesetround, VARIANT_2 bases only
+#endif
+
+#   define VARIANT2_INTEGER_MATH(part, cl, cx) /* non-ARM: mixes the previous division/sqrt results into cl, then computes the next pair from cx; the divisor d is forced odd with its top bit set, so it is never zero */ \
+    do { \
+        const uint64_t sqrt_result = static_cast<uint64_t>(_mm_cvtsi128_si64(sqrt_result_xmm_##part)); \
+        const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \
+        cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm_##part)) ^ (sqrt_result << 32); \
+        const uint32_t d = static_cast<uint32_t>(cx_0 + (sqrt_result << 1)) | 0x80000001UL; \
+        const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
+        const uint64_t division_result = static_cast<uint32_t>(cx_1 / d) + ((cx_1 % d) << 32); /* quotient in the low 32 bits, remainder in the high 32 bits */ \
+        division_result_xmm_##part = _mm_cvtsi64_si128(static_cast<int64_t>(division_result)); \
+        sqrt_result_xmm_##part = int_sqrt_v2(cx_0 + division_result); \
+    } while (0)
+
+#   define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c, reverse) /* non-ARM: rotates the three 16-byte chunks at offset^0x10/0x20/0x30 while adding _b1, _b and _a; `reverse` swaps which chunk is loaded as chunk1 and chunk3 */ \
+    do { \
+        const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ (reverse ? 0x30 : 0x10)))); \
+        const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
+        const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ (reverse ? 0x10 : 0x30)))); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
+        if (VARIANT == xmrig::VARIANT_4) { /* VARIANT_4 also XORs all three loaded chunks into _c */ \
+            _c = _mm_xor_si128(_mm_xor_si128(_c, chunk3), _mm_xor_si128(chunk1, chunk2)); \
+        } \
+    } while (0)
+
+#   define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo, reverse) /* non-ARM: like VARIANT2_SHUFFLE, but chunk1 is first XORed with (hi, lo) and hi/lo are updated from the chunk at offset^0x20; `reverse` swaps which of chunk1/chunk3 goes to the 0x10 and 0x20 slots */ \
+    do { \
+        const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \
+        const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
+        hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \
+        lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1]; \
+        const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
+        if (reverse) { \
+            _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk1, _b1)); \
+            _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk3, _b)); \
+        } else { \
+            _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
+            _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
+        } \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
+    } while (0)
+
+#else
+#   define VARIANT2_INIT(part) /* ARM: declares the division/sqrt carry values as plain uint64_t, seeded from state words h<part>[12] and h<part>[13] */ \
+    uint64_t division_result_##part = h##part[12]; \
+    uint64_t sqrt_result_##part = h##part[13];
+
+#   define VARIANT2_INTEGER_MATH(part, cl, cx) /* ARM: mixes the previous division/sqrt results into cl, then computes the next pair from cx; the square root is estimated with floating-point sqrt() and then adjusted by -1/0/+1 using integer comparisons */ \
+    do { \
+        const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \
+        cl ^= division_result_##part ^ (sqrt_result_##part << 32); \
+        const uint32_t d = static_cast<uint32_t>(cx_0 + (sqrt_result_##part << 1)) | 0x80000001UL; /* forced odd with the top bit set, so never zero */ \
+        const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
+        division_result_##part = static_cast<uint32_t>(cx_1 / d) + ((cx_1 % d) << 32); /* quotient in the low 32 bits, remainder in the high 32 bits */ \
+        const uint64_t sqrt_input = cx_0 + division_result_##part; \
+        sqrt_result_##part = sqrt(sqrt_input + 18446744073709551616.0) * 2.0 - 8589934592.0; /* constants are 2^64 and 2^33 */ \
+        const uint64_t s = sqrt_result_##part >> 1; \
+        const uint64_t b = sqrt_result_##part & 1; \
+        const uint64_t r2 = (uint64_t)(s) * (s + b) + (sqrt_result_##part << 32); \
+        sqrt_result_##part += ((r2 + b > sqrt_input) ? -1 : 0) + ((r2 + (1ULL << 32) < sqrt_input - s) ? 1 : 0); \
+    } while (0)
+
+#   define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c, reverse) /* ARM: same chunk rotation as the non-ARM form, written with NEON 64-bit lane loads, adds and stores */ \
+    do { \
+        const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ (reverse ? 0x30 : 0x10)))); \
+        const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20))); \
+        const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ (reverse ? 0x10 : 0x30)))); \
+        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \
+        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \
+        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \
+        if (VARIANT == xmrig::VARIANT_4) { /* VARIANT_4 also XORs all three loaded chunks into _c */ \
+            _c = veorq_u64(veorq_u64(_c, chunk3), veorq_u64(chunk1, chunk2)); \
+        } \
+    } while (0)
+
+#   define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo, reverse) /* ARM: NEON form of VARIANT2_SHUFFLE2; chunk1 is XORed with the pair (hi in lane 0, lo in lane 1) and hi/lo are updated from the chunk at offset^0x20 */ \
+    do { \
+        const uint64x2_t chunk1 = veorq_u64(vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10))), vcombine_u64(vcreate_u64(hi), vcreate_u64(lo))); \
+        const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20))); \
+        hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \
+        lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1]; \
+        const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30))); \
+        if (reverse) { \
+            vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b1))); \
+            vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b))); \
+        } else { \
+            vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \
+            vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \
+        } \
+        vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \
+    } while (0)
+#endif
+
+#define SWAP32LE(x) x // identity: no byte swapping is performed
+#define SWAP64LE(x) x // identity: no byte swapping is performed
+#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length)) // forwards to blake256_hash with the output pointer first
+
+#ifndef NOINLINE // keep any definition supplied before this header
+#ifdef __GNUC__
+#define NOINLINE __attribute__ ((noinline))
+#elif _MSC_VER
+#define NOINLINE __declspec(noinline)
+#else
+#define NOINLINE // unknown compiler: expands to nothing
+#endif
+#endif
+
+#include "common/xmrig.h"
+#include "crypto/cn/r/variant4_random_math.h"
+
+#define VARIANT4_RANDOM_MATH_INIT(part) /* declares r<part>[9] and code<part>[256]; for WOW/VARIANT_4 seeds r[0..3] from the 32-bit halves of h<part>[12] and h<part>[13]; v4_random_math_init is called for every variant */ \
+  uint32_t r##part[9]; \
+  struct V4_Instruction code##part[256]; \
+  if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { \
+    r##part[0] = (uint32_t)(h##part[12]); \
+    r##part[1] = (uint32_t)(h##part[12] >> 32); \
+    r##part[2] = (uint32_t)(h##part[13]); \
+    r##part[3] = (uint32_t)(h##part[13] >> 32); \
+  } \
+  v4_random_math_init<VARIANT>(code##part, height);
+
+#define VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1) /* WOW/VARIANT_4 only: mixes r[0..3] into cl, loads r[4..8] from the low 32 bits of al, ah, bx0, bx1 and of the upper half of bx1, then runs code<part> over r<part> */ \
+  if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { \
+    cl ^= (r##part[0] + r##part[1]) | ((uint64_t)(r##part[2] + r##part[3]) << 32); \
+    r##part[4] = static_cast<uint32_t>(al); \
+    r##part[5] = static_cast<uint32_t>(ah); \
+    r##part[6] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx0)); \
+    r##part[7] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx1)); \
+    r##part[8] = static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \
+    v4_random_math(code##part, r##part); \
+  }
+
+#endif /* XMRIG_CRYPTONIGHT_MONERO_H */
--- a/src/crypto/cn/CryptoNight_test.h
+++ b/src/crypto/cn/CryptoNight_test.h
@@ -0,0 +1,388 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_TEST_H
+#define XMRIG_CRYPTONIGHT_TEST_H
+
+
+#include <stddef.h>
+#include <stdint.h>
+
+const static uint8_t test_input[380] = {
+    0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00,
+    0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B,
+    0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62,
+    0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92,
+    0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01,
+    0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19,
+    0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9,
+    0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F,
+    0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46,
+    0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02,
+    0x07, 0x07, 0xB4, 0x87, 0xD0, 0xD6, 0x05, 0x26, 0xE0, 0xC6, 0xDD, 0x9B, 0xC7, 0x18, 0xC3, 0xCF,
+    0x52, 0x04, 0xBD, 0x4F, 0x9B, 0x27, 0xF6, 0x73, 0xB9, 0x3F, 0xEF, 0x7B, 0xB2, 0xF7, 0x2B, 0xBB,
+    0x3F, 0x3E, 0x9C, 0x3E, 0x9D, 0x33, 0x1E, 0xDE, 0xAD, 0xBE, 0xEF, 0x4E, 0x00, 0x91, 0x81, 0x29,
+    0x74, 0xB2, 0x70, 0xE7, 0x6D, 0xD2, 0x2A, 0x5F, 0x52, 0x04, 0x93, 0xE6, 0x18, 0x89, 0x40, 0xD8,
+    0xC6, 0xE3, 0x90, 0x6E, 0xAA, 0x6A, 0xB7, 0xE2, 0x08, 0x7E, 0x78, 0x0E,
+    0x01, 0x00, 0xEE, 0xB2, 0xD1, 0xD6, 0x05, 0xFF, 0x27, 0x7F, 0x26, 0xDB, 0xAA, 0xB2, 0xC9, 0x26,
+    0x30, 0xC6, 0xCF, 0x11, 0x64, 0xEA, 0x6C, 0x8A, 0xE0, 0x98, 0x01, 0xF8, 0x75, 0x4B, 0x49, 0xAF,
+    0x79, 0x70, 0xAE, 0xEE, 0xA7, 0x62, 0x2C, 0x00, 0x00, 0x00, 0x00, 0x47, 0x8C, 0x63, 0xE7, 0xD8,
+    0x40, 0x02, 0x3C, 0xDA, 0xEA, 0x92, 0x52, 0x53, 0xAC, 0xFD, 0xC7, 0x8A, 0x4C, 0x31, 0xB2, 0xF2,
+    0xEC, 0x72, 0x7B, 0xFF, 0xCE, 0xC0, 0xE7, 0x12, 0xD4, 0xE9, 0x2A, 0x01,
+    0x07, 0x07, 0xA9, 0xB7, 0xD1, 0xD6, 0x05, 0x3F, 0x0D, 0x5E, 0xFD, 0xC7, 0x03, 0xFC, 0xFC, 0xD2,
+    0xCE, 0xBC, 0x44, 0xD8, 0xAB, 0x44, 0xA6, 0xA0, 0x3A, 0xE4, 0x4D, 0x8F, 0x15, 0xAF, 0x62, 0x17,
+    0xD1, 0xE0, 0x92, 0x85, 0xE4, 0x73, 0xF9, 0x00, 0x00, 0x00, 0xA0, 0xFC, 0x09, 0xDE, 0xAB, 0xF5,
+    0x8B, 0x6F, 0x1D, 0xCA, 0xA8, 0xBA, 0xAC, 0x74, 0xDD, 0x74, 0x19, 0xD5, 0xD6, 0x10, 0xEC, 0x38,
+    0xCF, 0x50, 0x29, 0x6A, 0x07, 0x0B, 0x93, 0x8F, 0x8F, 0xA8, 0x10, 0x04
+};
+
+
+struct cn_r_test_input_data // element type of the cn_r_test_input table below
+{
+    uint64_t height;  // height value paired with this input -- presumably the block height given to the hash; confirm in the test driver
+    size_t size;      // number of meaningful leading bytes in data; matches the initializer length of every cn_r_test_input entry
+    uint8_t data[64]; // input bytes; the table entries are ASCII text, and entries shorter than 64 bytes leave the tail zero-initialized
+};
+
+
+const static cn_r_test_input_data cn_r_test_input[] = {
+    { 1806260, 44, { 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74 } },
+    { 1806261, 50, { 0x4c, 0x6f, 0x72, 0x65, 0x6d, 0x20, 0x69, 0x70, 0x73, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x73, 0x69, 0x74, 0x20, 0x61, 0x6d, 0x65, 0x74, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x63, 0x74, 0x65, 0x74, 0x75, 0x72, 0x20, 0x61, 0x64, 0x69, 0x70, 0x69, 0x73, 0x63, 0x69, 0x6e, 0x67 } },
+    { 1806262, 48, { 0x65, 0x6c, 0x69, 0x74, 0x2c, 0x20, 0x73, 0x65, 0x64, 0x20, 0x64, 0x6f, 0x20, 0x65, 0x69, 0x75, 0x73, 0x6d, 0x6f, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x63, 0x69, 0x64, 0x69, 0x64, 0x75, 0x6e, 0x74, 0x20, 0x75, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x65 } },
+    { 1806263, 48, { 0x65, 0x74, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x6d, 0x61, 0x67, 0x6e, 0x61, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x61, 0x2e, 0x20, 0x55, 0x74, 0x20, 0x65, 0x6e, 0x69, 0x6d, 0x20, 0x61, 0x64, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x20, 0x76, 0x65, 0x6e, 0x69, 0x61, 0x6d, 0x2c } },
+    { 1806264, 46, { 0x71, 0x75, 0x69, 0x73, 0x20, 0x6e, 0x6f, 0x73, 0x74, 0x72, 0x75, 0x64, 0x20, 0x65, 0x78, 0x65, 0x72, 0x63, 0x69, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x6c, 0x6c, 0x61, 0x6d, 0x63, 0x6f, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x69, 0x73, 0x20, 0x6e, 0x69, 0x73, 0x69 } },
+    { 1806265, 45, { 0x75, 0x74, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x69, 0x70, 0x20, 0x65, 0x78, 0x20, 0x65, 0x61, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x64, 0x6f, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x71, 0x75, 0x61, 0x74, 0x2e, 0x20, 0x44, 0x75, 0x69, 0x73, 0x20, 0x61, 0x75, 0x74, 0x65 } },
+    { 1806266, 47, { 0x69, 0x72, 0x75, 0x72, 0x65, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x20, 0x72, 0x65, 0x70, 0x72, 0x65, 0x68, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x76, 0x6f, 0x6c, 0x75, 0x70, 0x74, 0x61, 0x74, 0x65, 0x20, 0x76, 0x65, 0x6c, 0x69, 0x74 } },
+    { 1806267, 44, { 0x65, 0x73, 0x73, 0x65, 0x20, 0x63, 0x69, 0x6c, 0x6c, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x65, 0x75, 0x20, 0x66, 0x75, 0x67, 0x69, 0x61, 0x74, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x20, 0x70, 0x61, 0x72, 0x69, 0x61, 0x74, 0x75, 0x72, 0x2e } },
+    { 1806268, 47, { 0x45, 0x78, 0x63, 0x65, 0x70, 0x74, 0x65, 0x75, 0x72, 0x20, 0x73, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x63, 0x63, 0x61, 0x65, 0x63, 0x61, 0x74, 0x20, 0x63, 0x75, 0x70, 0x69, 0x64, 0x61, 0x74, 0x61, 0x74, 0x20, 0x6e, 0x6f, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x69, 0x64, 0x65, 0x6e, 0x74, 0x2c } },
+    { 1806269, 62, { 0x73, 0x75, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x63, 0x75, 0x6c, 0x70, 0x61, 0x20, 0x71, 0x75, 0x69, 0x20, 0x6f, 0x66, 0x66, 0x69, 0x63, 0x69, 0x61, 0x20, 0x64, 0x65, 0x73, 0x65, 0x72, 0x75, 0x6e, 0x74, 0x20, 0x6d, 0x6f, 0x6c, 0x6c, 0x69, 0x74, 0x20, 0x61, 0x6e, 0x69, 0x6d, 0x20, 0x69, 0x64, 0x20, 0x65, 0x73, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x75, 0x6d, 0x2e } },
+};
+
+
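+// "cn/wow": ten 32-byte expected hashes, one per row; presumably row i pairs with cn_r_test_input[i] (confirm in the self-test)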
+const static uint8_t test_output_wow[] = {
+    0x9d, 0x47, 0xbf, 0x4c, 0x41, 0xb7, 0xe8, 0xe7, 0x27, 0xe6, 0x81, 0x71, 0x5a, 0xcb, 0x47, 0xfa, 0x16, 0x77, 0xcd, 0xba, 0x9c, 0xa7, 0xbc, 0xb0, 0x5a, 0xd8, 0xcc, 0x8a, 0xbd, 0x5d, 0xaa, 0x66,
+    0x0d, 0x4a, 0x49, 0x5c, 0xb8, 0x44, 0xa3, 0xca, 0x8b, 0xa4, 0xed, 0xb8, 0xe6, 0xbc, 0xf8, 0x29, 0xef, 0x1c, 0x06, 0xd9, 0xcd, 0xea, 0x2b, 0x62, 0xca, 0x46, 0xc2, 0xa2, 0x1b, 0x8b, 0x0a, 0x79,
+    0xa1, 0xd6, 0xd8, 0x48, 0xb5, 0xc5, 0x91, 0x5f, 0xcc, 0xd2, 0xf6, 0x4c, 0xf2, 0x16, 0xc6, 0xb1, 0xa0, 0x2c, 0xf7, 0xc7, 0x7b, 0xc8, 0x0d, 0x8d, 0x4e, 0x51, 0xb4, 0x19, 0xe8, 0x8f, 0xf0, 0xdd,
+    0xaf, 0x3a, 0x85, 0x44, 0xa0, 0x22, 0x1a, 0x14, 0x8c, 0x2a, 0xc9, 0x04, 0x84, 0xb1, 0x98, 0x61, 0xe3, 0xaf, 0xca, 0x33, 0xfe, 0x17, 0x02, 0x1e, 0xfb, 0x8a, 0xd6, 0x49, 0x6b, 0x56, 0x79, 0x15,
+    0x31, 0x33, 0x99, 0xe0, 0x96, 0x3a, 0xe8, 0xa9, 0x9d, 0xab, 0x8a, 0xf6, 0x6d, 0x34, 0x3e, 0x09, 0x7d, 0xae, 0x0c, 0x0f, 0xeb, 0x08, 0xdb, 0xc4, 0x3c, 0xcd, 0xaf, 0xef, 0x55, 0x15, 0xf4, 0x13,
+    0x60, 0x21, 0xc6, 0xef, 0x90, 0xbf, 0xf9, 0xae, 0x94, 0xa7, 0x50, 0x6d, 0x62, 0x3d, 0x3a, 0x7a, 0x86, 0xc1, 0x75, 0x6d, 0x65, 0x5f, 0x50, 0xdd, 0x55, 0x8f, 0x71, 0x6d, 0x64, 0x62, 0x2a, 0x34,
+    0x2b, 0x13, 0x00, 0x05, 0x35, 0xf3, 0xdb, 0x5f, 0x9b, 0x9b, 0x84, 0xa6, 0x5c, 0x43, 0x51, 0xf3, 0x86, 0xcd, 0x2c, 0xde, 0xde, 0xbb, 0x8c, 0x3a, 0xd2, 0xea, 0xb0, 0x86, 0xe6, 0xa3, 0xfe, 0xe5,
+    0xfc, 0x0e, 0x1d, 0xad, 0x8e, 0x89, 0x57, 0x49, 0xdc, 0x90, 0xeb, 0x69, 0x0b, 0xc1, 0xba, 0x05, 0x9a, 0x1c, 0xd7, 0x72, 0xaf, 0xaa, 0xf6, 0x5a, 0x10, 0x6b, 0xf9, 0xe5, 0xe6, 0xb8, 0x05, 0x03,
+    0xb6, 0x0b, 0x0a, 0xfe, 0x14, 0x4d, 0xef, 0xf7, 0xd9, 0x03, 0xed, 0x2d, 0x55, 0x45, 0xe7, 0x7e, 0xbe, 0x66, 0xa3, 0xc5, 0x1f, 0xee, 0x70, 0x16, 0xee, 0xb8, 0xfe, 0xe9, 0xeb, 0x63, 0x0c, 0x0f,
+    0x64, 0x77, 0x4b, 0x27, 0xe7, 0xd5, 0xfe, 0xc8, 0x62, 0xfc, 0x4c, 0x0c, 0x13, 0xac, 0x6b, 0xf0, 0x91, 0x23, 0xb6, 0xf0, 0x5b, 0xb0, 0xe4, 0xb7, 0x5c, 0x97, 0xf3, 0x79, 0xa2, 0xb3, 0xa6, 0x79,
+};
+
+
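+// "cn/r": ten 32-byte expected hashes, one per row; presumably row i pairs with cn_r_test_input[i] (confirm in the self-test)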
+const static uint8_t test_output_r[] = {
+    0xf7, 0x59, 0x58, 0x8a, 0xd5, 0x7e, 0x75, 0x84, 0x67, 0x29, 0x54, 0x43, 0xa9, 0xbd, 0x71, 0x49, 0x0a, 0xbf, 0xf8, 0xe9, 0xda, 0xd1, 0xb9, 0x5b, 0x6b, 0xf2, 0xf5, 0xd0, 0xd7, 0x83, 0x87, 0xbc,
+    0x5b, 0xb8, 0x33, 0xde, 0xca, 0x2b, 0xdd, 0x72, 0x52, 0xa9, 0xcc, 0xd7, 0xb4, 0xce, 0x0b, 0x6a, 0x48, 0x54, 0x51, 0x57, 0x94, 0xb5, 0x6c, 0x20, 0x72, 0x62, 0xf7, 0xa5, 0xb9, 0xbd, 0xb5, 0x66,
+    0x1e, 0xe6, 0x72, 0x8d, 0xa6, 0x0f, 0xbd, 0x8d, 0x7d, 0x55, 0xb2, 0xb1, 0xad, 0xe4, 0x87, 0xa3, 0xcf, 0x52, 0xa2, 0xc3, 0xac, 0x6f, 0x52, 0x0d, 0xb1, 0x2c, 0x27, 0xd8, 0x92, 0x1f, 0x6c, 0xab,
+    0x69, 0x69, 0xfe, 0x2d, 0xdf, 0xb7, 0x58, 0x43, 0x8d, 0x48, 0x04, 0x9f, 0x30, 0x2f, 0xc2, 0x10, 0x8a, 0x4f, 0xcc, 0x93, 0xe3, 0x76, 0x69, 0x17, 0x0e, 0x6d, 0xb4, 0xb0, 0xb9, 0xb4, 0xc4, 0xcb,
+    0x7f, 0x30, 0x48, 0xb4, 0xe9, 0x0d, 0x0c, 0xbe, 0x7a, 0x57, 0xc0, 0x39, 0x4f, 0x37, 0x33, 0x8a, 0x01, 0xfa, 0xe3, 0xad, 0xfd, 0xc0, 0xe5, 0x12, 0x6d, 0x86, 0x3a, 0x89, 0x5e, 0xb0, 0x4e, 0x02,
+    0x1d, 0x29, 0x04, 0x43, 0xa4, 0xb5, 0x42, 0xaf, 0x04, 0xa8, 0x2f, 0x6b, 0x24, 0x94, 0xa6, 0xee, 0x7f, 0x20, 0xf2, 0x75, 0x4c, 0x58, 0xe0, 0x84, 0x90, 0x32, 0x48, 0x3a, 0x56, 0xe8, 0xe2, 0xef,
+    0xc4, 0x3c, 0xc6, 0x56, 0x74, 0x36, 0xa8, 0x6a, 0xfb, 0xd6, 0xaa, 0x9e, 0xaa, 0x7c, 0x27, 0x6e, 0x98, 0x06, 0x83, 0x03, 0x34, 0xb6, 0x14, 0xb2, 0xbe, 0xe2, 0x3c, 0xc7, 0x66, 0x34, 0xf6, 0xfd,
+    0x87, 0xbe, 0x24, 0x79, 0xc0, 0xc4, 0xe8, 0xed, 0xfd, 0xfa, 0xa5, 0x60, 0x3e, 0x93, 0xf4, 0x26, 0x5b, 0x3f, 0x82, 0x24, 0xc1, 0xc5, 0x94, 0x6f, 0xeb, 0x42, 0x48, 0x19, 0xd1, 0x89, 0x90, 0xa4,
+    0xdd, 0x9d, 0x6a, 0x6d, 0x8e, 0x47, 0x46, 0x5c, 0xce, 0xac, 0x08, 0x77, 0xef, 0x88, 0x9b, 0x93, 0xe7, 0xeb, 0xa9, 0x79, 0x55, 0x7e, 0x39, 0x35, 0xd7, 0xf8, 0x6d, 0xce, 0x11, 0xb0, 0x70, 0xf3,
+    0x75, 0xc6, 0xf2, 0xae, 0x49, 0xa2, 0x05, 0x21, 0xde, 0x97, 0x28, 0x5b, 0x43, 0x1e, 0x71, 0x71, 0x25, 0x84, 0x7f, 0xb8, 0x93, 0x5e, 0xd8, 0x4a, 0x61, 0xe7, 0xf8, 0xd3, 0x6a, 0x2c, 0x3d, 0x8e,
+};
+
+
+// "cn/0"
+const static uint8_t test_output_v0[160] = {
+    0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7,
+    0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00,
+    0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66,
+    0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F,
+    0xA1, 0xB4, 0xFA, 0xE3, 0xE5, 0x76, 0xCE, 0xCF, 0xB7, 0x9C, 0xAF, 0x3E, 0x29, 0x92, 0xE4, 0xE0,
+    0x31, 0x24, 0x05, 0x48, 0xBF, 0x8D, 0x5F, 0x7B, 0x11, 0x03, 0x60, 0xAA, 0xD7, 0x50, 0x3F, 0x0C,
+    0x2D, 0x30, 0xF3, 0x87, 0x4F, 0x86, 0xA1, 0x4A, 0xB5, 0xA2, 0x1A, 0x08, 0xD0, 0x44, 0x2C, 0x9D,
+    0x16, 0xE9, 0x28, 0x49, 0xA1, 0xFF, 0x85, 0x6F, 0x12, 0xBB, 0x7D, 0xAB, 0x11, 0x1C, 0xE7, 0xF7,
+    0x2D, 0x9D, 0x19, 0xE4, 0xD2, 0x26, 0x44, 0x1E, 0xCD, 0x22, 0x08, 0x24, 0xA8, 0x97, 0x46, 0x62,
+    0x04, 0x84, 0x90, 0x4A, 0xEE, 0x99, 0x14, 0xED, 0xB8, 0xC6, 0x0D, 0x37, 0xA1, 0x66, 0x17, 0xB0
+};
+
+
+// "cn/1" Cryptonight variant 1 (Monero v7)
+const static uint8_t test_output_v1[160] = {
+    0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9,
+    0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9,
+    0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D,
+    0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22,
+    0xE7, 0x8C, 0x5A, 0x6E, 0x38, 0x30, 0x68, 0x4A, 0x73, 0xFC, 0x1B, 0xC6, 0x6D, 0xFC, 0x8D, 0x98,
+    0xB4, 0xC2, 0x23, 0x39, 0xAD, 0xE0, 0x9D, 0xF6, 0x6D, 0x8C, 0x6A, 0xAA, 0xF9, 0xB2, 0xE3, 0x4C,
+    0xB6, 0x90, 0x6C, 0xE6, 0x15, 0x5E, 0x46, 0x07, 0x9C, 0xB2, 0x6B, 0xAC, 0x3B, 0xAC, 0x1A, 0xDE,
+    0x92, 0x2C, 0xD6, 0x0C, 0x46, 0x9D, 0x9B, 0xC2, 0x84, 0x52, 0x65, 0xF6, 0xBD, 0xFA, 0x0D, 0x74,
+    0x00, 0x66, 0x10, 0x07, 0xF1, 0x19, 0x06, 0x3A, 0x6C, 0xFF, 0xEE, 0xB2, 0x40, 0xE5, 0x88, 0x2B,
+    0x6C, 0xAB, 0x6B, 0x1D, 0x88, 0xB8, 0x44, 0x25, 0xF4, 0xEA, 0xB7, 0xEC, 0xBA, 0x12, 0x8A, 0x24
+};
+
+
+// "cn/2" Cryptonight variant 2 (Monero v8)
+const static uint8_t test_output_v2[160] = {
+    0x97, 0x37, 0x82, 0x82, 0xCF, 0x10, 0xE7, 0xAD, 0x03, 0x3F, 0x7B, 0x80, 0x74, 0xC4, 0x0E, 0x14,
+    0xD0, 0x6E, 0x7F, 0x60, 0x9D, 0xDD, 0xDA, 0x78, 0x76, 0x80, 0xB5, 0x8C, 0x05, 0xF4, 0x3D, 0x21,
+    0x87, 0x1F, 0xCD, 0x68, 0x23, 0xF6, 0xA8, 0x79, 0xBB, 0x3F, 0x33, 0x95, 0x1C, 0x8E, 0x8E, 0x89,
+    0x1D, 0x40, 0x43, 0x88, 0x0B, 0x02, 0xDF, 0xA1, 0xBB, 0x3B, 0xE4, 0x98, 0xB5, 0x0E, 0x75, 0x78,
+    0xE6, 0x0D, 0x24, 0x0F, 0x65, 0x85, 0x60, 0x3A, 0x4A, 0xE5, 0x5F, 0x54, 0x9B, 0xC8, 0x79, 0x93,
+    0xEB, 0x3D, 0x98, 0x2C, 0xFE, 0x9B, 0xFB, 0x15, 0xB6, 0x88, 0x21, 0x94, 0xB0, 0x05, 0x86, 0x5C,
+    0x59, 0x8B, 0x93, 0x7A, 0xDA, 0xD2, 0xA2, 0x14, 0xED, 0xB7, 0xC4, 0x5D, 0xA1, 0xEF, 0x26, 0xF3,
+    0xC7, 0x73, 0x29, 0x4D, 0xF1, 0xC8, 0x2C, 0xE0, 0xD0, 0xE9, 0xED, 0x0C, 0x70, 0x75, 0x05, 0x3E,
+    0x5B, 0xF6, 0xA0, 0x6E, 0xEA, 0xDE, 0x87, 0x0B, 0x06, 0x29, 0x03, 0xBF, 0xB4, 0x85, 0x9D, 0x04,
+    0x75, 0x1A, 0xCD, 0x1E, 0xD6, 0xAA, 0x1B, 0x05, 0x24, 0x6A, 0x2C, 0x80, 0x69, 0x68, 0xDC, 0x97
+};
+
+
+// "cn/xtl" Stellite (XTL)
+const static uint8_t test_output_xtl[160] = {
+    0x8F, 0xE5, 0xF0, 0x5F, 0x02, 0x2A, 0x61, 0x7D, 0xE5, 0x3F, 0x79, 0x36, 0x4B, 0x25, 0xCB, 0xC3,
+    0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1,
+    0x21, 0x26, 0xFF, 0x98, 0xE6, 0x86, 0x08, 0x5B, 0xC9, 0x96, 0x44, 0xA3, 0xB8, 0x4E, 0x28, 0x90,
+    0x76, 0xED, 0xAD, 0xB9, 0xAA, 0xAC, 0x01, 0x94, 0x1D, 0xBE, 0x3E, 0xEA, 0xAD, 0xEE, 0xB2, 0xCF,
+    0xB0, 0x43, 0x4B, 0x88, 0xFC, 0xB2, 0xF3, 0x82, 0x9D, 0xD7, 0xDF, 0x51, 0x97, 0x2C, 0x5A, 0xE3,
+    0xC7, 0x16, 0x0B, 0xC8, 0x7C, 0xB7, 0x2F, 0x1C, 0x55, 0x33, 0xCA, 0xE1, 0xEE, 0x08, 0xA4, 0x86,
+    0x60, 0xED, 0x6E, 0x9D, 0x2D, 0x05, 0x0D, 0x7D, 0x02, 0x49, 0x23, 0x39, 0x7C, 0xC3, 0x6D, 0x3D,
+    0x05, 0x51, 0x28, 0xF1, 0x9B, 0x3C, 0xDF, 0xC4, 0xEA, 0x8A, 0xA6, 0x6A, 0x3C, 0x8B, 0xE2, 0xAF,
+    0x47, 0x00, 0xFC, 0x36, 0xED, 0x50, 0xBB, 0xD2, 0x2E, 0x63, 0x4B, 0x93, 0x11, 0x0C, 0xA7, 0xBA,
+    0x32, 0x6E, 0x47, 0x4D, 0xCE, 0xCC, 0x82, 0x54, 0x1D, 0x06, 0xF8, 0x06, 0x86, 0xBD, 0x22, 0x48
+};
+
+
+// "cn/half"
+const static uint8_t test_output_half[160] = {
+    0x5D, 0x4F, 0xBC, 0x35, 0x60, 0x97, 0xEA, 0x64, 0x40, 0xB0, 0x88, 0x8E, 0xDE, 0xB6, 0x35, 0xDD,
+    0xC8, 0x4A, 0x0E, 0x39, 0x7C, 0x86, 0x84, 0x56, 0x89, 0x5C, 0x3F, 0x29, 0xBE, 0x73, 0x12, 0xA7,
+    0x02, 0xE6, 0x1D, 0x2B, 0xBC, 0x84, 0xB6, 0x71, 0x96, 0x71, 0xD5, 0x0C, 0xAC, 0x76, 0x0E, 0x6B,
+    0xF1, 0xF0, 0x55, 0x34, 0x15, 0x29, 0x93, 0x04, 0x2D, 0xED, 0xD2, 0x33, 0x50, 0x6E, 0xBE, 0x25,
+    0xD0, 0xFD, 0x8E, 0xC6, 0x15, 0xD5, 0x12, 0x53, 0x7B, 0x26, 0xF6, 0x01, 0xA5, 0xA8, 0xBE, 0x7C,
+    0xCF, 0x5E, 0x19, 0xB7, 0x63, 0x0D, 0x0F, 0x02, 0x2B, 0xD7, 0xC4, 0x8C, 0x12, 0x24, 0x80, 0x02,
+    0xE7, 0xB7, 0xA0, 0x4F, 0x94, 0xF9, 0x46, 0xB5, 0x18, 0x64, 0x7E, 0x4E, 0x9C, 0x81, 0x6C, 0x60,
+    0x7D, 0x2E, 0xEA, 0xCF, 0x90, 0xCB, 0x68, 0x09, 0xC9, 0x53, 0xF6, 0xA9, 0xCA, 0x0C, 0xAC, 0xDC,
+    0xFD, 0x07, 0xDA, 0x24, 0x1D, 0xD1, 0x35, 0x32, 0x3C, 0xE8, 0x64, 0x44, 0x5E, 0xCB, 0xB5, 0x00,
+    0x69, 0xF4, 0x6F, 0xBB, 0x62, 0x0D, 0x25, 0xD8, 0xAC, 0x20, 0x90, 0xC5, 0x1B, 0xD3, 0x5F, 0xCA
+};
+
+
+// "cn/msr" Masari (MSR)
+const static uint8_t test_output_msr[160] = {
+    0x3C, 0x7A, 0x61, 0x08, 0x4C, 0x5E, 0xB8, 0x65, 0xB4, 0x98, 0xAB, 0x2F, 0x5A, 0x1A, 0xC5, 0x2C,
+    0x49, 0xC1, 0x77, 0xC2, 0xD0, 0x13, 0x34, 0x42, 0xD6, 0x5E, 0xD5, 0x14, 0x33, 0x5C, 0x82, 0xC5,
+    0x69, 0xDF, 0x38, 0x51, 0x1B, 0xB3, 0xEB, 0x7D, 0xE7, 0x6B, 0x08, 0x8E, 0xB6, 0x7E, 0xB7, 0x1C,
+    0x5F, 0x3C, 0x81, 0xC9, 0xF7, 0xCE, 0xAE, 0x28, 0xC0, 0xFE, 0xEB, 0xBA, 0x0B, 0x40, 0x38, 0x1D,
+    0x44, 0xD0, 0xD5, 0xD3, 0x98, 0x1F, 0xA3, 0x0E, 0xE9, 0x89, 0x1A, 0xD7, 0x88, 0xCC, 0x25, 0x76,
+    0x9C, 0xFF, 0x4D, 0x7F, 0x9C, 0xCF, 0x48, 0x07, 0x91, 0xF9, 0x82, 0xF5, 0x4C, 0xE9, 0xBD, 0x82,
+    0x36, 0x36, 0x64, 0x14, 0xED, 0xB8, 0x54, 0xEE, 0x22, 0xA1, 0x66, 0xA3, 0x87, 0x10, 0x76, 0x1F,
+    0x5A, 0xCD, 0x4C, 0x31, 0x4C, 0xBA, 0x41, 0xD2, 0xDB, 0x6C, 0x31, 0x2E, 0x7A, 0x64, 0x15, 0xFF,
+    0xA6, 0xD9, 0xB9, 0x7D, 0x1C, 0x3C, 0x98, 0xDD, 0x16, 0xE6, 0xD3, 0xAA, 0xEF, 0xB6, 0xB3, 0x53,
+    0x74, 0xD1, 0xAC, 0x5C, 0x04, 0x26, 0x7D, 0x71, 0xDE, 0xAB, 0x66, 0x28, 0x91, 0x3A, 0x6F, 0x4F
+};
+
+
+// "cn/xao" Alloy (XAO)
+const static uint8_t test_output_xao[160] = {
+    0x9A, 0x29, 0xD0, 0xC4, 0xAF, 0xDC, 0x63, 0x9B, 0x65, 0x53, 0xB1, 0xC8, 0x37, 0x35, 0x11, 0x4C,
+    0x5D, 0x77, 0x16, 0x21, 0x42, 0x97, 0x5C, 0xB8, 0x50, 0xC0, 0xA5, 0x1F, 0x64, 0x07, 0xBD, 0x33,
+    0xF1, 0xC9, 0x98, 0x40, 0x42, 0xDE, 0x39, 0xD1, 0xBA, 0x2D, 0xAD, 0xEC, 0xFE, 0xEA, 0xD8, 0x46,
+    0x56, 0x1C, 0x32, 0x90, 0x42, 0x63, 0x10, 0x80, 0xD7, 0x01, 0xE4, 0xE6, 0x20, 0xB3, 0x60, 0x45,
+    0x05, 0xE5, 0xC2, 0x18, 0xCD, 0x07, 0xA4, 0x40, 0x42, 0x91, 0xE2, 0xA4, 0x52, 0x54, 0x79, 0xBA,
+    0xCD, 0x7E, 0x61, 0x2D, 0x7F, 0x7E, 0x69, 0x5E, 0xD7, 0xC0, 0x06, 0x65, 0xD7, 0xA1, 0xB8, 0xB8,
+    0x1E, 0x31, 0x1C, 0xD3, 0xB7, 0xBC, 0x78, 0x3C, 0x01, 0xAF, 0x77, 0xAA, 0xF3, 0x0F, 0x4C, 0xF2,
+    0xD1, 0x8B, 0x58, 0xC7, 0xEB, 0x99, 0x91, 0x53, 0x43, 0x71, 0x47, 0x99, 0x9E, 0x04, 0xA4, 0xEA,
+    0xB8, 0xA3, 0xB0, 0x9E, 0x09, 0xF5, 0x57, 0x5C, 0xCF, 0x8A, 0xC6, 0xCA, 0x88, 0x51, 0x9A, 0x01,
+    0x31, 0xCC, 0x0C, 0xA6, 0x53, 0xB5, 0x5F, 0xFD, 0x7D, 0x29, 0x3A, 0x35, 0xE9, 0x0E, 0x25, 0x6C
+};
+
+
+// "cn/rto" Arto (RTO)
+const static uint8_t test_output_rto[160] = {
+    0x82, 0x66, 0x1E, 0x1C, 0x6E, 0x64, 0x36, 0x66, 0x84, 0x06, 0x32, 0x7A, 0x9B, 0xB1, 0x13, 0x19,
+    0xA5, 0x56, 0x16, 0x15, 0xDF, 0xEC, 0x1C, 0x9E, 0xE3, 0x88, 0x4A, 0x6C, 0x1C, 0xEB, 0x76, 0xA5,
+    0xB3, 0xFB, 0xF4, 0x3F, 0x2B, 0x6A, 0x3A, 0x39, 0xA3, 0x6E, 0x08, 0x33, 0x67, 0x90, 0x31, 0xB9,
+    0x3F, 0x27, 0xE4, 0x79, 0x32, 0x61, 0x6B, 0x5C, 0x8A, 0xF8, 0xAF, 0xC0, 0x60, 0xFD, 0x83, 0xB7,
+    0x11, 0x11, 0x89, 0xB4, 0xDC, 0xAE, 0x40, 0xC8, 0x64, 0xAA, 0x4D, 0x19, 0x23, 0x7B, 0xD3, 0x27,
+    0xB2, 0x0F, 0xA7, 0x50, 0x7D, 0xCA, 0xF5, 0x03, 0x06, 0xB2, 0x26, 0x62, 0xF3, 0x68, 0x2D, 0x30,
+    0x6F, 0x93, 0x1E, 0xFF, 0xCD, 0x85, 0x40, 0x28, 0x5F, 0xC3, 0x8C, 0x76, 0x51, 0x9E, 0xD5, 0x06,
+    0x32, 0xD6, 0x35, 0x83, 0xF6, 0x3B, 0x54, 0x4F, 0xA1, 0x9C, 0x13, 0xD8, 0xC4, 0x0E, 0x01, 0x2F,
+    0x29, 0xDB, 0x8C, 0x1C, 0xB7, 0x06, 0x86, 0x79, 0x6D, 0xFF, 0x9F, 0x89, 0x3B, 0x3A, 0xA5, 0x79,
+    0xE7, 0x81, 0x4E, 0x2A, 0xBD, 0x62, 0xC1, 0x1B, 0x7C, 0xB9, 0x33, 0x7B, 0xEE, 0x95, 0x80, 0xB3
+};
+
+// "cn/rwz"
+const static uint8_t test_output_rwz[160] = {
+    0x5f, 0x56, 0xc6, 0xb0, 0x99, 0x6b, 0xa2, 0x3e, 0x0b, 0xba, 0x07, 0x29, 0xc9, 0x90, 0x74, 0x85,
+    0x5a, 0x10, 0xe3, 0x08, 0x7f, 0xdb, 0xfe, 0x94, 0x75, 0x33, 0x54, 0x73, 0x76, 0xf0, 0x75, 0xb8,
+    0x8b, 0x70, 0x43, 0x9a, 0xfc, 0xf5, 0xeb, 0x15, 0xbb, 0xf9, 0xad, 0x9d, 0x2a, 0xbd, 0x72, 0x52,
+    0x49, 0x54, 0x0b, 0x91, 0xea, 0x61, 0x7f, 0x98, 0x7d, 0x39, 0x17, 0xb7, 0xd7, 0x65, 0xff, 0x75,
+    0x13, 0x21, 0x1d, 0xce, 0x61, 0x5a, 0xdc, 0x5f, 0x8c, 0xcb, 0x1f, 0x6f, 0xbb, 0x92, 0x88, 0xc3,
+    0xe3, 0xe2, 0xfc, 0x4f, 0x62, 0xfb, 0xf0, 0x48, 0x02, 0x01, 0xd3, 0xbe, 0x77, 0x6a, 0x40, 0xca,
+    0x9a, 0xe9, 0xba, 0x0c, 0xc0, 0x2b, 0x11, 0xf6, 0x9b, 0xee, 0x24, 0x3a, 0xd8, 0x86, 0x18, 0xd0,
+    0xe8, 0xeb, 0xcb, 0x38, 0x2c, 0xf5, 0x99, 0x83, 0x14, 0x7b, 0x0c, 0x20, 0xbe, 0x50, 0xf4, 0x87,
+    0x83, 0x41, 0x75, 0xd8, 0xd1, 0xdd, 0x4b, 0x73, 0xb3, 0x92, 0x8f, 0xe6, 0x1c, 0x72, 0x70, 0xf5,
+    0x7c, 0xf6, 0x23, 0x3a, 0xb4, 0x5f, 0xdf, 0xde, 0xa6, 0x5a, 0x58, 0xec, 0x13, 0x5a, 0x23, 0x2f
+};
+
+// "cn/zls"
+const static uint8_t test_output_zls[160] = {
+    0x51, 0x6E, 0x33, 0xC6, 0xE4, 0x46, 0xAB, 0xBC, 0xCD, 0xAD, 0x18, 0xC0, 0x4C, 0xD9, 0xA2, 0x5E,
+    0x64, 0x10, 0x28, 0x53, 0xB2, 0x0A, 0x42, 0xDF, 0xDE, 0xAA, 0x8B, 0x59, 0x9E, 0xCF, 0x40, 0xE2,
+    0x0D, 0x62, 0x5B, 0x42, 0x18, 0xE2, 0x76, 0xAD, 0xD0, 0x74, 0x90, 0x60, 0x8D, 0xC4, 0xC7, 0x80,
+    0x17, 0xB5, 0x1B, 0x25, 0x31, 0x39, 0x87, 0xD2, 0x2D, 0x6A, 0x9D, 0x1C, 0x74, 0xF4, 0x43, 0x22,
+    0x4B, 0x97, 0x1F, 0x6A, 0xD0, 0xBE, 0x00, 0x74, 0xEC, 0xC5, 0xD8, 0x3B, 0xE6, 0xF4, 0x03, 0x8A,
+    0x7B, 0xBA, 0x80, 0xCC, 0x9F, 0x00, 0xCB, 0xC2, 0x14, 0x8F, 0xF3, 0xD8, 0x92, 0x73, 0xBF, 0x17,
+    0x3D, 0x9B, 0x22, 0xA3, 0x61, 0x94, 0x41, 0x9E, 0xF9, 0x68, 0x1D, 0x42, 0x48, 0x3B, 0x39, 0x45,
+    0xE2, 0xE6, 0x16, 0x84, 0xFC, 0x21, 0xE6, 0xDA, 0x38, 0x7F, 0x17, 0xAB, 0xD3, 0xF2, 0xCE, 0x1A,
+    0x2F, 0x35, 0xD5, 0x74, 0xFA, 0x45, 0x3B, 0x06, 0xD1, 0x4E, 0x84, 0x3A, 0x5D, 0xE3, 0x0E, 0xA5,
+    0x00, 0x08, 0x64, 0xF0, 0xA6, 0xC8, 0x94, 0x45, 0x08, 0xED, 0x03, 0x95, 0x52, 0xE9, 0xBC, 0x5F
+};
+
+// "cn/double"
+const static uint8_t test_output_double[160] = {
+    0xAE, 0xFB, 0xB3, 0xF0, 0xCC, 0x88, 0x04, 0x6D, 0x11, 0x9F, 0x6C, 0x54, 0xB9, 0x6D, 0x90, 0xC9,
+    0xE8, 0x84, 0xEA, 0x3B, 0x59, 0x83, 0xA6, 0x0D, 0x50, 0xA4, 0x2D, 0x7D, 0x3E, 0xBE, 0x48, 0x21,
+    0x49, 0xCE, 0x8E, 0xF3, 0xBC, 0x8A, 0x36, 0xBF, 0x86, 0x37, 0x89, 0x55, 0x09, 0xBA, 0x22, 0xF8,
+    0xEB, 0x3A, 0xE1, 0xDC, 0x91, 0xF7, 0x62, 0x4B, 0x9F, 0x48, 0xE6, 0x92, 0xBD, 0xE4, 0x5D, 0xC1,
+    0xF1, 0x3C, 0x63, 0x1D, 0xEB, 0x0B, 0x04, 0xA3, 0x30, 0xD5, 0x11, 0x15, 0x4C, 0xCE, 0xEF, 0x4F,
+    0xDF, 0x69, 0xE3, 0x9E, 0xD2, 0x68, 0xFC, 0x1B, 0x6F, 0xE8, 0x08, 0x9C, 0xBB, 0xA5, 0x2B, 0x60,
+    0x52, 0x0F, 0xE5, 0xD2, 0xF3, 0x8A, 0xB3, 0xE1, 0x76, 0x7F, 0x44, 0x25, 0x76, 0xEC, 0xFF, 0xA2,
+    0x0C, 0x64, 0xD0, 0x0E, 0x32, 0x33, 0x28, 0x20, 0x73, 0xE0, 0x31, 0x66, 0x4E, 0x54, 0x83, 0x49,
+    0x51, 0x55, 0x4D, 0x2E, 0x22, 0xB7, 0x51, 0x09, 0x73, 0x61, 0x7E, 0x6A, 0x57, 0x0B, 0x28, 0x3C,
+    0x5E, 0x2E, 0xC1, 0x80, 0x89, 0x39, 0xB3, 0x54, 0x39, 0x52, 0x0E, 0x69, 0x3D, 0xF6, 0xC5, 0x4A
+};
+
+#ifndef XMRIG_NO_AEON
+// "cn-lite/0"
+const static uint8_t test_output_v0_lite[160] = {
+    0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
+    0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88,
+    0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE,
+    0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD,
+    0x38, 0x08, 0xE1, 0x17, 0x0B, 0x99, 0x8D, 0x1A, 0x3C, 0xCE, 0x35, 0xC5, 0xC7, 0x3A, 0x00, 0x2E,
+    0xCB, 0x54, 0xF0, 0x78, 0x2E, 0x9E, 0xDB, 0xC7, 0xDF, 0x2E, 0x71, 0x9A, 0x16, 0x97, 0xC4, 0x18,
+    0x4B, 0x97, 0x07, 0xFE, 0x5D, 0x98, 0x9A, 0xD6, 0xD8, 0xE5, 0x92, 0x66, 0x87, 0x7F, 0x19, 0x37,
+    0xA2, 0x5E, 0xE6, 0x96, 0xB5, 0x97, 0x33, 0x89, 0xE0, 0xA7, 0xC9, 0xDD, 0x4A, 0x7E, 0x9E, 0x53,
+    0xBE, 0x91, 0x2B, 0xF5, 0xF5, 0xAF, 0xDD, 0x09, 0xA2, 0xF4, 0xA4, 0x56, 0xEB, 0x96, 0x22, 0xC9,
+    0x94, 0xFB, 0x7B, 0x28, 0xC9, 0x97, 0x65, 0x04, 0xAC, 0x4F, 0x84, 0x71, 0xDA, 0x6E, 0xD8, 0xC5
+};
+
+
+// "cn-lite/1" AEON v7
+const static uint8_t test_output_v1_lite[160] = {
+    0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22,
+    0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41,
+    0x87, 0xC4, 0xE5, 0x70, 0x65, 0x3E, 0xB4, 0xC2, 0xB4, 0x2B, 0x7A, 0x0D, 0x54, 0x65, 0x59, 0x45,
+    0x2D, 0xFA, 0xB5, 0x73, 0xB8, 0x2E, 0xC5, 0x2F, 0x15, 0x2B, 0x7F, 0xF9, 0x8E, 0x79, 0x44, 0x6F,
+    0x16, 0x08, 0x74, 0xC7, 0xA2, 0xD2, 0xA3, 0x97, 0x95, 0x76, 0xCA, 0x4D, 0x06, 0x39, 0x7A, 0xAB,
+    0x6C, 0x87, 0x58, 0x33, 0x4D, 0xC8, 0x5A, 0xAB, 0x04, 0x27, 0xFE, 0x8B, 0x1C, 0x23, 0x2F, 0x32,
+    0xC0, 0x44, 0xFF, 0x0D, 0xB5, 0x3B, 0x27, 0x96, 0x06, 0x89, 0x7B, 0xA3, 0x0B, 0xD0, 0xCE, 0x9E,
+    0x90, 0x22, 0x77, 0x5A, 0xAD, 0xA1, 0xE5, 0xB6, 0xFC, 0xCB, 0x39, 0x7E, 0x2B, 0x10, 0xEE, 0xB4,
+    0x8C, 0x2B, 0xA4, 0x1F, 0x60, 0x76, 0x39, 0xD7, 0xF6, 0x46, 0x77, 0x18, 0x20, 0xAD, 0xD4, 0xC9,
+    0x87, 0xF7, 0x37, 0xDA, 0xFD, 0xBA, 0xBA, 0xD2, 0xF2, 0x68, 0xDC, 0x26, 0x8D, 0x1B, 0x08, 0xC6
+};
+#endif
+
+
+#ifndef XMRIG_NO_SUMO
+// "cn-heavy/0"
+const static uint8_t test_output_v0_heavy[160] = {
+    0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64,
+    0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2,
+    0x4D, 0x94, 0x7D, 0xD6, 0xDB, 0x6E, 0x07, 0x48, 0x26, 0x4A, 0x51, 0x2E, 0xAC, 0xF3, 0x25, 0x4A,
+    0x1F, 0x1A, 0xA2, 0x5B, 0xFC, 0x0A, 0xAD, 0x82, 0xDE, 0xA8, 0x99, 0x96, 0x88, 0x52, 0xD2, 0x7D,
+    0x3E, 0xE1, 0x23, 0x03, 0x5A, 0x63, 0x7B, 0x66, 0xF6, 0xD7, 0xC2, 0x2A, 0x34, 0x5E, 0x88, 0xE7,
+    0xFA, 0xC4, 0x25, 0x36, 0x54, 0xCB, 0xD2, 0x5C, 0x2F, 0x80, 0x2A, 0xF9, 0xCC, 0x43, 0xF7, 0xCD,
+    0xE5, 0x18, 0xA8, 0x05, 0x60, 0x18, 0xA5, 0x73, 0x72, 0x9B, 0x32, 0xDC, 0x69, 0x83, 0xC1, 0xE1,
+    0x1F, 0xDB, 0xDA, 0x6B, 0xAC, 0xEC, 0x9F, 0x67, 0xF8, 0x27, 0x1D, 0xC7, 0xE6, 0x46, 0x42, 0xF9,
+    0x53, 0x62, 0x0A, 0x54, 0x7D, 0x43, 0xEA, 0x18, 0x94, 0xED, 0xD8, 0x92, 0x06, 0x6A, 0xA1, 0x51,
+    0xAD, 0xB1, 0xFD, 0x89, 0xFB, 0x5C, 0xB4, 0x25, 0x6A, 0xDD, 0xB0, 0x09, 0xC5, 0x72, 0x87, 0xEB
+};
+
+
+// "cn-heavy/xhv"
+const static uint8_t test_output_xhv_heavy[160] = {
+    0x5A, 0xC3, 0xF7, 0x85, 0xC4, 0x90, 0xC5, 0x85, 0x50, 0xEC, 0x95, 0xD2, 0x72, 0x65, 0x63, 0x57,
+    0x7E, 0x7C, 0x1C, 0x21, 0x2D, 0x0C, 0xDE, 0x59, 0x12, 0x73, 0x20, 0x1E, 0x44, 0xFD, 0xD5, 0xB6,
+    0x1F, 0x4E, 0xB2, 0x0A, 0x36, 0x51, 0x4B, 0xF5, 0x4D, 0xC9, 0xE0, 0x90, 0x2C, 0x16, 0x47, 0x3F,
+    0xDE, 0x18, 0x29, 0x8E, 0xBB, 0x34, 0x2B, 0xEF, 0x7A, 0x04, 0x22, 0xD1, 0xB1, 0xF2, 0x48, 0xDA,
+    0xE3, 0x7F, 0x4B, 0x4C, 0xB4, 0xDF, 0xE8, 0xD3, 0x70, 0xE2, 0xE7, 0x44, 0x25, 0x87, 0x12, 0xF9,
+    0x8F, 0x28, 0x0B, 0xCE, 0x2C, 0xEE, 0xDD, 0x88, 0x94, 0x35, 0x48, 0x51, 0xAE, 0xC8, 0x9C, 0x0B,
+    0xED, 0x2F, 0xE6, 0x0F, 0x39, 0x05, 0xB4, 0x4A, 0x8F, 0x38, 0x44, 0x2D, 0x4B, 0xE9, 0x7B, 0x81,
+    0xC6, 0xB0, 0xE0, 0x0A, 0x39, 0x8C, 0x38, 0xFE, 0x63, 0x31, 0x47, 0x65, 0x0D, 0x2B, 0xF4, 0x96,
+    0x13, 0x91, 0x89, 0xB4, 0x5B, 0xA9, 0x2A, 0x7A, 0x09, 0x65, 0x14, 0x20, 0x76, 0x24, 0x6C, 0x80,
+    0x1D, 0x3F, 0x9F, 0xCD, 0x68, 0x39, 0xA9, 0x42, 0x27, 0xC1, 0x0C, 0x53, 0x98, 0x35, 0x60, 0x7A
+};
+
+
+// "cn-heavy/tube"
+const static uint8_t test_output_tube_heavy[160] = {
+    0xFE, 0x53, 0x35, 0x20, 0x76, 0xEA, 0xE6, 0x89, 0xFA, 0x3B, 0x4F, 0xDA, 0x61, 0x46, 0x34, 0xCF,
+    0xC3, 0x12, 0xEE, 0x0C, 0x38, 0x7D, 0xF2, 0xB8, 0xB7, 0x4D, 0xA2, 0xA1, 0x59, 0x74, 0x12, 0x35,
+    0xCD, 0x3F, 0x29, 0xDF, 0x07, 0x4A, 0x14, 0xAD, 0x0B, 0x98, 0x99, 0x37, 0xCA, 0x14, 0x68, 0xA3,
+    0x8D, 0xAE, 0x86, 0xC1, 0xA3, 0x54, 0x05, 0xBE, 0xEA, 0x6D, 0x29, 0x24, 0x0C, 0x82, 0x97, 0x74,
+    0xA0, 0x64, 0x77, 0xCD, 0x8D, 0x8A, 0xC3, 0x10, 0xB4, 0x89, 0x0E, 0xBB, 0x7D, 0xE6, 0x32, 0x8F,
+    0xF4, 0x2D, 0xB6, 0x9E, 0x8A, 0xF9, 0xF8, 0xEE, 0x2C, 0xD0, 0x74, 0xED, 0xA9, 0xAA, 0xA1, 0xFB,
+    0xE2, 0xC9, 0x89, 0x66, 0xD6, 0x66, 0x52, 0xA2, 0x16, 0xDA, 0x36, 0xA0, 0x10, 0x62, 0xD2, 0xB1,
+    0x76, 0xD1, 0x31, 0xE9, 0x1C, 0x08, 0xB6, 0xCA, 0xAF, 0x89, 0xB9, 0x3D, 0x2C, 0xFA, 0x9A, 0x30,
+    0x74, 0x6A, 0x96, 0xA1, 0x95, 0x6C, 0xBB, 0x46, 0x4D, 0xE0, 0xEB, 0x28, 0xBE, 0x2A, 0x8C, 0x34,
+    0x57, 0x79, 0xBE, 0x52, 0xFB, 0xBC, 0x68, 0x43, 0x45, 0xF4, 0xDF, 0xA5, 0xA8, 0xFD, 0x55, 0xA6
+};
+#endif
+
+
+#ifndef XMRIG_NO_CN_PICO
+// "cn-pico/trtl"
+const static uint8_t test_output_pico_trtl[160] = {
+    0x08, 0xF4, 0x21, 0xD7, 0x83, 0x31, 0x17, 0x30, 0x0E, 0xDA, 0x66, 0xE9, 0x8F, 0x4A, 0x25, 0x69,
+    0x09, 0x3D, 0xF3, 0x00, 0x50, 0x01, 0x73, 0x94, 0x4E, 0xFC, 0x40, 0x1E, 0x9A, 0x4A, 0x17, 0xAF,
+    0xB2, 0x17, 0x2E, 0xC9, 0x46, 0x6E, 0x1A, 0xEE, 0x70, 0xEC, 0x85, 0x72, 0xA1, 0x4C, 0x23, 0x3E,
+    0xE3, 0x54, 0x58, 0x2B, 0xCB, 0x93, 0xF8, 0x69, 0xD4, 0x29, 0x74, 0x4D, 0xE5, 0x72, 0x6A, 0x26,
+    0x4E, 0xFD, 0x28, 0xFC, 0xD3, 0x74, 0x8A, 0x83, 0xF3, 0xCA, 0x92, 0x84, 0xE7, 0x4E, 0x10, 0xC2,
+    0x05, 0x62, 0xC7, 0xBE, 0x99, 0x73, 0xED, 0x90, 0xB5, 0x6F, 0xDA, 0x64, 0x71, 0x2D, 0x99, 0x39,
+    0x29, 0xDB, 0x22, 0x2B, 0x97, 0xB6, 0x37, 0x0E, 0x9A, 0x03, 0x65, 0xCC, 0xF7, 0xD0, 0x9A, 0xB7,
+    0x68, 0xCE, 0x07, 0x3E, 0x15, 0x40, 0x3C, 0xCE, 0x8C, 0x63, 0x16, 0x72, 0xB5, 0x74, 0x84, 0xF4,
+    0xA1, 0xE7, 0x53, 0x85, 0xFB, 0x72, 0xDD, 0x75, 0x90, 0x39, 0xB2, 0x3D, 0xC3, 0x08, 0x2C, 0xD5,
+    0x01, 0x08, 0x27, 0x75, 0x86, 0xB9, 0xBB, 0x9B, 0xDF, 0xEA, 0x49, 0xDE, 0x46, 0xCB, 0x83, 0x45
+};
+#endif
+
+
+#ifndef XMRIG_NO_CN_GPU
+// "cn/gpu": only the first 32 bytes hold hash data; the remaining 128 bytes of the table are all zero
+const static uint8_t test_output_gpu[160] = {
+    0xE5, 0x5C, 0xB2, 0x3E, 0x51, 0x64, 0x9A, 0x59, 0xB1, 0x27, 0xB9, 0x6B, 0x51, 0x5F, 0x2B, 0xF7,
+    0xBF, 0xEA, 0x19, 0x97, 0x41, 0xA0, 0x21, 0x6C, 0xF8, 0x38, 0xDE, 0xD0, 0x6E, 0xFF, 0x82, 0xDF,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+#endif
+
+
+#endif /* XMRIG_CRYPTONIGHT_TEST_H */
--- a/src/crypto/cn/CryptoNight_x86.h
+++ b/src/crypto/cn/CryptoNight_x86.h
--- a/src/crypto/cn/SSE2NEON.h
+++ b/src/crypto/cn/SSE2NEON.h
--- a/src/crypto/cn/asm/CryptonightR_soft_aes_template.inc
+++ b/src/crypto/cn/asm/CryptonightR_soft_aes_template.inc
@@ -0,0 +1,281 @@
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1)
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2)
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3)
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end)
+
+ALIGN(64)
+FN_PREFIX(CryptonightR_soft_aes_template_part1):
+	mov	rcx, [rcx]	/* rcx = pointer stored at [rcx]; all state below is read relative to it */
+
+	mov	QWORD PTR [rsp+8], rcx	/* stash it above the return address; part2 reloads it from [rsp+304] = 8 + 8*8 + 232 */
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 232	/* frame: xmm saves at [rsp+16..143], four dwords at [rsp+144..159], spills at [rsp+160..191] */
+
+	mov	eax, [rcx+96]	/* copy the four dwords at [rcx+96..108] into [rsp+144..156] */
+	mov	ebx, [rcx+100]
+	mov	esi, [rcx+104]
+	mov	edx, [rcx+108]
+	mov [rsp+144], eax
+	mov [rsp+148], ebx
+	mov [rsp+152], esi
+	mov [rsp+156], edx
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r10, rcx	/* r10 keeps the state pointer; rcx is reused as scratch below */
+	xor	rax, QWORD PTR [rcx+16]	/* rax = [48] ^ [16] */
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]	/* r8 = [32] ^ [0] */
+	mov	r9, QWORD PTR [rcx+40]
+	xor	r9, QWORD PTR [rcx+8]	/* r9 = [40] ^ [8] */
+	movq	xmm4, rax
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]	/* rdx = [56] ^ [24] */
+	mov	r11, QWORD PTR [rcx+224]	/* r11 = base pointer for the [offset+r11] accesses in the loop */
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r10+72]	/* rcx = [88] ^ [72] */
+	mov	rax, QWORD PTR [r10+80]
+	movq	xmm0, rdx
+	xor	rax, QWORD PTR [r10+64]	/* rax = [80] ^ [64] */
+
+	movaps	XMMWORD PTR [rsp+16], xmm6	/* save xmm6-xmm13; part3 restores them from the same slots */
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+48], xmm8
+	movaps	XMMWORD PTR [rsp+64], xmm9
+	movaps	XMMWORD PTR [rsp+80], xmm10
+	movaps	XMMWORD PTR [rsp+96], xmm11
+	movaps	XMMWORD PTR [rsp+112], xmm12
+	movaps	XMMWORD PTR [rsp+128], xmm13
+
+	movq	xmm5, rax
+
+	mov	rax, r8
+	punpcklqdq xmm4, xmm0	/* xmm4 = ([56]^[24]) : ([48]^[16]) */
+	and	eax, 2097136	/* 2097136 = 0x1FFFF0: 16-byte-aligned offset below 2 MiB */
+	movq	xmm10, QWORD PTR [r10+96]
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r10+104]
+	xorps	xmm9, xmm9
+	mov	QWORD PTR [rsp+328], rax	/* current offset; [rsp+328] is entry rsp+32, above the return address */
+	movq	xmm12, r11	/* copy of the base pointer; the loop clobbers r11 and reloads it from xmm12 */
+	mov	QWORD PTR [rsp+320], r9	/* entry rsp+24 */
+	punpcklqdq xmm5, xmm0	/* xmm5 = ([88]^[72]) : ([80]^[64]) */
+	movq xmm13, rcx
+	mov r12d, 524288	/* iteration count (0x80000) */
+
+	ALIGN(64)
+FN_PREFIX(CryptonightR_soft_aes_template_mainloop):
+	movd xmm11, r12d	/* park the loop counter in xmm11 so r12 can hold the table pointer */
+	mov	r12, QWORD PTR [r10+272]	/* r12 = table pointer; used as four 256-entry dword tables (presumably AES T-tables -- confirm) */
+	lea	r13, QWORD PTR [rax+r11]	/* r13 = address of the current 16-byte block */
+	mov	esi, DWORD PTR [r13]	/* esi, r10d, r14d, ebp = the four dwords of that block */
+	movq	xmm0, r9
+	mov	r10d, DWORD PTR [r13+4]
+	movq	xmm7, r8
+	mov	ebp, DWORD PTR [r13+12]
+	mov	r14d, DWORD PTR [r13+8]
+	mov	rdx, QWORD PTR [rsp+328]	/* rdx = current offset */
+	movzx	ecx, sil	/* each movzx/shr pair peels one byte off a dword and uses it as a table index */
+	shr	esi, 8
+	punpcklqdq xmm7, xmm0	/* xmm7 = r9 : r8 */
+	mov	r15d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	mov	edi, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	ebx, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	shr	ebp, 8
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	xor	r15d, DWORD PTR [r12+rcx*4+1024]	/* second table at +1024 bytes */
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	eax, r14d
+	shr	eax, 8
+	xor	edi, DWORD PTR [r12+rcx*4+1024]
+	add	eax, 256	/* +256 entries selects the fourth table once r12 has been advanced by 2048 */
+	movzx	ecx, bpl
+	shr	ebp, 8
+	xor	ebx, DWORD PTR [r12+rcx*4+1024]
+	movzx	ecx, sil
+	shr	esi, 8
+	xor	r9d, DWORD PTR [r12+rcx*4+1024]
+	add	r12, 2048	/* r12 now points at the third table */
+	movzx	ecx, r10b
+	shr	r10d, 8
+	add	r10d, 256
+	mov	r11d, DWORD PTR [r12+rax*4]	/* r11 is used as scratch here; restored from xmm12 below */
+	xor	r11d, DWORD PTR [r12+rcx*4]
+	xor	r11d, r9d
+	movzx	ecx, sil
+	mov	r10d, DWORD PTR [r12+r10*4]
+	shr	esi, 8
+	add	esi, 256
+	xor	r10d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	xor	r10d, ebx
+	shr	ebp, 8
+	movd	xmm1, r11d
+	add	ebp, 256
+	movq	r11, xmm12	/* restore the base pointer */
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	xor	r9d, DWORD PTR [r12+rsi*4]
+	mov	eax, DWORD PTR [r12+rbp*4]
+	xor	r9d, edi
+	movzx	ecx, r14b
+	movd	xmm0, r10d
+	movd	xmm2, r9d
+	xor	eax, DWORD PTR [r12+rcx*4]
+	mov	rcx, rdx
+	xor	eax, r15d
+	punpckldq xmm2, xmm1
+	xor	rcx, 16	/* rcx, rax, rdx = offset ^ 16, ^ 32, ^ 48: the three neighbouring 16-byte blocks */
+	movd	xmm6, eax
+	mov	rax, rdx
+	punpckldq xmm6, xmm0
+	xor	rax, 32
+	punpckldq xmm6, xmm2	/* xmm6 = the four table-lookup dwords packed together */
+	xor	rdx, 48
+	movdqu	xmm2, XMMWORD PTR [rcx+r11]
+	pxor xmm6, xmm2
+	pxor	xmm6, xmm7	/* fold in xmm7 = r9 : r8 */
+	paddq	xmm2, xmm4
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	movdqu	xmm0, XMMWORD PTR [rdx+r11]
+	pxor xmm6, xmm1
+	pxor xmm6, xmm0
+	paddq	xmm0, xmm5
+	movdqu	XMMWORD PTR [rcx+r11], xmm0	/* neighbours are written back rotated: ^48 -> ^16, ^16 -> ^32, ^32 -> ^48 */
+	movdqu	XMMWORD PTR [rax+r11], xmm2
+	movq rcx, xmm13
+	paddq	xmm1, xmm7
+	movdqu	XMMWORD PTR [rdx+r11], xmm1
+	movq	rdi, xmm6
+	mov	r10, rdi
+	and	r10d, 2097136	/* next offset = low bits of the result, masked with 0x1FFFF0 */
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm4
+	movdqu	XMMWORD PTR [r13], xmm0	/* write result ^ xmm4 back to the current block */
+
+	mov ebx, [rsp+144]
+	mov ebp, [rsp+152]
+	add ebx, [rsp+148]
+	add ebp, [rsp+156]
+	shl rbp, 32
+	or rbx, rbp	/* rbx = (d0 + d1) | ((d2 + d3) << 32) from the four frame dwords */
+
+	xor rbx, QWORD PTR [r10+r11]
+	lea	r14, QWORD PTR [r10+r11]	/* r14 = address of the block at the next offset */
+	mov	rbp, QWORD PTR [r14+8]
+
+	mov [rsp+160], rbx	/* spill rbx, rdi, rbp, r10; part2 reloads them from these slots */
+	mov [rsp+168], rdi
+	mov [rsp+176], rbp
+	mov [rsp+184], r10
+	mov r10, rsp	/* keep the real stack pointer in r10 while esp is loaded with data below */
+
+	mov ebx, [rsp+144]	/* ebx, esi, edi, ebp = the four frame dwords */
+	mov esi, [rsp+148]
+	mov edi, [rsp+152]
+	mov ebp, [rsp+156]
+
+	movd esp, xmm7	/* esp, r15d, eax, edx, r9d = dwords taken from xmm7, xmm4 and xmm5 */
+	movaps xmm0, xmm7
+	psrldq xmm0, 8
+	movd r15d, xmm0
+	movd eax, xmm4
+	movd edx, xmm5
+	movaps xmm0, xmm5
+	psrldq xmm0, 8
+	movd r9d, xmm0	/* NOTE(review): presumably generated code is spliced in before part2 and consumes these registers -- confirm */
+
+FN_PREFIX(CryptonightR_soft_aes_template_part2):
+	mov rsp, r10	/* restore the real stack pointer that the main loop saved in r10 */
+	mov [rsp+144], ebx	/* store ebx, esi, edi, ebp back to the four frame dwords */
+	mov [rsp+148], esi
+	mov [rsp+152], edi
+	mov [rsp+156], ebp
+
+	mov edi, edi	/* zero-extend edi into rdi */
+	shl rbp, 32
+	or rbp, rdi
+	xor r8, rbp	/* r8 ^= (ebp << 32) | edi */
+
+	mov ebx, ebx	/* zero-extend ebx into rbx */
+	shl rsi, 32
+	or rsi, rbx
+	xor QWORD PTR [rsp+320], rsi	/* [rsp+320] ^= (esi << 32) | ebx */
+
+	mov rbx, [rsp+160]	/* reload the values spilled at the end of the main loop body */
+	mov rdi, [rsp+168]
+	mov rbp, [rsp+176]
+	mov r10, [rsp+184]
+
+	mov	r9, r10
+	xor	r9, 16	/* r9, rcx, r10 = offset ^ 16, ^ 32, ^ 48 */
+	mov	rcx, r10
+	xor	rcx, 32
+	xor	r10, 48
+	mov	rax, rbx
+	mul	rdi	/* rdx:rax = rbx * rdi (unsigned 64x64 -> 128) */
+	movdqu	xmm2, XMMWORD PTR [r9+r11]
+	movdqu	xmm1, XMMWORD PTR [rcx+r11]
+	pxor xmm6, xmm2
+	pxor xmm6, xmm1
+	paddq	xmm1, xmm7
+	add	r8, rdx	/* r8 += high half of the product */
+	movdqu	xmm0, XMMWORD PTR [r10+r11]
+	pxor xmm6, xmm0
+	paddq	xmm0, xmm5
+	paddq	xmm2, xmm4
+	movdqu	XMMWORD PTR [r9+r11], xmm0	/* neighbours written back rotated: ^48 -> ^16, ^16 -> ^32, ^32 -> ^48 */
+	movdqa	xmm5, xmm4	/* shift the pipeline: xmm5 = old xmm4, xmm4 = xmm6 */
+	mov	r9, QWORD PTR [rsp+320]
+	movdqa	xmm4, xmm6
+	add	r9, rax	/* r9 = [rsp+320] + low half of the product */
+	movdqu	XMMWORD PTR [rcx+r11], xmm2
+	movdqu	XMMWORD PTR [r10+r11], xmm1
+	mov	r10, QWORD PTR [rsp+304]	/* reload the state pointer stashed at entry [rsp+8] in part1 */
+	movd r12d, xmm11	/* recover the loop counter */
+	mov	QWORD PTR [r14], r8	/* store r8 : r9 to the 16-byte block at r14 */
+	xor	r8, rbx
+	mov	rax, r8
+	mov	QWORD PTR [r14+8], r9
+	and	eax, 2097136	/* offset for the next iteration (mask 0x1FFFF0) */
+	xor	r9, rbp
+	mov	QWORD PTR [rsp+320], r9
+	mov	QWORD PTR [rsp+328], rax
+	sub	r12d, 1
+	jne	FN_PREFIX(CryptonightR_soft_aes_template_mainloop)	/* loop until the counter reaches zero */
+
+FN_PREFIX(CryptonightR_soft_aes_template_part3):
+	movaps	xmm6, XMMWORD PTR [rsp+16]	/* restore xmm6-xmm13 from the slots written in part1 */
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+48]
+	movaps	xmm9, XMMWORD PTR [rsp+64]
+	movaps	xmm10, XMMWORD PTR [rsp+80]
+	movaps	xmm11, XMMWORD PTR [rsp+96]
+	movaps	xmm12, XMMWORD PTR [rsp+112]
+	movaps	xmm13, XMMWORD PTR [rsp+128]
+
+	add	rsp, 232	/* drop the local frame */
+	pop	r15	/* pops mirror the pushes in part1, in reverse order */
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	ret
+FN_PREFIX(CryptonightR_soft_aes_template_end):
--- a/src/crypto/cn/asm/CryptonightR_soft_aes_template_win.inc
+++ b/src/crypto/cn/asm/CryptonightR_soft_aes_template_win.inc
@@ -0,0 +1,281 @@
+PUBLIC CryptonightR_soft_aes_template_part1
+PUBLIC CryptonightR_soft_aes_template_mainloop
+PUBLIC CryptonightR_soft_aes_template_part2
+PUBLIC CryptonightR_soft_aes_template_part3
+PUBLIC CryptonightR_soft_aes_template_end
+
+ALIGN(64)
+CryptonightR_soft_aes_template_part1:
+	mov	rcx, [rcx]	; rcx = pointer stored at [rcx]; all state below is read relative to it
+
+	mov	QWORD PTR [rsp+8], rcx	; stash it above the return address; part2 reloads it from [rsp+304] = 8 + 8*8 + 232
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 232	; frame: xmm saves at [rsp+16..143], four dwords at [rsp+144..159], spills at [rsp+160..191]
+
+	mov	eax, [rcx+96]	; copy the four dwords at [rcx+96..108] into [rsp+144..156]
+	mov	ebx, [rcx+100]
+	mov	esi, [rcx+104]
+	mov	edx, [rcx+108]
+	mov [rsp+144], eax
+	mov [rsp+148], ebx
+	mov [rsp+152], esi
+	mov [rsp+156], edx
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r10, rcx	; r10 keeps the state pointer; rcx is reused as scratch below
+	xor	rax, QWORD PTR [rcx+16]	; rax = [48] ^ [16]
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]	; r8 = [32] ^ [0]
+	mov	r9, QWORD PTR [rcx+40]
+	xor	r9, QWORD PTR [rcx+8]	; r9 = [40] ^ [8]
+	movq	xmm4, rax
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]	; rdx = [56] ^ [24]
+	mov	r11, QWORD PTR [rcx+224]	; r11 = base pointer for the [offset+r11] accesses in the loop
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r10+72]	; rcx = [88] ^ [72]
+	mov	rax, QWORD PTR [r10+80]
+	movq	xmm0, rdx
+	xor	rax, QWORD PTR [r10+64]	; rax = [80] ^ [64]
+
+	movaps	XMMWORD PTR [rsp+16], xmm6	; save xmm6-xmm13; part3 restores them from the same slots
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+48], xmm8
+	movaps	XMMWORD PTR [rsp+64], xmm9
+	movaps	XMMWORD PTR [rsp+80], xmm10
+	movaps	XMMWORD PTR [rsp+96], xmm11
+	movaps	XMMWORD PTR [rsp+112], xmm12
+	movaps	XMMWORD PTR [rsp+128], xmm13
+
+	movq	xmm5, rax
+
+	mov	rax, r8
+	punpcklqdq xmm4, xmm0	; xmm4 = ([56]^[24]) : ([48]^[16])
+	and	eax, 2097136	; 2097136 = 0x1FFFF0: 16-byte-aligned offset below 2 MiB
+	movq	xmm10, QWORD PTR [r10+96]
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r10+104]
+	xorps	xmm9, xmm9
+	mov	QWORD PTR [rsp+328], rax	; current offset; [rsp+328] is entry rsp+32, above the return address
+	movq	xmm12, r11	; copy of the base pointer; the loop clobbers r11 and reloads it from xmm12
+	mov	QWORD PTR [rsp+320], r9	; entry rsp+24
+	punpcklqdq xmm5, xmm0	; xmm5 = ([88]^[72]) : ([80]^[64])
+	movq xmm13, rcx
+	mov r12d, 524288	; iteration count (0x80000)
+
+	ALIGN(64)
+CryptonightR_soft_aes_template_mainloop:
+	movd xmm11, r12d	; park the loop counter in xmm11 so r12 can hold the table pointer
+	mov	r12, QWORD PTR [r10+272]	; r12 = table pointer; used as four 256-entry dword tables (presumably AES T-tables -- confirm)
+	lea	r13, QWORD PTR [rax+r11]	; r13 = address of the current 16-byte block
+	mov	esi, DWORD PTR [r13]	; esi, r10d, r14d, ebp = the four dwords of that block
+	movq	xmm0, r9
+	mov	r10d, DWORD PTR [r13+4]
+	movq	xmm7, r8
+	mov	ebp, DWORD PTR [r13+12]
+	mov	r14d, DWORD PTR [r13+8]
+	mov	rdx, QWORD PTR [rsp+328]	; rdx = current offset
+	movzx	ecx, sil	; each movzx/shr pair peels one byte off a dword and uses it as a table index
+	shr	esi, 8
+	punpcklqdq xmm7, xmm0	; xmm7 = r9 : r8
+	mov	r15d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	mov	edi, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	ebx, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	shr	ebp, 8
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	xor	r15d, DWORD PTR [r12+rcx*4+1024]	; second table at +1024 bytes
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	eax, r14d
+	shr	eax, 8
+	xor	edi, DWORD PTR [r12+rcx*4+1024]
+	add	eax, 256	; +256 entries selects the fourth table once r12 has been advanced by 2048
+	movzx	ecx, bpl
+	shr	ebp, 8
+	xor	ebx, DWORD PTR [r12+rcx*4+1024]
+	movzx	ecx, sil
+	shr	esi, 8
+	xor	r9d, DWORD PTR [r12+rcx*4+1024]
+	add	r12, 2048	; r12 now points at the third table
+	movzx	ecx, r10b
+	shr	r10d, 8
+	add	r10d, 256
+	mov	r11d, DWORD PTR [r12+rax*4]	; r11 is used as scratch here; restored from xmm12 below
+	xor	r11d, DWORD PTR [r12+rcx*4]
+	xor	r11d, r9d
+	movzx	ecx, sil
+	mov	r10d, DWORD PTR [r12+r10*4]
+	shr	esi, 8
+	add	esi, 256
+	xor	r10d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	xor	r10d, ebx
+	shr	ebp, 8
+	movd	xmm1, r11d
+	add	ebp, 256
+	movq	r11, xmm12	; restore the base pointer
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	xor	r9d, DWORD PTR [r12+rsi*4]
+	mov	eax, DWORD PTR [r12+rbp*4]
+	xor	r9d, edi
+	movzx	ecx, r14b
+	movd	xmm0, r10d
+	movd	xmm2, r9d
+	xor	eax, DWORD PTR [r12+rcx*4]
+	mov	rcx, rdx
+	xor	eax, r15d
+	punpckldq xmm2, xmm1
+	xor	rcx, 16	; rcx, rax, rdx = offset ^ 16, ^ 32, ^ 48: the three neighbouring 16-byte blocks
+	movd	xmm6, eax
+	mov	rax, rdx
+	punpckldq xmm6, xmm0
+	xor	rax, 32
+	punpckldq xmm6, xmm2	; xmm6 = the four table-lookup dwords packed together
+	xor	rdx, 48
+	movdqu	xmm2, XMMWORD PTR [rcx+r11]
+	pxor xmm6, xmm2
+	pxor	xmm6, xmm7	; fold in xmm7 = r9 : r8
+	paddq	xmm2, xmm4
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	movdqu	xmm0, XMMWORD PTR [rdx+r11]
+	pxor xmm6, xmm1
+	pxor xmm6, xmm0
+	paddq	xmm0, xmm5
+	movdqu	XMMWORD PTR [rcx+r11], xmm0	; neighbours are written back rotated: ^48 -> ^16, ^16 -> ^32, ^32 -> ^48
+	movdqu	XMMWORD PTR [rax+r11], xmm2
+	movq rcx, xmm13
+	paddq	xmm1, xmm7
+	movdqu	XMMWORD PTR [rdx+r11], xmm1
+	movq	rdi, xmm6
+	mov	r10, rdi
+	and	r10d, 2097136	; next offset = low bits of the result, masked with 0x1FFFF0
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm4
+	movdqu	XMMWORD PTR [r13], xmm0	; write result ^ xmm4 back to the current block
+
+	mov ebx, [rsp+144]
+	mov ebp, [rsp+152]
+	add ebx, [rsp+148]
+	add ebp, [rsp+156]
+	shl rbp, 32
+	or rbx, rbp	; rbx = (d0 + d1) | ((d2 + d3) << 32) from the four frame dwords
+
+	xor rbx, QWORD PTR [r10+r11]
+	lea	r14, QWORD PTR [r10+r11]	; r14 = address of the block at the next offset
+	mov	rbp, QWORD PTR [r14+8]
+
+	mov [rsp+160], rbx	; spill rbx, rdi, rbp, r10; part2 reloads them from these slots
+	mov [rsp+168], rdi
+	mov [rsp+176], rbp
+	mov [rsp+184], r10
+	mov r10, rsp	; keep the real stack pointer in r10 while esp is loaded with data below
+
+	mov ebx, [rsp+144]	; ebx, esi, edi, ebp = the four frame dwords
+	mov esi, [rsp+148]
+	mov edi, [rsp+152]
+	mov ebp, [rsp+156]
+
+	movd esp, xmm7	; esp, r15d, eax, edx, r9d = dwords taken from xmm7, xmm4 and xmm5
+	movaps xmm0, xmm7
+	psrldq xmm0, 8
+	movd r15d, xmm0
+	movd eax, xmm4
+	movd edx, xmm5
+	movaps xmm0, xmm5
+	psrldq xmm0, 8
+	movd r9d, xmm0	; NOTE(review): presumably generated code is spliced in before part2 and consumes these registers -- confirm
+
+CryptonightR_soft_aes_template_part2:
+	mov rsp, r10	; restore the real stack pointer that the main loop saved in r10
+	mov [rsp+144], ebx	; store ebx, esi, edi, ebp back to the four frame dwords
+	mov [rsp+148], esi
+	mov [rsp+152], edi
+	mov [rsp+156], ebp
+
+	mov edi, edi	; zero-extend edi into rdi
+	shl rbp, 32
+	or rbp, rdi
+	xor r8, rbp	; r8 ^= (ebp << 32) | edi
+
+	mov ebx, ebx	; zero-extend ebx into rbx
+	shl rsi, 32
+	or rsi, rbx
+	xor QWORD PTR [rsp+320], rsi	; [rsp+320] ^= (esi << 32) | ebx
+
+	mov rbx, [rsp+160]	; reload the values spilled at the end of the main loop body
+	mov rdi, [rsp+168]
+	mov rbp, [rsp+176]
+	mov r10, [rsp+184]
+
+	mov	r9, r10
+	xor	r9, 16	; r9, rcx, r10 = offset ^ 16, ^ 32, ^ 48
+	mov	rcx, r10
+	xor	rcx, 32
+	xor	r10, 48
+	mov	rax, rbx
+	mul	rdi	; rdx:rax = rbx * rdi (unsigned 64x64 -> 128)
+	movdqu	xmm2, XMMWORD PTR [r9+r11]
+	movdqu	xmm1, XMMWORD PTR [rcx+r11]
+	pxor xmm6, xmm2
+	pxor xmm6, xmm1
+	paddq	xmm1, xmm7
+	add	r8, rdx	; r8 += high half of the product
+	movdqu	xmm0, XMMWORD PTR [r10+r11]
+	pxor xmm6, xmm0
+	paddq	xmm0, xmm5
+	paddq	xmm2, xmm4
+	movdqu	XMMWORD PTR [r9+r11], xmm0	; neighbours written back rotated: ^48 -> ^16, ^16 -> ^32, ^32 -> ^48
+	movdqa	xmm5, xmm4	; shift the pipeline: xmm5 = old xmm4, xmm4 = xmm6
+	mov	r9, QWORD PTR [rsp+320]
+	movdqa	xmm4, xmm6
+	add	r9, rax	; r9 = [rsp+320] + low half of the product
+	movdqu	XMMWORD PTR [rcx+r11], xmm2
+	movdqu	XMMWORD PTR [r10+r11], xmm1
+	mov	r10, QWORD PTR [rsp+304]	; reload the state pointer stashed at entry [rsp+8] in part1
+	movd r12d, xmm11	; recover the loop counter
+	mov	QWORD PTR [r14], r8	; store r8 : r9 to the 16-byte block at r14
+	xor	r8, rbx
+	mov	rax, r8
+	mov	QWORD PTR [r14+8], r9
+	and	eax, 2097136	; offset for the next iteration (mask 0x1FFFF0)
+	xor	r9, rbp
+	mov	QWORD PTR [rsp+320], r9
+	mov	QWORD PTR [rsp+328], rax
+	sub	r12d, 1
+	jne	CryptonightR_soft_aes_template_mainloop	; loop until the counter reaches zero
+
+CryptonightR_soft_aes_template_part3:
+	movaps	xmm6, XMMWORD PTR [rsp+16]	; restore xmm6-xmm13 from the slots written in part1
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+48]
+	movaps	xmm9, XMMWORD PTR [rsp+64]
+	movaps	xmm10, XMMWORD PTR [rsp+80]
+	movaps	xmm11, XMMWORD PTR [rsp+96]
+	movaps	xmm12, XMMWORD PTR [rsp+112]
+	movaps	xmm13, XMMWORD PTR [rsp+128]
+
+	add	rsp, 232	; drop the local frame
+	pop	r15	; pops mirror the pushes in part1, in reverse order
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	ret
+CryptonightR_soft_aes_template_end:
--- a/src/crypto/cn/asm/CryptonightR_template.S
+++ b/src/crypto/cn/asm/CryptonightR_template.S
--- a/src/crypto/cn/asm/CryptonightR_template.asm
+++ b/src/crypto/cn/asm/CryptonightR_template.asm
--- a/src/crypto/cn/asm/CryptonightR_template.h
+++ b/src/crypto/cn/asm/CryptonightR_template.h
--- a/src/crypto/cn/asm/CryptonightR_template.inc
+++ b/src/crypto/cn/asm/CryptonightR_template.inc
@@ -0,0 +1,536 @@
+PUBLIC FN_PREFIX(CryptonightR_template_part1)
+PUBLIC FN_PREFIX(CryptonightR_template_mainloop)
+PUBLIC FN_PREFIX(CryptonightR_template_part2)
+PUBLIC FN_PREFIX(CryptonightR_template_part3)
+PUBLIC FN_PREFIX(CryptonightR_template_end)
+PUBLIC FN_PREFIX(CryptonightR_template_double_part1)
+PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop)
+PUBLIC FN_PREFIX(CryptonightR_template_double_part2)
+PUBLIC FN_PREFIX(CryptonightR_template_double_part3)
+PUBLIC FN_PREFIX(CryptonightR_template_double_part4)
+PUBLIC FN_PREFIX(CryptonightR_template_double_end)
+
+ALIGN(64)
+FN_PREFIX(CryptonightR_template_part1):
+	mov	rcx, [rcx]	/* rcx = pointer stored at [rcx]; all state below is read relative to it */
+
+	mov	QWORD PTR [rsp+16], rbx	/* save rbx, rbp, rsi above the return address; part3 reloads them at [rsp+136..152] */
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	r10
+	push	r11
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	push	rdi
+	sub	rsp, 64	/* 64-byte frame holding the xmm6-xmm9 saves */
+	mov	r12, rcx
+	mov	r8, QWORD PTR [r12+32]
+	mov	rdx, r12	/* rdx keeps the state pointer; r12 is reused as scratch */
+	xor	r8, QWORD PTR [r12]	/* r8 = [32] ^ [0] */
+	mov	r15, QWORD PTR [r12+40]
+	mov	r9, r8
+	xor	r15, QWORD PTR [r12+8]	/* r15 = [40] ^ [8] */
+	mov	r11, QWORD PTR [r12+224]	/* r11 = base pointer for the [offset+r11] accesses in the loop */
+	mov	r12, QWORD PTR [r12+56]
+	xor	r12, QWORD PTR [rdx+24]	/* r12 = [56] ^ [24] */
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]	/* rax = [48] ^ [16] */
+	movaps	XMMWORD PTR [rsp+48], xmm6	/* save xmm6-xmm9; part3 restores them */
+	movq	xmm0, r12
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	movaps	XMMWORD PTR [rsp], xmm9
+	mov	r12, QWORD PTR [rdx+88]
+	xor	r12, QWORD PTR [rdx+72]	/* r12 = [88] ^ [72] */
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]	/* rax = [80] ^ [64] */
+	punpcklqdq xmm6, xmm0	/* xmm6 = ([56]^[24]) : ([48]^[16]) */
+	and	r9d, 2097136	/* 2097136 = 0x1FFFF0: 16-byte-aligned offset below 2 MiB */
+	movq	xmm0, r12
+	movq	xmm7, rax
+	punpcklqdq xmm7, xmm0	/* xmm7 = ([88]^[72]) : ([80]^[64]) */
+	mov r10d, r9d
+	movq	xmm9, rsp	/* park the real stack pointer in xmm9; rsp is used as a data register until part3 */
+	mov rsp, r8	/* rsp now carries r8 = [32] ^ [0] */
+	mov	r8d, 524288	/* iteration count (0x80000) */
+
+	mov	ebx, [rdx+96]	/* ebx, esi, edi, ebp = the four dwords at [rdx+96..108] */
+	mov	esi, [rdx+100]
+	mov	edi, [rdx+104]
+	mov	ebp, [rdx+108]
+
+	ALIGN(64)
+FN_PREFIX(CryptonightR_template_mainloop):
+	movdqa	xmm5, XMMWORD PTR [r9+r11]	/* load the current 16-byte block (aligned load) */
+	movq	xmm0, r15
+	movq	xmm4, rsp
+	punpcklqdq xmm4, xmm0	/* xmm4 = r15 : rsp (rsp holds data here, not a stack address) */
+	lea	rdx, QWORD PTR [r9+r11]	/* rdx = address of the current block, for the write-back below */
+
+	aesenc	xmm5, xmm4	/* one AES encryption round with xmm4 as the round key */
+
+	mov	r13d, r9d
+	mov	eax, r9d
+	xor	r9d, 48	/* r13, rax, r9 = offset ^ 16, ^ 32, ^ 48: the three neighbouring blocks */
+	xor	r13d, 16
+	xor	eax, 32
+	movdqu	xmm0, XMMWORD PTR [r9+r11]
+	movaps xmm3, xmm0
+	movdqu	xmm2, XMMWORD PTR [r13+r11]
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	pxor xmm0, xmm2
+	pxor xmm5, xmm1
+	pxor xmm5, xmm0	/* xmm5 ^= all three neighbouring blocks */
+
+	movq	r12, xmm5
+	movd	r10d, xmm5
+	and	r10d, 2097136	/* next offset = low dword of the result, masked with 0x1FFFF0 */
+
+	paddq	xmm3, xmm7
+	paddq	xmm2, xmm6
+	paddq	xmm1, xmm4
+	movdqu	XMMWORD PTR [r13+r11], xmm3	/* neighbours written back rotated: ^48 -> ^16, ^16 -> ^32, ^32 -> ^48 */
+	movdqu	XMMWORD PTR [rax+r11], xmm2
+	movdqu	XMMWORD PTR [r9+r11], xmm1
+
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [rdx], xmm0	/* write result ^ xmm6 back to the current block */
+
+	lea	r13d, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	r13, rdx	/* r13 = (ebx + esi) | ((edi + ebp) << 32) */
+
+	movd eax, xmm6	/* eax = dword 0 of xmm6, edx = dword 0 of xmm7, r9d = dword 2 of xmm7 */
+	movd edx, xmm7
+	pextrd r9d, xmm7, 2
+
+	xor	r13, QWORD PTR [r10+r11]
+	mov	r14, QWORD PTR [r10+r11+8]	/* NOTE(review): presumably generated code is spliced in before part2 -- confirm */
+
+FN_PREFIX(CryptonightR_template_part2):
+	lea	rcx, [r10+r11]	/* rcx = address of the block at the new offset */
+
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor rsp, rax	/* rsp ^= (ebp << 32) | edi  (rsp still holds data) */
+
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r15, rax	/* r15 ^= (esi << 32) | ebx */
+
+	mov	rax, r13
+	mul	r12	/* rdx:rax = r13 * r12 (unsigned 64x64 -> 128) */
+	add	r15, rax	/* r15 += low half */
+	add	rsp, rdx	/* rsp += high half */
+
+	mov	r9d, r10d
+	mov	r12d, r10d
+	xor	r9d, 16	/* r9, r12, r10 = offset ^ 16, ^ 32, ^ 48 */
+	xor	r12d, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [r12+r11]
+	movaps xmm3, xmm1
+	movdqa	xmm2, XMMWORD PTR [r9+r11]
+	movdqa	xmm0, XMMWORD PTR [r10+r11]
+	pxor xmm1, xmm2
+	pxor xmm5, xmm0
+	pxor xmm5, xmm1	/* xmm5 ^= all three neighbouring blocks */
+	paddq	xmm3, xmm4
+	paddq	xmm2, xmm6
+	paddq	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9+r11], xmm0	/* neighbours written back rotated: ^48 -> ^16, ^16 -> ^32, ^32 -> ^48 */
+	movdqu	XMMWORD PTR [r12+r11], xmm2
+	movdqu	XMMWORD PTR [r10+r11], xmm3
+
+	movdqa	xmm7, xmm6	/* shift the pipeline: xmm7 = old xmm6; xmm6 = xmm5 below */
+	mov	QWORD PTR [rcx], rsp	/* store rsp : r15 to the 16-byte block at rcx */
+	xor	rsp, r13
+	mov	r9d, esp
+	mov	QWORD PTR [rcx+8], r15
+	and	r9d, 2097136	/* offset for the next iteration (mask 0x1FFFF0) */
+	xor	r15, r14
+	movdqa	xmm6, xmm5
+	dec	r8d
+	jnz	FN_PREFIX(CryptonightR_template_mainloop)	/* loop until the counter reaches zero */
+
+FN_PREFIX(CryptonightR_template_part3):
+	movq	rsp, xmm9	/* restore the real stack pointer parked in xmm9 by part1 */
+
+	mov	rbx, QWORD PTR [rsp+136]	/* 136 = 16 + 7*8 + 64: the entry [rsp+16..32] slots written in part1 */
+	mov	rbp, QWORD PTR [rsp+144]
+	mov	rsi, QWORD PTR [rsp+152]
+	movaps	xmm6, XMMWORD PTR [rsp+48]	/* restore xmm6-xmm9 (xmm9 last, it held rsp until now) */
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+16]
+	movaps	xmm9, XMMWORD PTR [rsp]
+	add	rsp, 64	/* drop the local frame */
+	pop	rdi	/* pops mirror the pushes in part1, in reverse order */
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	r11
+	pop	r10
+	ret	0
+FN_PREFIX(CryptonightR_template_end):
+
+ALIGN(64)
+FN_PREFIX(CryptonightR_template_double_part1):
+	mov	rdx, [rcx+8]	/* rdx = second pointer of the pair stored at [rcx] */
+	mov	rcx, [rcx]	/* rcx = first pointer */
+
+	mov	QWORD PTR [rsp+24], rbx	/* save rbx above the return address; part4 reloads it at [rsp+400] = 24 + 7*8 + 320 */
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 320	/* frame: dword copies at [rsp..31], spills at [rsp+104..143], xmm saves at [rsp+160..319] */
+	mov	r14, QWORD PTR [rcx+32]
+	mov	r8, rcx	/* r8 keeps the first state pointer; rcx is reused as scratch */
+	xor	r14, QWORD PTR [rcx]	/* first state: r14 = [32] ^ [0] */
+	mov	r12, QWORD PTR [rcx+40]
+	mov	ebx, r14d
+	mov	rsi, QWORD PTR [rcx+224]	/* rsi = base pointer of the first state */
+	and	ebx, 2097136	/* first state's offset, masked with 0x1FFFF0 */
+	xor	r12, QWORD PTR [rcx+8]	/* first state: r12 = [40] ^ [8] */
+	mov	rcx, QWORD PTR [rcx+56]
+	xor	rcx, QWORD PTR [r8+24]
+	mov	rax, QWORD PTR [r8+48]
+	xor	rax, QWORD PTR [r8+16]
+	mov	r15, QWORD PTR [rdx+32]
+	xor	r15, QWORD PTR [rdx]	/* second state: r15 = [32] ^ [0] */
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r8+88]
+	xor	rcx, QWORD PTR [r8+72]
+	mov	r13, QWORD PTR [rdx+40]
+	mov	rdi, QWORD PTR [rdx+224]	/* rdi = base pointer of the second state */
+	xor	r13, QWORD PTR [rdx+8]	/* second state: r13 = [40] ^ [8] */
+	movaps	XMMWORD PTR [rsp+160], xmm6	/* save xmm6-xmm15; part4 restores them */
+	movaps	XMMWORD PTR [rsp+176], xmm7
+	movaps	XMMWORD PTR [rsp+192], xmm8
+	movaps	XMMWORD PTR [rsp+208], xmm9
+	movaps	XMMWORD PTR [rsp+224], xmm10
+	movaps	XMMWORD PTR [rsp+240], xmm11
+	movaps	XMMWORD PTR [rsp+256], xmm12
+	movaps	XMMWORD PTR [rsp+272], xmm13
+	movaps	XMMWORD PTR [rsp+288], xmm14
+	movaps	XMMWORD PTR [rsp+304], xmm15
+	movq	xmm7, rax
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+
+	movaps xmm1, XMMWORD PTR [rdx+96]	/* copy 16 bytes at +96 of each state: second -> [rsp], first -> [rsp+16] */
+	movaps xmm2, XMMWORD PTR [r8+96]
+	movaps XMMWORD PTR [rsp], xmm1
+	movaps XMMWORD PTR [rsp+16], xmm2
+
+	mov	r8d, r15d
+	punpcklqdq xmm7, xmm0	/* first state: xmm7 = ([56]^[24]) : ([48]^[16]) */
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+56]
+	xor	rcx, QWORD PTR [rdx+24]
+	movq	xmm9, rax
+	mov	QWORD PTR [rsp+128], rsi
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	punpcklqdq xmm9, xmm0	/* first state: xmm9 = ([88]^[72]) : ([80]^[64]) */
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+88]
+	xor	rcx, QWORD PTR [rdx+72]
+	movq	xmm8, rax
+	mov	QWORD PTR [rsp+136], rdi
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm8, xmm0	/* second state: xmm8 = ([56]^[24]) : ([48]^[16]) */
+	and	r8d, 2097136	/* second state's offset, masked with 0x1FFFF0 */
+	movq	xmm0, rcx
+	mov	r11d, 524288	/* iteration count (0x80000) */
+	movq	xmm10, rax
+	punpcklqdq xmm10, xmm0	/* second state: xmm10 = ([88]^[72]) : ([80]^[64]) */
+	
+	movq xmm14, QWORD PTR [rsp+128]	/* xmm14 / xmm15 = base pointers of the first / second state */
+	movq xmm15, QWORD PTR [rsp+136]
+
+	ALIGN(64)
+FN_PREFIX(CryptonightR_template_double_mainloop):
+	movdqu	xmm6, XMMWORD PTR [rbx+rsi]	/* first state: load the current 16-byte block */
+	movq	xmm0, r12
+	mov	ecx, ebx	/* ecx remembers the current offset for the write-back below */
+	movq	xmm3, r14
+	punpcklqdq xmm3, xmm0	/* xmm3 = r12 : r14 */
+	xor	ebx, 16	/* ebx walks the neighbours: ^16, then ^48 (total ^32... see xors below) */
+	aesenc	xmm6, xmm3	/* one AES encryption round with xmm3 as the round key */
+	movq	xmm4, r15
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm0
+	xor	ebx, 48
+	paddq	xmm0, xmm7
+	movdqu	xmm1, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm1
+	movdqu	XMMWORD PTR [rbx+rsi], xmm0
+	paddq	xmm1, xmm3
+	xor	ebx, 16
+	mov	eax, ebx
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm0
+	movq	rdx, xmm6	/* rdx = low qword of the first state's result */
+	movdqu	XMMWORD PTR [rbx+rsi], xmm1
+	paddq	xmm0, xmm9
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [rcx+rsi], xmm0	/* write result ^ xmm7 back to the current block */
+	mov	esi, edx
+	movdqu	xmm5, XMMWORD PTR [r8+rdi]	/* second state: load the current 16-byte block */
+	and	esi, 2097136	/* first state's next offset (mask 0x1FFFF0); rsi no longer holds the base */
+	mov	ecx, r8d
+	movq	xmm0, r13
+	punpcklqdq xmm4, xmm0	/* xmm4 = r13 : r15 */
+	xor	r8d, 16
+	aesenc	xmm5, xmm4	/* one AES encryption round with xmm4 as the round key */
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm0
+	xor	r8d, 48
+	paddq	xmm0, xmm8
+	movdqu	xmm1, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm1
+	movdqu	XMMWORD PTR [r8+rdi], xmm0
+	paddq	xmm1, xmm4
+	xor	r8d, 16
+	mov	eax, r8d
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm0
+	movdqu	XMMWORD PTR [r8+rdi], xmm1
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rdi], xmm0
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm8
+	movdqu	XMMWORD PTR [rcx+rdi], xmm0	/* write result ^ xmm8 back to the current block */
+	movq	rdi, xmm5	/* rdi = low qword of the second state's result */
+	movq	rcx, xmm14	/* rcx = first state's base pointer */
+	mov	ebp, edi
+	mov	r8, QWORD PTR [rcx+rsi]
+	mov	r10, QWORD PTR [rcx+rsi+8]
+	lea	r9, QWORD PTR [rcx+rsi]	/* r9 = address of the first state's block at its next offset */
+	xor	esi, 16
+
+	movq xmm0, rsp	/* park the real stack pointer and live registers in xmm0-xmm2, xmm11-xmm13 */
+	movq xmm1, rsi
+	movq xmm2, rdi
+	movq xmm11, rbp
+	movq xmm12, r15
+	movq xmm13, rdx
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	mov ebx, DWORD PTR [rsp+16]	/* ebx, esi, edi, ebp = the four dwords copied from the first state */
+	mov esi, DWORD PTR [rsp+20]
+	mov edi, DWORD PTR [rsp+24]
+	mov ebp, DWORD PTR [rsp+28]
+
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+	xor r8, rax	/* r8 ^= (ebx + esi) | ((edi + ebp) << 32) */
+
+	movd esp, xmm3	/* esp, r15d, eax, edx, r9d = dwords taken from xmm3, xmm7 and xmm9 */
+	pextrd r15d, xmm3, 2
+	movd eax, xmm7
+	movd edx, xmm9
+	pextrd r9d, xmm9, 2	/* NOTE(review): presumably generated code is spliced in before double_part2 -- confirm */
+
+FN_PREFIX(CryptonightR_template_double_part2):
+
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor r14, rax	/* r14 ^= (ebp << 32) | edi */
+
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r12, rax	/* r12 ^= (esi << 32) | ebx */
+
+	movq rsp, xmm0	/* restore the real stack pointer parked in xmm0 */
+	mov DWORD PTR [rsp+16], ebx	/* store the four dwords back to the first state's frame copy */
+	mov DWORD PTR [rsp+20], esi
+	mov DWORD PTR [rsp+24], edi
+	mov DWORD PTR [rsp+28], ebp
+
+	movq rsi, xmm1	/* reload the registers parked at the end of the main loop body */
+	movq rdi, xmm2
+	movq rbp, xmm11
+	movq r15, xmm12
+	movq rdx, xmm13
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rbx, r8
+	mov	rax, r8
+	mul	rdx	/* rdx:rax = r8 * rdx (unsigned 64x64 -> 128) */
+	and	ebp, 2097136	/* second state's next offset (mask 0x1FFFF0) */
+	mov	r8, rax
+	movdqu	xmm1, XMMWORD PTR [rcx+rsi]
+	pxor	xmm6, xmm1
+	xor	esi, 48
+	paddq	xmm1, xmm7
+	movdqu	xmm2, XMMWORD PTR [rsi+rcx]
+	pxor	xmm6, xmm2
+	paddq	xmm2, xmm3
+	movdqu	XMMWORD PTR [rsi+rcx], xmm1
+	xor	esi, 16
+	mov	eax, esi
+	mov	rsi, rcx	/* rsi = first state's base pointer again, as the main loop expects */
+	movdqu	xmm0, XMMWORD PTR [rax+rcx]
+	pxor	xmm6, xmm0
+	movdqu	XMMWORD PTR [rax+rcx], xmm2
+	paddq	xmm0, xmm9
+	add	r12, r8	/* r12 += low half of the product */
+	xor	rax, 32
+	add	r14, rdx	/* r14 += high half of the product */
+	movdqa	xmm9, xmm7	/* shift the pipeline: xmm9 = old xmm7, xmm7 = xmm6 */
+	movdqa	xmm7, xmm6
+	movdqu	XMMWORD PTR [rax+rcx], xmm0
+	mov	QWORD PTR [r9+8], r12	/* store r14 : r12 to the first state's block at r9 */
+	xor	r12, r10
+	mov	QWORD PTR [r9], r14
+	movq rcx, xmm15	/* rcx = second state's base pointer */
+	xor	r14, rbx
+	mov	r10d, ebp
+	mov	ebx, r14d
+	xor	ebp, 16
+	and	ebx, 2097136	/* first state's offset for the next iteration */
+	mov	r8, QWORD PTR [r10+rcx]
+	mov	r9, QWORD PTR [r10+rcx+8]
+
+	movq xmm0, rsp	/* park the real stack pointer and live registers again for the second state */
+	movq xmm1, rbx
+	movq xmm2, rsi
+	movq xmm11, rdi
+	movq xmm12, rbp
+	movq xmm13, r15
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	mov ebx, DWORD PTR [rsp]	/* ebx, esi, edi, ebp = the four dwords copied from the second state */
+	mov esi, DWORD PTR [rsp+4]
+	mov edi, DWORD PTR [rsp+8]
+	mov ebp, DWORD PTR [rsp+12]
+
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+
+	xor r8, rax	/* r8 ^= (ebx + esi) | ((edi + ebp) << 32) */
+	movq xmm3, r8	/* keep that value in xmm3; part3 reads it back into rax */
+
+	movd esp, xmm4	/* esp, r15d, eax, edx, r9d = dwords taken from xmm4, xmm8 and xmm10 */
+	pextrd r15d, xmm4, 2
+	movd eax, xmm8
+	movd edx, xmm10
+	pextrd r9d, xmm10, 2	/* NOTE(review): presumably generated code is spliced in before double_part3 -- confirm */
+
+FN_PREFIX(CryptonightR_template_double_part3):
+
+	movq r15, xmm13	/* reload r15, which was parked in xmm13 and overwritten with data */
+
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor r15, rax	/* r15 ^= (ebp << 32) | edi */
+
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r13, rax	/* r13 ^= (esi << 32) | ebx */
+
+	movq rsp, xmm0	/* restore the real stack pointer parked in xmm0 */
+	mov DWORD PTR [rsp], ebx	/* store the four dwords back to the second state's frame copy */
+	mov DWORD PTR [rsp+4], esi
+	mov DWORD PTR [rsp+8], edi
+	mov DWORD PTR [rsp+12], ebp
+
+	movq rbx, xmm1	/* reload the registers parked in double_part2 */
+	movq rsi, xmm2
+	movq rdi, xmm11
+	movq rbp, xmm12
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rax, r8
+	mul	rdi	/* rdx:rax = r8 * rdi (unsigned 64x64 -> 128) */
+	mov	rdi, rcx	/* rdi = second state's base pointer again, as the main loop expects */
+	mov	r8, rax
+	movdqu	xmm1, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm1
+	xor	ebp, 48
+	paddq	xmm1, xmm8
+	add	r13, r8	/* r13 += low half of the product */
+	movdqu	xmm2, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm2
+	add	r15, rdx	/* r15 += high half of the product */
+	movdqu	XMMWORD PTR [rbp+rcx], xmm1
+	paddq	xmm2, xmm4
+	xor	ebp, 16
+	mov	eax, ebp
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm0
+	movdqu	XMMWORD PTR [rbp+rcx], xmm2
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rcx], xmm0
+	movq rax, xmm3	/* rax = the value saved in xmm3 by double_part2 */
+	movdqa	xmm10, xmm8	/* shift the pipeline: xmm10 = old xmm8, xmm8 = xmm5 */
+	mov	QWORD PTR [r10+rcx], r15	/* store r15 : r13 to the second state's block at offset r10 */
+	movdqa	xmm8, xmm5
+	xor	r15, rax
+	mov	QWORD PTR [r10+rcx+8], r13
+	mov	r8d, r15d
+	xor	r13, r9
+	and	r8d, 2097136	/* second state's offset for the next iteration (mask 0x1FFFF0) */
+	dec r11d
+	jnz	FN_PREFIX(CryptonightR_template_double_mainloop)	/* loop until the counter reaches zero */
+
+FN_PREFIX(CryptonightR_template_double_part4):
+
+	mov	rbx, QWORD PTR [rsp+400]	/* 400 = 24 + 7*8 + 320: the entry [rsp+24] slot written in double_part1 */
+	movaps	xmm6, XMMWORD PTR [rsp+160]	/* restore xmm6-xmm15 from the slots written in double_part1 */
+	movaps	xmm7, XMMWORD PTR [rsp+176]
+	movaps	xmm8, XMMWORD PTR [rsp+192]
+	movaps	xmm9, XMMWORD PTR [rsp+208]
+	movaps	xmm10, XMMWORD PTR [rsp+224]
+	movaps	xmm11, XMMWORD PTR [rsp+240]
+	movaps	xmm12, XMMWORD PTR [rsp+256]
+	movaps	xmm13, XMMWORD PTR [rsp+272]
+	movaps	xmm14, XMMWORD PTR [rsp+288]
+	movaps	xmm15, XMMWORD PTR [rsp+304]
+	add	rsp, 320	/* drop the local frame */
+	pop	r15	/* pops mirror the pushes in double_part1, in reverse order */
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	ret	0
+FN_PREFIX(CryptonightR_template_double_end):
--- a/src/crypto/cn/asm/CryptonightR_template_win.inc
+++ b/src/crypto/cn/asm/CryptonightR_template_win.inc
@@ -0,0 +1,536 @@
+PUBLIC CryptonightR_template_part1
+PUBLIC CryptonightR_template_mainloop
+PUBLIC CryptonightR_template_part2
+PUBLIC CryptonightR_template_part3
+PUBLIC CryptonightR_template_end
+PUBLIC CryptonightR_template_double_part1
+PUBLIC CryptonightR_template_double_mainloop
+PUBLIC CryptonightR_template_double_part2
+PUBLIC CryptonightR_template_double_part3
+PUBLIC CryptonightR_template_double_part4
+PUBLIC CryptonightR_template_double_end
+
+ALIGN(64)
+CryptonightR_template_part1:
+	mov	rcx, [rcx]	; rcx = pointer stored at [rcx]; all state below is read relative to it
+
+	mov	QWORD PTR [rsp+16], rbx	; save rbx, rbp, rsi above the return address; part3 reloads them at [rsp+136..152]
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	r10
+	push	r11
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	push	rdi
+	sub	rsp, 64	; 64-byte frame holding the xmm6-xmm9 saves
+	mov	r12, rcx
+	mov	r8, QWORD PTR [r12+32]
+	mov	rdx, r12	; rdx keeps the state pointer; r12 is reused as scratch
+	xor	r8, QWORD PTR [r12]	; r8 = [32] ^ [0]
+	mov	r15, QWORD PTR [r12+40]
+	mov	r9, r8
+	xor	r15, QWORD PTR [r12+8]	; r15 = [40] ^ [8]
+	mov	r11, QWORD PTR [r12+224]	; r11 = base pointer for the [offset+r11] accesses in the loop
+	mov	r12, QWORD PTR [r12+56]
+	xor	r12, QWORD PTR [rdx+24]	; r12 = [56] ^ [24]
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]	; rax = [48] ^ [16]
+	movaps	XMMWORD PTR [rsp+48], xmm6	; save xmm6-xmm9; part3 restores them
+	movq	xmm0, r12
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	movaps	XMMWORD PTR [rsp], xmm9
+	mov	r12, QWORD PTR [rdx+88]
+	xor	r12, QWORD PTR [rdx+72]	; r12 = [88] ^ [72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]	; rax = [80] ^ [64]
+	punpcklqdq xmm6, xmm0	; xmm6 = ([56]^[24]) : ([48]^[16])
+	and	r9d, 2097136	; 2097136 = 0x1FFFF0: 16-byte-aligned offset below 2 MiB
+	movq	xmm0, r12
+	movq	xmm7, rax
+	punpcklqdq xmm7, xmm0	; xmm7 = ([88]^[72]) : ([80]^[64])
+	mov r10d, r9d
+	movq	xmm9, rsp	; park the real stack pointer in xmm9; rsp is used as a data register until part3
+	mov rsp, r8	; rsp now carries r8 = [32] ^ [0]
+	mov	r8d, 524288	; iteration count (0x80000)
+
+	mov	ebx, [rdx+96]	; ebx, esi, edi, ebp = the four dwords at [rdx+96..108]
+	mov	esi, [rdx+100]
+	mov	edi, [rdx+104]
+	mov	ebp, [rdx+108]
+
+	ALIGN(64)
+CryptonightR_template_mainloop:
+	movdqa	xmm5, XMMWORD PTR [r9+r11]	; load the current 16-byte block (aligned load)
+	movq	xmm0, r15
+	movq	xmm4, rsp
+	punpcklqdq xmm4, xmm0	; xmm4 = r15 : rsp (rsp holds data here, not a stack address)
+	lea	rdx, QWORD PTR [r9+r11]	; rdx = address of the current block, for the write-back below
+
+	aesenc	xmm5, xmm4	; one AES encryption round with xmm4 as the round key
+
+	mov	r13d, r9d
+	mov	eax, r9d
+	xor	r9d, 48	; r13, rax, r9 = offset ^ 16, ^ 32, ^ 48: the three neighbouring blocks
+	xor	r13d, 16
+	xor	eax, 32
+	movdqu	xmm0, XMMWORD PTR [r9+r11]
+	movaps xmm3, xmm0
+	movdqu	xmm2, XMMWORD PTR [r13+r11]
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	pxor xmm0, xmm2
+	pxor xmm5, xmm1
+	pxor xmm5, xmm0	; xmm5 ^= all three neighbouring blocks
+
+	movq	r12, xmm5
+	movd	r10d, xmm5
+	and	r10d, 2097136	; next offset = low dword of the result, masked with 0x1FFFF0
+
+	paddq	xmm3, xmm7
+	paddq	xmm2, xmm6
+	paddq	xmm1, xmm4
+	movdqu	XMMWORD PTR [r13+r11], xmm3	; neighbours written back rotated: ^48 -> ^16, ^16 -> ^32, ^32 -> ^48
+	movdqu	XMMWORD PTR [rax+r11], xmm2
+	movdqu	XMMWORD PTR [r9+r11], xmm1
+
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [rdx], xmm0	; write result ^ xmm6 back to the current block
+
+	lea	r13d, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	r13, rdx	; r13 = (ebx + esi) | ((edi + ebp) << 32)
+
+	movd eax, xmm6	; eax = dword 0 of xmm6, edx = dword 0 of xmm7, r9d = dword 2 of xmm7
+	movd edx, xmm7
+	pextrd r9d, xmm7, 2
+
+	xor	r13, QWORD PTR [r10+r11]
+	mov	r14, QWORD PTR [r10+r11+8]	; NOTE(review): presumably generated code is spliced in before part2 -- confirm
+
+CryptonightR_template_part2:
+	lea	rcx, [r10+r11]	; rcx = address of the block at the new offset
+
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor rsp, rax	; rsp ^= (ebp << 32) | edi  (rsp still holds data)
+
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r15, rax	; r15 ^= (esi << 32) | ebx
+
+	mov	rax, r13
+	mul	r12	; rdx:rax = r13 * r12 (unsigned 64x64 -> 128)
+	add	r15, rax	; r15 += low half
+	add	rsp, rdx	; rsp += high half
+
+	mov	r9d, r10d
+	mov	r12d, r10d
+	xor	r9d, 16	; r9, r12, r10 = offset ^ 16, ^ 32, ^ 48
+	xor	r12d, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [r12+r11]
+	movaps xmm3, xmm1
+	movdqa	xmm2, XMMWORD PTR [r9+r11]
+	movdqa	xmm0, XMMWORD PTR [r10+r11]
+	pxor xmm1, xmm2
+	pxor xmm5, xmm0
+	pxor xmm5, xmm1	; xmm5 ^= all three neighbouring blocks
+	paddq	xmm3, xmm4
+	paddq	xmm2, xmm6
+	paddq	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9+r11], xmm0	; neighbours written back rotated: ^48 -> ^16, ^16 -> ^32, ^32 -> ^48
+	movdqu	XMMWORD PTR [r12+r11], xmm2
+	movdqu	XMMWORD PTR [r10+r11], xmm3
+
+	movdqa	xmm7, xmm6	; shift the pipeline: xmm7 = old xmm6; xmm6 = xmm5 below
+	mov	QWORD PTR [rcx], rsp	; store rsp : r15 to the 16-byte block at rcx
+	xor	rsp, r13
+	mov	r9d, esp
+	mov	QWORD PTR [rcx+8], r15
+	and	r9d, 2097136	; offset for the next iteration (mask 0x1FFFF0)
+	xor	r15, r14
+	movdqa	xmm6, xmm5
+	dec	r8d
+	jnz	CryptonightR_template_mainloop	; loop until the counter reaches zero
+
+CryptonightR_template_part3:
+	movq	rsp, xmm9	; restore the real stack pointer parked in xmm9 by part1
+
+	mov	rbx, QWORD PTR [rsp+136]	; 136 = 16 + 7*8 + 64: the entry [rsp+16..32] slots written in part1
+	mov	rbp, QWORD PTR [rsp+144]
+	mov	rsi, QWORD PTR [rsp+152]
+	movaps	xmm6, XMMWORD PTR [rsp+48]	; restore xmm6-xmm9 (xmm9 last, it held rsp until now)
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+16]
+	movaps	xmm9, XMMWORD PTR [rsp]
+	add	rsp, 64	; drop the local frame
+	pop	rdi	; pops mirror the pushes in part1, in reverse order
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	r11
+	pop	r10
+	ret	0
+CryptonightR_template_end:
+
+ALIGN(64)
+CryptonightR_template_double_part1:
+	mov	rdx, [rcx+8]	; rdx = second pointer of the pair stored at [rcx]
+	mov	rcx, [rcx]	; rcx = first pointer
+
+	mov	QWORD PTR [rsp+24], rbx	; save rbx above the return address; part4 reloads it at [rsp+400] = 24 + 7*8 + 320
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 320	; frame: dword copies at [rsp..31], spills at [rsp+104..143], xmm saves at [rsp+160..319]
+	mov	r14, QWORD PTR [rcx+32]
+	mov	r8, rcx	; r8 keeps the first state pointer; rcx is reused as scratch
+	xor	r14, QWORD PTR [rcx]	; first state: r14 = [32] ^ [0]
+	mov	r12, QWORD PTR [rcx+40]
+	mov	ebx, r14d
+	mov	rsi, QWORD PTR [rcx+224]	; rsi = base pointer of the first state
+	and	ebx, 2097136	; first state's offset, masked with 0x1FFFF0
+	xor	r12, QWORD PTR [rcx+8]	; first state: r12 = [40] ^ [8]
+	mov	rcx, QWORD PTR [rcx+56]
+	xor	rcx, QWORD PTR [r8+24]
+	mov	rax, QWORD PTR [r8+48]
+	xor	rax, QWORD PTR [r8+16]
+	mov	r15, QWORD PTR [rdx+32]
+	xor	r15, QWORD PTR [rdx]	; second state: r15 = [32] ^ [0]
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r8+88]
+	xor	rcx, QWORD PTR [r8+72]
+	mov	r13, QWORD PTR [rdx+40]
+	mov	rdi, QWORD PTR [rdx+224]	; rdi = base pointer of the second state
+	xor	r13, QWORD PTR [rdx+8]	; second state: r13 = [40] ^ [8]
+	movaps	XMMWORD PTR [rsp+160], xmm6	; save xmm6-xmm15; part4 restores them
+	movaps	XMMWORD PTR [rsp+176], xmm7
+	movaps	XMMWORD PTR [rsp+192], xmm8
+	movaps	XMMWORD PTR [rsp+208], xmm9
+	movaps	XMMWORD PTR [rsp+224], xmm10
+	movaps	XMMWORD PTR [rsp+240], xmm11
+	movaps	XMMWORD PTR [rsp+256], xmm12
+	movaps	XMMWORD PTR [rsp+272], xmm13
+	movaps	XMMWORD PTR [rsp+288], xmm14
+	movaps	XMMWORD PTR [rsp+304], xmm15
+	movq	xmm7, rax
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+
+	movaps xmm1, XMMWORD PTR [rdx+96]	; copy 16 bytes at +96 of each state: second -> [rsp], first -> [rsp+16]
+	movaps xmm2, XMMWORD PTR [r8+96]
+	movaps XMMWORD PTR [rsp], xmm1
+	movaps XMMWORD PTR [rsp+16], xmm2
+
+	mov	r8d, r15d
+	punpcklqdq xmm7, xmm0	; first state: xmm7 = ([56]^[24]) : ([48]^[16])
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+56]
+	xor	rcx, QWORD PTR [rdx+24]
+	movq	xmm9, rax
+	mov	QWORD PTR [rsp+128], rsi
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	punpcklqdq xmm9, xmm0	; first state: xmm9 = ([88]^[72]) : ([80]^[64])
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+88]
+	xor	rcx, QWORD PTR [rdx+72]
+	movq	xmm8, rax
+	mov	QWORD PTR [rsp+136], rdi
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm8, xmm0	; second state: xmm8 = ([56]^[24]) : ([48]^[16])
+	and	r8d, 2097136	; second state's offset, masked with 0x1FFFF0
+	movq	xmm0, rcx
+	mov	r11d, 524288	; iteration count (0x80000)
+	movq	xmm10, rax
+	punpcklqdq xmm10, xmm0	; second state: xmm10 = ([88]^[72]) : ([80]^[64])
+	
+	movq xmm14, QWORD PTR [rsp+128]	; xmm14 / xmm15 = base pointers of the first / second state
+	movq xmm15, QWORD PTR [rsp+136]
+
+	ALIGN(64)
+CryptonightR_template_double_mainloop:
+	movdqu	xmm6, XMMWORD PTR [rbx+rsi]	; first state: load the current 16-byte block
+	movq	xmm0, r12
+	mov	ecx, ebx	; ecx remembers the current offset for the write-back below
+	movq	xmm3, r14
+	punpcklqdq xmm3, xmm0	; xmm3 = r12 : r14
+	xor	ebx, 16	; ebx steps through the neighbouring blocks via successive xors
+	aesenc	xmm6, xmm3	; one AES encryption round with xmm3 as the round key
+	movq	xmm4, r15
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm0
+	xor	ebx, 48
+	paddq	xmm0, xmm7
+	movdqu	xmm1, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm1
+	movdqu	XMMWORD PTR [rbx+rsi], xmm0
+	paddq	xmm1, xmm3
+	xor	ebx, 16
+	mov	eax, ebx
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm0
+	movq	rdx, xmm6	; rdx = low qword of the first state's result
+	movdqu	XMMWORD PTR [rbx+rsi], xmm1
+	paddq	xmm0, xmm9
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [rcx+rsi], xmm0	; write result ^ xmm7 back to the current block
+	mov	esi, edx
+	movdqu	xmm5, XMMWORD PTR [r8+rdi]	; second state: load the current 16-byte block
+	and	esi, 2097136	; first state's next offset (mask 0x1FFFF0); rsi no longer holds the base
+	mov	ecx, r8d
+	movq	xmm0, r13
+	punpcklqdq xmm4, xmm0	; xmm4 = r13 : r15
+	xor	r8d, 16
+	aesenc	xmm5, xmm4	; one AES encryption round with xmm4 as the round key
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm0
+	xor	r8d, 48
+	paddq	xmm0, xmm8
+	movdqu	xmm1, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm1
+	movdqu	XMMWORD PTR [r8+rdi], xmm0
+	paddq	xmm1, xmm4
+	xor	r8d, 16
+	mov	eax, r8d
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm0
+	movdqu	XMMWORD PTR [r8+rdi], xmm1
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rdi], xmm0
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm8
+	movdqu	XMMWORD PTR [rcx+rdi], xmm0	; write result ^ xmm8 back to the current block
+	movq	rdi, xmm5	; rdi = low qword of the second state's result
+	movq	rcx, xmm14	; rcx = first state's base pointer
+	mov	ebp, edi
+	mov	r8, QWORD PTR [rcx+rsi]
+	mov	r10, QWORD PTR [rcx+rsi+8]
+	lea	r9, QWORD PTR [rcx+rsi]	; r9 = address of the first state's block at its next offset
+	xor	esi, 16
+
+	movq xmm0, rsp	; park the real stack pointer and live registers in xmm0-xmm2, xmm11-xmm13
+	movq xmm1, rsi
+	movq xmm2, rdi
+	movq xmm11, rbp
+	movq xmm12, r15
+	movq xmm13, rdx
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	mov ebx, DWORD PTR [rsp+16]	; ebx, esi, edi, ebp = the four dwords copied from the first state
+	mov esi, DWORD PTR [rsp+20]
+	mov edi, DWORD PTR [rsp+24]
+	mov ebp, DWORD PTR [rsp+28]
+
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+	xor r8, rax	; r8 ^= (ebx + esi) | ((edi + ebp) << 32)
+
+	movd esp, xmm3	; esp, r15d, eax, edx, r9d = dwords taken from xmm3, xmm7 and xmm9
+	pextrd r15d, xmm3, 2
+	movd eax, xmm7
+	movd edx, xmm9
+	pextrd r9d, xmm9, 2	; NOTE(review): presumably generated code is spliced in before double_part2 -- confirm
+
+CryptonightR_template_double_part2:
+
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor r14, rax	; r14 ^= (ebp << 32) | edi
+
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r12, rax	; r12 ^= (esi << 32) | ebx
+
+	movq rsp, xmm0	; restore the real stack pointer parked in xmm0
+	mov DWORD PTR [rsp+16], ebx	; store the four dwords back to the first state's frame copy
+	mov DWORD PTR [rsp+20], esi
+	mov DWORD PTR [rsp+24], edi
+	mov DWORD PTR [rsp+28], ebp
+
+	movq rsi, xmm1	; reload the registers parked at the end of the main loop body
+	movq rdi, xmm2
+	movq rbp, xmm11
+	movq r15, xmm12
+	movq rdx, xmm13
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rbx, r8
+	mov	rax, r8
+	mul	rdx	; rdx:rax = r8 * rdx (unsigned 64x64 -> 128)
+	and	ebp, 2097136	; second state's next offset (mask 0x1FFFF0)
+	mov	r8, rax
+	movdqu	xmm1, XMMWORD PTR [rcx+rsi]
+	pxor	xmm6, xmm1
+	xor	esi, 48
+	paddq	xmm1, xmm7
+	movdqu	xmm2, XMMWORD PTR [rsi+rcx]
+	pxor	xmm6, xmm2
+	paddq	xmm2, xmm3
+	movdqu	XMMWORD PTR [rsi+rcx], xmm1
+	xor	esi, 16
+	mov	eax, esi
+	mov	rsi, rcx	; rsi = first state's base pointer again, as the main loop expects
+	movdqu	xmm0, XMMWORD PTR [rax+rcx]
+	pxor	xmm6, xmm0
+	movdqu	XMMWORD PTR [rax+rcx], xmm2
+	paddq	xmm0, xmm9
+	add	r12, r8	; r12 += low half of the product
+	xor	rax, 32
+	add	r14, rdx	; r14 += high half of the product
+	movdqa	xmm9, xmm7	; shift the pipeline: xmm9 = old xmm7, xmm7 = xmm6
+	movdqa	xmm7, xmm6
+	movdqu	XMMWORD PTR [rax+rcx], xmm0
+	mov	QWORD PTR [r9+8], r12	; store r14 : r12 to the first state's block at r9
+	xor	r12, r10
+	mov	QWORD PTR [r9], r14
+	movq rcx, xmm15	; rcx = second state's base pointer
+	xor	r14, rbx
+	mov	r10d, ebp
+	mov	ebx, r14d
+	xor	ebp, 16
+	and	ebx, 2097136	; first state's offset for the next iteration
+	mov	r8, QWORD PTR [r10+rcx]
+	mov	r9, QWORD PTR [r10+rcx+8]
+
+	movq xmm0, rsp	; park the real stack pointer and live registers again for the second state
+	movq xmm1, rbx
+	movq xmm2, rsi
+	movq xmm11, rdi
+	movq xmm12, rbp
+	movq xmm13, r15
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	mov ebx, DWORD PTR [rsp]	; ebx, esi, edi, ebp = the four dwords copied from the second state
+	mov esi, DWORD PTR [rsp+4]
+	mov edi, DWORD PTR [rsp+8]
+	mov ebp, DWORD PTR [rsp+12]
+
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+
+	xor r8, rax	; r8 ^= (ebx + esi) | ((edi + ebp) << 32)
+	movq xmm3, r8	; keep that value in xmm3; part3 reads it back into rax
+
+	movd esp, xmm4	; esp, r15d, eax, edx, r9d = dwords taken from xmm4, xmm8 and xmm10
+	pextrd r15d, xmm4, 2
+	movd eax, xmm8
+	movd edx, xmm10
+	pextrd r9d, xmm10, 2	; NOTE(review): presumably generated code is spliced in before double_part3 -- confirm
+
+CryptonightR_template_double_part3:
+
+	movq r15, xmm13	; reload r15, which was parked in xmm13 and overwritten with data
+
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor r15, rax	; r15 ^= (ebp << 32) | edi
+
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r13, rax	; r13 ^= (esi << 32) | ebx
+
+	movq rsp, xmm0	; restore the real stack pointer parked in xmm0
+	mov DWORD PTR [rsp], ebx	; store the four dwords back to the second state's frame copy
+	mov DWORD PTR [rsp+4], esi
+	mov DWORD PTR [rsp+8], edi
+	mov DWORD PTR [rsp+12], ebp
+
+	movq rbx, xmm1	; reload the registers parked in double_part2
+	movq rsi, xmm2
+	movq rdi, xmm11
+	movq rbp, xmm12
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rax, r8
+	mul	rdi	; rdx:rax = r8 * rdi (unsigned 64x64 -> 128)
+	mov	rdi, rcx	; rdi = second state's base pointer again, as the main loop expects
+	mov	r8, rax
+	movdqu	xmm1, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm1
+	xor	ebp, 48
+	paddq	xmm1, xmm8
+	add	r13, r8	; r13 += low half of the product
+	movdqu	xmm2, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm2
+	add	r15, rdx	; r15 += high half of the product
+	movdqu	XMMWORD PTR [rbp+rcx], xmm1
+	paddq	xmm2, xmm4
+	xor	ebp, 16
+	mov	eax, ebp
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm0
+	movdqu	XMMWORD PTR [rbp+rcx], xmm2
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rcx], xmm0
+	movq rax, xmm3	; rax = the value saved in xmm3 by double_part2
+	movdqa	xmm10, xmm8	; shift the pipeline: xmm10 = old xmm8, xmm8 = xmm5
+	mov	QWORD PTR [r10+rcx], r15	; store r15 : r13 to the second state's block at offset r10
+	movdqa	xmm8, xmm5
+	xor	r15, rax
+	mov	QWORD PTR [r10+rcx+8], r13
+	mov	r8d, r15d
+	xor	r13, r9
+	and	r8d, 2097136	; second state's offset for the next iteration (mask 0x1FFFF0)
+	dec r11d
+	jnz	CryptonightR_template_double_mainloop	; loop until the counter reaches zero
+
+CryptonightR_template_double_part4:
+
+	mov	rbx, QWORD PTR [rsp+400]	; 400 = 24 + 7*8 + 320: the entry [rsp+24] slot written in double_part1
+	movaps	xmm6, XMMWORD PTR [rsp+160]	; restore xmm6-xmm15 from the slots written in double_part1
+	movaps	xmm7, XMMWORD PTR [rsp+176]
+	movaps	xmm8, XMMWORD PTR [rsp+192]
+	movaps	xmm9, XMMWORD PTR [rsp+208]
+	movaps	xmm10, XMMWORD PTR [rsp+224]
+	movaps	xmm11, XMMWORD PTR [rsp+240]
+	movaps	xmm12, XMMWORD PTR [rsp+256]
+	movaps	xmm13, XMMWORD PTR [rsp+272]
+	movaps	xmm14, XMMWORD PTR [rsp+288]
+	movaps	xmm15, XMMWORD PTR [rsp+304]
+	add	rsp, 320	; drop the local frame
+	pop	r15	; pops mirror the pushes in double_part1, in reverse order
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	ret	0
+CryptonightR_template_double_end:
--- a/src/crypto/cn/asm/CryptonightWOW_soft_aes_template.inc
+++ b/src/crypto/cn/asm/CryptonightWOW_soft_aes_template.inc
@@ -0,0 +1,268 @@
+PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part1)
+PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop)
+PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part2)
+PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part3)
+PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_end)
+
+ALIGN(64)
+FN_PREFIX(CryptonightWOW_soft_aes_template_part1):	/* Prologue: save registers, load the initial loop state from the context, set the iteration count. */
+	mov	rcx, [rcx]	/* rcx = first qword of the block rcx pointed to; used below as the context base */
+
+	mov	QWORD PTR [rsp+8], rcx	/* kept above the return address; read back as [rsp+304] in part2 (8 + 8 pushes * 8 + 232) */
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 232	/* local frame; the return address is now at [rsp+296] */
+
+	mov	eax, [rcx+96]	/* copy the four dwords at ctx+96..+108 into the frame at [rsp+144..+156] */
+	mov	ebx, [rcx+100]
+	mov	esi, [rcx+104]
+	mov	edx, [rcx+108]
+	mov [rsp+144], eax
+	mov [rsp+148], ebx
+	mov [rsp+152], esi
+	mov [rsp+156], edx
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r10, rcx	/* r10 keeps the context base; rcx is reused as scratch below */
+	xor	rax, QWORD PTR [rcx+16]
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]	/* r8 = [ctx+32] xor [ctx] */
+	mov	r9, QWORD PTR [rcx+40]
+	xor	r9, QWORD PTR [rcx+8]	/* r9 = [ctx+40] xor [ctx+8] */
+	movq	xmm4, rax
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	r11, QWORD PTR [rcx+224]	/* r11 = pointer stored at ctx+224; base of every [..+r11] access in the loop */
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r10+72]
+	mov	rax, QWORD PTR [r10+80]
+	movq	xmm0, rdx
+	xor	rax, QWORD PTR [r10+64]
+
+	movaps	XMMWORD PTR [rsp+16], xmm6	/* save xmm6-xmm13; part3 restores them from the same slots */
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+48], xmm8
+	movaps	XMMWORD PTR [rsp+64], xmm9
+	movaps	XMMWORD PTR [rsp+80], xmm10
+	movaps	XMMWORD PTR [rsp+96], xmm11
+	movaps	XMMWORD PTR [rsp+112], xmm12
+	movaps	XMMWORD PTR [rsp+128], xmm13
+
+	movq	xmm5, rax
+
+	mov	rax, r8
+	punpcklqdq xmm4, xmm0	/* xmm4: low qword = [ctx+48] xor [ctx+16], high qword = [ctx+56] xor [ctx+24] */
+	and	eax, 2097136	/* 2097136 = 0x1FFFF0: keep bits 4..20 of r8 as a 16-byte-aligned offset */
+	movq	xmm10, QWORD PTR [r10+96]
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r10+104]
+	xorps	xmm9, xmm9
+	mov	QWORD PTR [rsp+328], rax	/* spill the offset; [rsp+320] and [rsp+328] lie above the return address */
+	movq	xmm12, r11	/* park the base pointer in xmm12; the loop reloads r11 from it */
+	mov	QWORD PTR [rsp+320], r9
+	punpcklqdq xmm5, xmm0	/* xmm5: low qword = [ctx+80] xor [ctx+64], high qword = [ctx+88] xor [ctx+72] */
+	movq xmm13, rcx
+	mov r12d, 524288	/* loop counter: 524288 (0x80000) iterations */
+
+	ALIGN(64)
+FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop):	/* Loop head: table-lookup transform of the 16 bytes at [rax+r11], rotation of the three sibling blocks, then operand setup for part2. */
+	movd xmm11, r12d	/* park the loop counter in xmm11; r12 becomes the table pointer */
+	mov	r12, QWORD PTR [r10+272]	/* r12 = pointer stored at ctx+272 (lookup tables; NOTE(review): presumably AES T-tables, confirm) */
+	lea	r13, QWORD PTR [rax+r11]	/* r13 = address of the current 16-byte block */
+	mov	esi, DWORD PTR [r13]	/* the four dwords of the block go to esi, r10d, r14d, ebp (r10 no longer holds ctx) */
+	movq	xmm0, r9
+	mov	r10d, DWORD PTR [r13+4]
+	movq	xmm7, r8
+	mov	ebp, DWORD PTR [r13+12]
+	mov	r14d, DWORD PTR [r13+8]
+	mov	rdx, QWORD PTR [rsp+328]	/* rdx = current block offset (spilled by part1 / part2) */
+	movzx	ecx, sil
+	shr	esi, 8
+	punpcklqdq xmm7, xmm0	/* xmm7: low qword = r8, high qword = r9 */
+	mov	r15d, DWORD PTR [r12+rcx*4]	/* byte 0 of each dword indexes the dword table at r12+0 */
+	movzx	ecx, r10b
+	shr	r10d, 8
+	mov	edi, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	ebx, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	shr	ebp, 8
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	xor	r15d, DWORD PTR [r12+rcx*4+1024]	/* byte 1 indexes the table at r12+1024 */
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	eax, r14d
+	shr	eax, 8
+	xor	edi, DWORD PTR [r12+rcx*4+1024]
+	add	eax, 256	/* +256 entries: selects the table 1024 bytes further on */
+	movzx	ecx, bpl
+	shr	ebp, 8
+	xor	ebx, DWORD PTR [r12+rcx*4+1024]
+	movzx	ecx, sil
+	shr	esi, 8
+	xor	r9d, DWORD PTR [r12+rcx*4+1024]
+	add	r12, 2048	/* r12 now points at the third table; byte 3 uses index+256 (fourth table) */
+	movzx	ecx, r10b
+	shr	r10d, 8
+	add	r10d, 256
+	mov	r11d, DWORD PTR [r12+rax*4]	/* r11 is used as scratch here and reloaded from xmm12 below */
+	xor	r11d, DWORD PTR [r12+rcx*4]
+	xor	r11d, r9d
+	movzx	ecx, sil
+	mov	r10d, DWORD PTR [r12+r10*4]
+	shr	esi, 8
+	add	esi, 256
+	xor	r10d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	xor	r10d, ebx
+	shr	ebp, 8
+	movd	xmm1, r11d
+	add	ebp, 256
+	movq	r11, xmm12	/* restore the base pointer parked in xmm12 by part1 */
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	xor	r9d, DWORD PTR [r12+rsi*4]
+	mov	eax, DWORD PTR [r12+rbp*4]
+	xor	r9d, edi
+	movzx	ecx, r14b
+	movd	xmm0, r10d
+	movd	xmm2, r9d
+	xor	eax, DWORD PTR [r12+rcx*4]
+	mov	rcx, rdx
+	xor	eax, r15d
+	punpckldq xmm2, xmm1
+	xor	rcx, 16	/* rcx, rax, rdx = offset xor 16, 32, 48: the other three blocks of the same 64-byte group */
+	movd	xmm6, eax
+	mov	rax, rdx
+	punpckldq xmm6, xmm0
+	xor	rax, 32
+	punpckldq xmm6, xmm2	/* xmm6 = the four 32-bit lookup results: eax, r9d, r10d, r11d (low to high) */
+	xor	rdx, 48
+	movdqu	xmm2, XMMWORD PTR [rcx+r11]
+	pxor	xmm6, xmm7	/* xor with (r9:r8) */
+	paddq	xmm2, xmm4
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	movdqu	xmm0, XMMWORD PTR [rdx+r11]
+	paddq	xmm0, xmm5
+	movdqu	XMMWORD PTR [rcx+r11], xmm0	/* [off^16] = old [off^48] + xmm5 */
+	movdqu	XMMWORD PTR [rax+r11], xmm2	/* [off^32] = old [off^16] + xmm4 */
+	movq rcx, xmm13
+	paddq	xmm1, xmm7
+	movdqu	XMMWORD PTR [rdx+r11], xmm1	/* [off^48] = old [off^32] + xmm7 */
+	movq	rdi, xmm6
+	mov	r10, rdi
+	and	r10d, 2097136	/* r10 = next offset: low qword of xmm6 masked with 0x1FFFF0 */
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm4
+	movdqu	XMMWORD PTR [r13], xmm0	/* current block = xmm6 xor xmm4 */
+
+	mov ebx, [rsp+144]
+	mov ebp, [rsp+152]
+	add ebx, [rsp+148]
+	add ebp, [rsp+156]
+	shl rbp, 32
+	or rbx, rbp	/* rbx = (d0 + d1) | ((d2 + d3) << 32), d0..d3 = dwords at [rsp+144..+156] */
+
+	xor rbx, QWORD PTR [r10+r11]
+	lea	r14, QWORD PTR [r10+r11]	/* r14 = address of the second block; part2 writes its result there */
+	mov	rbp, QWORD PTR [r14+8]
+
+	mov [rsp+160], rbx	/* spill rbx, rdi, rbp, r10; part2 reloads them from these slots */
+	mov [rsp+168], rdi
+	mov [rsp+176], rbp
+	mov [rsp+184], r10
+	mov r10, rsp	/* keep the stack pointer in r10 because esp is overwritten below */
+
+	mov ebx, [rsp+144]	/* ebx, esi, edi, ebp = d0..d3 */
+	mov esi, [rsp+148]
+	mov edi, [rsp+152]
+	mov ebp, [rsp+156]
+
+	movd esp, xmm7	/* esp, r15d, eax, edx = dword 0 of xmm7, dword 2 of xmm7, dword 0 of xmm4, dword 0 of xmm5 */
+	movaps xmm0, xmm7
+	psrldq xmm0, 8
+	movd r15d, xmm0
+	movd eax, xmm4
+	movd edx, xmm5	/* NOTE(review): presumably inputs for code spliced in before part2 - confirm against the user of these PUBLIC labels */
+
+FN_PREFIX(CryptonightWOW_soft_aes_template_part2):	/* Loop tail: restore rsp, persist ebx/esi/edi/ebp, 64x64 multiply, rotate three blocks, write back, compute next offset, loop. */
+	mov rsp, r10	/* restore the stack pointer saved in r10 by the mainloop section */
+	mov [rsp+144], ebx	/* store ebx/esi/edi/ebp back to the dwords at [rsp+144..+156] */
+	mov [rsp+148], esi
+	mov [rsp+152], edi
+	mov [rsp+156], ebp
+
+	mov rbx, [rsp+160]	/* reload the values spilled at the end of the mainloop section */
+	mov rdi, [rsp+168]
+	mov rbp, [rsp+176]
+	mov r10, [rsp+184]
+
+	mov	r9, r10
+	xor	r9, 16	/* r9, rcx, r10 = offset xor 16, 32, 48 */
+	mov	rcx, r10
+	xor	rcx, 32
+	xor	r10, 48
+	mov	rax, rbx
+	mul	rdi	/* rdx:rax = rbx * rdi (unsigned 128-bit product) */
+	movdqu	xmm2, XMMWORD PTR [r9+r11]
+	movdqu	xmm1, XMMWORD PTR [rcx+r11]
+	paddq	xmm1, xmm7
+	movq	xmm0, rax
+	movq	xmm3, rdx
+	xor	rax, QWORD PTR [r11+rcx+8]	/* product halves are xored with the block at off^32 (after being copied to xmm0/xmm3) */
+	xor	rdx, QWORD PTR [rcx+r11]
+	punpcklqdq xmm3, xmm0	/* xmm3: low qword = product high half, high qword = product low half */
+	add	r8, rdx
+	movdqu	xmm0, XMMWORD PTR [r10+r11]
+	pxor	xmm2, xmm3
+	paddq	xmm0, xmm5
+	paddq	xmm2, xmm4
+	movdqu	XMMWORD PTR [r9+r11], xmm0	/* [off^16] = old [off^48] + xmm5 */
+	movdqa	xmm5, xmm4	/* shift history: xmm5 <- xmm4 <- xmm6 */
+	mov	r9, QWORD PTR [rsp+320]
+	movdqa	xmm4, xmm6
+	add	r9, rax
+	movdqu	XMMWORD PTR [rcx+r11], xmm2	/* [off^32] = (old [off^16] xor xmm3) + old xmm4 */
+	movdqu	XMMWORD PTR [r10+r11], xmm1	/* [off^48] = old [off^32] + xmm7 */
+	mov	r10, QWORD PTR [rsp+304]	/* reload the context pointer stored by part1 at entry ([rsp+8] before the pushes) */
+	movd r12d, xmm11	/* restore the loop counter parked in xmm11 */
+	mov	QWORD PTR [r14], r8	/* write (r9:r8) to the block at r14 */
+	xor	r8, rbx
+	mov	rax, r8
+	mov	QWORD PTR [r14+8], r9
+	and	eax, 2097136	/* next offset = r8 masked with 0x1FFFF0 */
+	xor	r9, rbp
+	mov	QWORD PTR [rsp+320], r9	/* spill r9 and the offset for the next iteration */
+	mov	QWORD PTR [rsp+328], rax
+	sub	r12d, 1
+	jne	FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop)
+
+FN_PREFIX(CryptonightWOW_soft_aes_template_part3):	/* Epilogue: undo part1 - restore xmm6-xmm13, release the frame, pop the saved registers, return. */
+	movaps	xmm6, XMMWORD PTR [rsp+16]	/* same slots part1 saved them to */
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+48]
+	movaps	xmm9, XMMWORD PTR [rsp+64]
+	movaps	xmm10, XMMWORD PTR [rsp+80]
+	movaps	xmm11, XMMWORD PTR [rsp+96]
+	movaps	xmm12, XMMWORD PTR [rsp+112]
+	movaps	xmm13, XMMWORD PTR [rsp+128]
+
+	add	rsp, 232	/* matches "sub rsp, 232" in part1 */
+	pop	r15	/* pops mirror the push order of part1 */
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	ret
+FN_PREFIX(CryptonightWOW_soft_aes_template_end):	/* end marker label; no code follows it in this file */
--- a/src/crypto/cn/asm/CryptonightWOW_soft_aes_template_win.inc
+++ b/src/crypto/cn/asm/CryptonightWOW_soft_aes_template_win.inc
@@ -0,0 +1,268 @@
+PUBLIC CryptonightWOW_soft_aes_template_part1
+PUBLIC CryptonightWOW_soft_aes_template_mainloop
+PUBLIC CryptonightWOW_soft_aes_template_part2
+PUBLIC CryptonightWOW_soft_aes_template_part3
+PUBLIC CryptonightWOW_soft_aes_template_end
+
+ALIGN(64)
+CryptonightWOW_soft_aes_template_part1:	; Prologue: save registers, load the initial loop state from the context, set the iteration count.
+	mov	rcx, [rcx]	; rcx = first qword of the block rcx pointed to; used below as the context base
+
+	mov	QWORD PTR [rsp+8], rcx	; kept above the return address; read back as [rsp+304] in part2 (8 + 8 pushes * 8 + 232)
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 232	; local frame; the return address is now at [rsp+296]
+
+	mov	eax, [rcx+96]	; copy the four dwords at ctx+96..+108 into the frame at [rsp+144..+156]
+	mov	ebx, [rcx+100]
+	mov	esi, [rcx+104]
+	mov	edx, [rcx+108]
+	mov [rsp+144], eax
+	mov [rsp+148], ebx
+	mov [rsp+152], esi
+	mov [rsp+156], edx
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r10, rcx	; r10 keeps the context base; rcx is reused as scratch below
+	xor	rax, QWORD PTR [rcx+16]
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]	; r8 = [ctx+32] xor [ctx]
+	mov	r9, QWORD PTR [rcx+40]
+	xor	r9, QWORD PTR [rcx+8]	; r9 = [ctx+40] xor [ctx+8]
+	movq	xmm4, rax
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	r11, QWORD PTR [rcx+224]	; r11 = pointer stored at ctx+224; base of every [..+r11] access in the loop
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r10+72]
+	mov	rax, QWORD PTR [r10+80]
+	movq	xmm0, rdx
+	xor	rax, QWORD PTR [r10+64]
+
+	movaps	XMMWORD PTR [rsp+16], xmm6	; save xmm6-xmm13; part3 restores them from the same slots
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+48], xmm8
+	movaps	XMMWORD PTR [rsp+64], xmm9
+	movaps	XMMWORD PTR [rsp+80], xmm10
+	movaps	XMMWORD PTR [rsp+96], xmm11
+	movaps	XMMWORD PTR [rsp+112], xmm12
+	movaps	XMMWORD PTR [rsp+128], xmm13
+
+	movq	xmm5, rax
+
+	mov	rax, r8
+	punpcklqdq xmm4, xmm0	; xmm4: low qword = [ctx+48] xor [ctx+16], high qword = [ctx+56] xor [ctx+24]
+	and	eax, 2097136	; 2097136 = 0x1FFFF0: keep bits 4..20 of r8 as a 16-byte-aligned offset
+	movq	xmm10, QWORD PTR [r10+96]
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r10+104]
+	xorps	xmm9, xmm9
+	mov	QWORD PTR [rsp+328], rax	; spill the offset; [rsp+320] and [rsp+328] lie above the return address
+	movq	xmm12, r11	; park the base pointer in xmm12; the loop reloads r11 from it
+	mov	QWORD PTR [rsp+320], r9
+	punpcklqdq xmm5, xmm0	; xmm5: low qword = [ctx+80] xor [ctx+64], high qword = [ctx+88] xor [ctx+72]
+	movq xmm13, rcx
+	mov r12d, 524288	; loop counter: 524288 (0x80000) iterations
+
+	ALIGN(64)
+CryptonightWOW_soft_aes_template_mainloop:	; Loop head: table-lookup transform of the 16 bytes at [rax+r11], rotation of the three sibling blocks, then operand setup for part2.
+	movd xmm11, r12d	; park the loop counter in xmm11; r12 becomes the table pointer
+	mov	r12, QWORD PTR [r10+272]	; r12 = pointer stored at ctx+272 (lookup tables; NOTE(review): presumably AES T-tables, confirm)
+	lea	r13, QWORD PTR [rax+r11]	; r13 = address of the current 16-byte block
+	mov	esi, DWORD PTR [r13]	; the four dwords of the block go to esi, r10d, r14d, ebp (r10 no longer holds ctx)
+	movq	xmm0, r9
+	mov	r10d, DWORD PTR [r13+4]
+	movq	xmm7, r8
+	mov	ebp, DWORD PTR [r13+12]
+	mov	r14d, DWORD PTR [r13+8]
+	mov	rdx, QWORD PTR [rsp+328]	; rdx = current block offset (spilled by part1 / part2)
+	movzx	ecx, sil
+	shr	esi, 8
+	punpcklqdq xmm7, xmm0	; xmm7: low qword = r8, high qword = r9
+	mov	r15d, DWORD PTR [r12+rcx*4]	; byte 0 of each dword indexes the dword table at r12+0
+	movzx	ecx, r10b
+	shr	r10d, 8
+	mov	edi, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	ebx, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	shr	ebp, 8
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	xor	r15d, DWORD PTR [r12+rcx*4+1024]	; byte 1 indexes the table at r12+1024
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	eax, r14d
+	shr	eax, 8
+	xor	edi, DWORD PTR [r12+rcx*4+1024]
+	add	eax, 256	; +256 entries: selects the table 1024 bytes further on
+	movzx	ecx, bpl
+	shr	ebp, 8
+	xor	ebx, DWORD PTR [r12+rcx*4+1024]
+	movzx	ecx, sil
+	shr	esi, 8
+	xor	r9d, DWORD PTR [r12+rcx*4+1024]
+	add	r12, 2048	; r12 now points at the third table; byte 3 uses index+256 (fourth table)
+	movzx	ecx, r10b
+	shr	r10d, 8
+	add	r10d, 256
+	mov	r11d, DWORD PTR [r12+rax*4]	; r11 is used as scratch here and reloaded from xmm12 below
+	xor	r11d, DWORD PTR [r12+rcx*4]
+	xor	r11d, r9d
+	movzx	ecx, sil
+	mov	r10d, DWORD PTR [r12+r10*4]
+	shr	esi, 8
+	add	esi, 256
+	xor	r10d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	xor	r10d, ebx
+	shr	ebp, 8
+	movd	xmm1, r11d
+	add	ebp, 256
+	movq	r11, xmm12	; restore the base pointer parked in xmm12 by part1
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	xor	r9d, DWORD PTR [r12+rsi*4]
+	mov	eax, DWORD PTR [r12+rbp*4]
+	xor	r9d, edi
+	movzx	ecx, r14b
+	movd	xmm0, r10d
+	movd	xmm2, r9d
+	xor	eax, DWORD PTR [r12+rcx*4]
+	mov	rcx, rdx
+	xor	eax, r15d
+	punpckldq xmm2, xmm1
+	xor	rcx, 16	; rcx, rax, rdx = offset xor 16, 32, 48: the other three blocks of the same 64-byte group
+	movd	xmm6, eax
+	mov	rax, rdx
+	punpckldq xmm6, xmm0
+	xor	rax, 32
+	punpckldq xmm6, xmm2	; xmm6 = the four 32-bit lookup results: eax, r9d, r10d, r11d (low to high)
+	xor	rdx, 48
+	movdqu	xmm2, XMMWORD PTR [rcx+r11]
+	pxor	xmm6, xmm7	; xor with (r9:r8)
+	paddq	xmm2, xmm4
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	movdqu	xmm0, XMMWORD PTR [rdx+r11]
+	paddq	xmm0, xmm5
+	movdqu	XMMWORD PTR [rcx+r11], xmm0	; [off^16] = old [off^48] + xmm5
+	movdqu	XMMWORD PTR [rax+r11], xmm2	; [off^32] = old [off^16] + xmm4
+	movq rcx, xmm13
+	paddq	xmm1, xmm7
+	movdqu	XMMWORD PTR [rdx+r11], xmm1	; [off^48] = old [off^32] + xmm7
+	movq	rdi, xmm6
+	mov	r10, rdi
+	and	r10d, 2097136	; r10 = next offset: low qword of xmm6 masked with 0x1FFFF0
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm4
+	movdqu	XMMWORD PTR [r13], xmm0	; current block = xmm6 xor xmm4
+
+	mov ebx, [rsp+144]
+	mov ebp, [rsp+152]
+	add ebx, [rsp+148]
+	add ebp, [rsp+156]
+	shl rbp, 32
+	or rbx, rbp	; rbx = (d0 + d1) | ((d2 + d3) << 32), d0..d3 = dwords at [rsp+144..+156]
+
+	xor rbx, QWORD PTR [r10+r11]
+	lea	r14, QWORD PTR [r10+r11]	; r14 = address of the second block; part2 writes its result there
+	mov	rbp, QWORD PTR [r14+8]
+
+	mov [rsp+160], rbx	; spill rbx, rdi, rbp, r10; part2 reloads them from these slots
+	mov [rsp+168], rdi
+	mov [rsp+176], rbp
+	mov [rsp+184], r10
+	mov r10, rsp	; keep the stack pointer in r10 because esp is overwritten below
+
+	mov ebx, [rsp+144]	; ebx, esi, edi, ebp = d0..d3
+	mov esi, [rsp+148]
+	mov edi, [rsp+152]
+	mov ebp, [rsp+156]
+
+	movd esp, xmm7	; esp, r15d, eax, edx = dword 0 of xmm7, dword 2 of xmm7, dword 0 of xmm4, dword 0 of xmm5
+	movaps xmm0, xmm7
+	psrldq xmm0, 8
+	movd r15d, xmm0
+	movd eax, xmm4
+	movd edx, xmm5	; NOTE(review): presumably inputs for code spliced in before part2 - confirm against the user of these PUBLIC labels
+
+CryptonightWOW_soft_aes_template_part2:	; Loop tail: restore rsp, persist ebx/esi/edi/ebp, 64x64 multiply, rotate three blocks, write back, compute next offset, loop.
+	mov rsp, r10	; restore the stack pointer saved in r10 by the mainloop section
+	mov [rsp+144], ebx	; store ebx/esi/edi/ebp back to the dwords at [rsp+144..+156]
+	mov [rsp+148], esi
+	mov [rsp+152], edi
+	mov [rsp+156], ebp
+
+	mov rbx, [rsp+160]	; reload the values spilled at the end of the mainloop section
+	mov rdi, [rsp+168]
+	mov rbp, [rsp+176]
+	mov r10, [rsp+184]
+
+	mov	r9, r10
+	xor	r9, 16	; r9, rcx, r10 = offset xor 16, 32, 48
+	mov	rcx, r10
+	xor	rcx, 32
+	xor	r10, 48
+	mov	rax, rbx
+	mul	rdi	; rdx:rax = rbx * rdi (unsigned 128-bit product)
+	movdqu	xmm2, XMMWORD PTR [r9+r11]
+	movdqu	xmm1, XMMWORD PTR [rcx+r11]
+	paddq	xmm1, xmm7
+	movq	xmm0, rax
+	movq	xmm3, rdx
+	xor	rax, QWORD PTR [r11+rcx+8]	; product halves are xored with the block at off^32 (after being copied to xmm0/xmm3)
+	xor	rdx, QWORD PTR [rcx+r11]
+	punpcklqdq xmm3, xmm0	; xmm3: low qword = product high half, high qword = product low half
+	add	r8, rdx
+	movdqu	xmm0, XMMWORD PTR [r10+r11]
+	pxor	xmm2, xmm3
+	paddq	xmm0, xmm5
+	paddq	xmm2, xmm4
+	movdqu	XMMWORD PTR [r9+r11], xmm0	; [off^16] = old [off^48] + xmm5
+	movdqa	xmm5, xmm4	; shift history: xmm5 <- xmm4 <- xmm6
+	mov	r9, QWORD PTR [rsp+320]
+	movdqa	xmm4, xmm6
+	add	r9, rax
+	movdqu	XMMWORD PTR [rcx+r11], xmm2	; [off^32] = (old [off^16] xor xmm3) + old xmm4
+	movdqu	XMMWORD PTR [r10+r11], xmm1	; [off^48] = old [off^32] + xmm7
+	mov	r10, QWORD PTR [rsp+304]	; reload the context pointer stored by part1 at entry ([rsp+8] before the pushes)
+	movd r12d, xmm11	; restore the loop counter parked in xmm11
+	mov	QWORD PTR [r14], r8	; write (r9:r8) to the block at r14
+	xor	r8, rbx
+	mov	rax, r8
+	mov	QWORD PTR [r14+8], r9
+	and	eax, 2097136	; next offset = r8 masked with 0x1FFFF0
+	xor	r9, rbp
+	mov	QWORD PTR [rsp+320], r9	; spill r9 and the offset for the next iteration
+	mov	QWORD PTR [rsp+328], rax
+	sub	r12d, 1
+	jne	CryptonightWOW_soft_aes_template_mainloop
+
+CryptonightWOW_soft_aes_template_part3:	; Epilogue: undo part1 - restore xmm6-xmm13, release the frame, pop the saved registers, return.
+	movaps	xmm6, XMMWORD PTR [rsp+16]	; same slots part1 saved them to
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+48]
+	movaps	xmm9, XMMWORD PTR [rsp+64]
+	movaps	xmm10, XMMWORD PTR [rsp+80]
+	movaps	xmm11, XMMWORD PTR [rsp+96]
+	movaps	xmm12, XMMWORD PTR [rsp+112]
+	movaps	xmm13, XMMWORD PTR [rsp+128]
+
+	add	rsp, 232	; matches "sub rsp, 232" in part1
+	pop	r15	; pops mirror the push order of part1
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	ret
+CryptonightWOW_soft_aes_template_end:	; end marker label; no code follows it in this file
--- a/src/crypto/cn/asm/CryptonightWOW_template.inc
+++ b/src/crypto/cn/asm/CryptonightWOW_template.inc
@@ -0,0 +1,491 @@
+PUBLIC FN_PREFIX(CryptonightWOW_template_part1)
+PUBLIC FN_PREFIX(CryptonightWOW_template_mainloop)
+PUBLIC FN_PREFIX(CryptonightWOW_template_part2)
+PUBLIC FN_PREFIX(CryptonightWOW_template_part3)
+PUBLIC FN_PREFIX(CryptonightWOW_template_end)
+PUBLIC FN_PREFIX(CryptonightWOW_template_double_part1)
+PUBLIC FN_PREFIX(CryptonightWOW_template_double_mainloop)
+PUBLIC FN_PREFIX(CryptonightWOW_template_double_part2)
+PUBLIC FN_PREFIX(CryptonightWOW_template_double_part3)
+PUBLIC FN_PREFIX(CryptonightWOW_template_double_part4)
+PUBLIC FN_PREFIX(CryptonightWOW_template_double_end)
+
+ALIGN(64)
+FN_PREFIX(CryptonightWOW_template_part1):	/* Prologue of the single-context loop: save registers, load the initial state, park rsp in xmm9 so rsp can carry data. */
+	mov	rcx, [rcx]	/* rcx = first qword of the block rcx pointed to; used below as the context base */
+
+	mov	QWORD PTR [rsp+16], rbx	/* rbx/rbp/rsi go to slots above the return address; part3 reads them at [rsp+136..+152] (16 + 7 pushes * 8 + 64) */
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	r10
+	push	r11
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	push	rdi
+	sub	rsp, 64	/* room for xmm6-xmm9 */
+	mov	r12, rcx
+	mov	r8, QWORD PTR [r12+32]
+	mov	rdx, r12	/* rdx keeps the context base from here on */
+	xor	r8, QWORD PTR [r12]	/* r8 = [ctx+32] xor [ctx] */
+	mov	r15, QWORD PTR [r12+40]
+	mov	r9, r8
+	xor	r15, QWORD PTR [r12+8]	/* r15 = [ctx+40] xor [ctx+8] */
+	mov	r11, QWORD PTR [r12+224]	/* r11 = pointer stored at ctx+224; base of every [..+r11] access in the loop */
+	mov	r12, QWORD PTR [r12+56]
+	xor	r12, QWORD PTR [rdx+24]
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	movaps	XMMWORD PTR [rsp+48], xmm6	/* save xmm6-xmm9; part3 restores them */
+	movq	xmm0, r12
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	movaps	XMMWORD PTR [rsp], xmm9
+	mov	r12, QWORD PTR [rdx+88]
+	xor	r12, QWORD PTR [rdx+72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm6, xmm0	/* xmm6: low qword = [ctx+48] xor [ctx+16], high qword = [ctx+56] xor [ctx+24] */
+	and	r9d, 2097136	/* 2097136 = 0x1FFFF0: keep bits 4..20 of r8 as a 16-byte-aligned offset */
+	movq	xmm0, r12
+	movq	xmm7, rax
+	punpcklqdq xmm7, xmm0	/* xmm7: low qword = [ctx+80] xor [ctx+64], high qword = [ctx+88] xor [ctx+72] */
+	mov r10d, r9d
+	movq	xmm9, rsp	/* park the real stack pointer in xmm9 ... */
+	mov rsp, r8	/* ... and use rsp as a data register (it holds r8) until part3 */
+	mov	r8d, 524288	/* loop counter: 524288 (0x80000) iterations */
+
+	mov	ebx, [rdx+96]	/* ebx, esi, edi, ebp = the four dwords at ctx+96..+108 */
+	mov	esi, [rdx+100]
+	mov	edi, [rdx+104]
+	mov	ebp, [rdx+108]
+
+	ALIGN(64)
+FN_PREFIX(CryptonightWOW_template_mainloop):	/* Loop head: AES round on the block at [r9+r11], rotation of the three sibling blocks, then operand setup for part2. */
+	movdqa	xmm5, XMMWORD PTR [r9+r11]	/* aligned load of the current 16-byte block */
+	movq	xmm0, r15
+	movq	xmm4, rsp	/* rsp carries data here (real stack pointer is parked in xmm9) */
+	punpcklqdq xmm4, xmm0	/* xmm4: low qword = rsp value, high qword = r15 */
+	lea	rdx, QWORD PTR [r9+r11]	/* rdx = address of the current block */
+
+	aesenc	xmm5, xmm4	/* one AES encryption round with xmm4 as the round key */
+	movd	r10d, xmm5
+	and	r10d, 2097136	/* r10d = next offset: low dword of the result masked with 0x1FFFF0 */
+
+	mov	r12d, r9d
+	mov	eax, r9d
+	xor	r9d, 48	/* r12d, eax, r9d = offset xor 16, 32, 48 */
+	xor	r12d, 16
+	xor	eax, 32
+	movdqu	xmm0, XMMWORD PTR [r9+r11]
+	movdqu	xmm2, XMMWORD PTR [r12+r11]
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	paddq	xmm0, xmm7
+	paddq	xmm2, xmm6
+	paddq	xmm1, xmm4
+	movdqu	XMMWORD PTR [r12+r11], xmm0	/* [off^16] = old [off^48] + xmm7 */
+	movq	r12, xmm5	/* r12 = low qword of the AES result (multiplier in part2) */
+	movdqu	XMMWORD PTR [rax+r11], xmm2	/* [off^32] = old [off^16] + xmm6 */
+	movdqu	XMMWORD PTR [r9+r11], xmm1	/* [off^48] = old [off^32] + xmm4 */
+
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [rdx], xmm0	/* current block = AES result xor xmm6 */
+
+	lea	r13d, [ebx+esi]	/* r13 = (ebx + esi) | ((edi + ebp) << 32), 32-bit sums */
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	r13, rdx
+
+	xor	r13, QWORD PTR [r10+r11]
+	mov	r14, QWORD PTR [r10+r11+8]
+
+	movd eax, xmm6	/* eax, edx, r9d = dword 0 of xmm6, dword 0 of xmm7, dword 2 of xmm7 */
+	movd edx, xmm7
+	pextrd r9d, xmm7, 2	/* NOTE(review): presumably inputs for code spliced in before part2 - confirm against the user of these PUBLIC labels */
+
+FN_PREFIX(CryptonightWOW_template_part2):	/* Loop tail: 64x64 multiply, rotate three blocks, write back, compute the next offset, loop. */
+	mov	rax, r13
+	mul	r12	/* rdx:rax = r13 * r12 (unsigned 128-bit product) */
+	movq	xmm0, rax
+	movq	xmm3, rdx
+	punpcklqdq xmm3, xmm0	/* xmm3: low qword = product high half, high qword = product low half */
+
+	mov	r9d, r10d
+	mov	r12d, r10d
+	xor	r9d, 16	/* r9d, r12d, r10d = offset xor 16, 32, 48 */
+	xor	r12d, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [r12+r11]
+	xor	rdx, QWORD PTR [r12+r11]	/* product halves are xored with the block at off^32 */
+	xor	rax, QWORD PTR [r11+r12+8]
+	movdqa	xmm2, XMMWORD PTR [r9+r11]
+	pxor	xmm3, xmm2
+	paddq	xmm7, XMMWORD PTR [r10+r11]
+	paddq	xmm1, xmm4
+	paddq	xmm3, xmm6
+	movdqu	XMMWORD PTR [r9+r11], xmm7	/* [off^16] = old [off^48] + xmm7 */
+	movdqu	XMMWORD PTR [r12+r11], xmm3	/* [off^32] = (old [off^16] xor product) + xmm6 */
+	movdqu	XMMWORD PTR [r10+r11], xmm1	/* [off^48] = old [off^32] + xmm4 */
+
+	movdqa	xmm7, xmm6	/* shift history: xmm7 <- xmm6 <- xmm5 (below) */
+	add	r15, rax
+	add	rsp, rdx	/* rsp still carries data, not the stack pointer */
+	xor	r10, 48	/* undo the xor 48 above: r10 is the block offset again */
+	mov	QWORD PTR [r10+r11], rsp	/* write (r15:rsp) to the block at the offset */
+	xor	rsp, r13
+	mov	r9d, esp
+	mov	QWORD PTR [r10+r11+8], r15
+	and	r9d, 2097136	/* next offset = low dword of rsp masked with 0x1FFFF0 */
+	xor	r15, r14
+	movdqa	xmm6, xmm5
+	dec	r8d	/* r8d is the loop counter set in part1 */
+	jnz	FN_PREFIX(CryptonightWOW_template_mainloop)
+
+FN_PREFIX(CryptonightWOW_template_part3):	/* Epilogue: recover the real stack pointer, restore saved registers, return. */
+	movq	rsp, xmm9	/* real stack pointer was parked in xmm9 by part1 */
+
+	mov	rbx, QWORD PTR [rsp+136]	/* rbx/rbp/rsi from the slots part1 wrote at [rsp+16..+32] before its pushes */
+	mov	rbp, QWORD PTR [rsp+144]
+	mov	rsi, QWORD PTR [rsp+152]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+16]
+	movaps	xmm9, XMMWORD PTR [rsp]	/* xmm9 itself is restored last, after rsp has been taken from it */
+	add	rsp, 64	/* matches "sub rsp, 64" in part1 */
+	pop	rdi	/* pops mirror the push order of part1 */
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	r11
+	pop	r10
+	ret	0
+FN_PREFIX(CryptonightWOW_template_end):	/* end marker label for the single-context template */
+
+ALIGN(64)
+FN_PREFIX(CryptonightWOW_template_double_part1):	/* Prologue of the two-context loop: save registers and load the initial state of both contexts (ctx0, ctx1). */
+	mov	rdx, [rcx+8]	/* rdx = second pointer of the pair rcx points to (ctx1) */
+	mov	rcx, [rcx]	/* rcx = first pointer (ctx0) */
+
+	mov	QWORD PTR [rsp+24], rbx	/* rbx goes to a slot above the return address; part4 reads it at [rsp+400] (24 + 7 pushes * 8 + 320) */
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 320
+	mov	r14, QWORD PTR [rcx+32]
+	mov	r8, rcx	/* r8 keeps ctx0 while rcx is reused as scratch */
+	xor	r14, QWORD PTR [rcx]	/* r14 = [ctx0+32] xor [ctx0] */
+	mov	r12, QWORD PTR [rcx+40]
+	mov	ebx, r14d
+	mov	rsi, QWORD PTR [rcx+224]	/* rsi = pointer stored at ctx0+224 (base for ctx0 memory accesses) */
+	and	ebx, 2097136	/* ebx = ctx0 offset: r14 masked with 0x1FFFF0 (bits 4..20) */
+	xor	r12, QWORD PTR [rcx+8]	/* r12 = [ctx0+40] xor [ctx0+8] */
+	mov	rcx, QWORD PTR [rcx+56]
+	xor	rcx, QWORD PTR [r8+24]
+	mov	rax, QWORD PTR [r8+48]
+	xor	rax, QWORD PTR [r8+16]
+	mov	r15, QWORD PTR [rdx+32]
+	xor	r15, QWORD PTR [rdx]	/* r15 = [ctx1+32] xor [ctx1] */
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r8+88]
+	xor	rcx, QWORD PTR [r8+72]
+	mov	r13, QWORD PTR [rdx+40]
+	mov	rdi, QWORD PTR [rdx+224]	/* rdi = pointer stored at ctx1+224 (base for ctx1 memory accesses) */
+	xor	r13, QWORD PTR [rdx+8]	/* r13 = [ctx1+40] xor [ctx1+8] */
+	movaps	XMMWORD PTR [rsp+160], xmm6	/* save xmm6-xmm15; part4 restores them */
+	movaps	XMMWORD PTR [rsp+176], xmm7
+	movaps	XMMWORD PTR [rsp+192], xmm8
+	movaps	XMMWORD PTR [rsp+208], xmm9
+	movaps	XMMWORD PTR [rsp+224], xmm10
+	movaps	XMMWORD PTR [rsp+240], xmm11
+	movaps	XMMWORD PTR [rsp+256], xmm12
+	movaps	XMMWORD PTR [rsp+272], xmm13
+	movaps	XMMWORD PTR [rsp+288], xmm14
+	movaps	XMMWORD PTR [rsp+304], xmm15
+	movq	xmm7, rax
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+
+	movaps xmm1, XMMWORD PTR [rdx+96]	/* copy 16 bytes from ctx1+96 to [rsp] and from ctx0+96 to [rsp+16] */
+	movaps xmm2, XMMWORD PTR [r8+96]
+	movaps XMMWORD PTR [rsp], xmm1
+	movaps XMMWORD PTR [rsp+16], xmm2
+
+	mov	r8d, r15d
+	punpcklqdq xmm7, xmm0	/* xmm7 (ctx0): low = [48] xor [16], high = [56] xor [24] */
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+56]
+	xor	rcx, QWORD PTR [rdx+24]
+	movq	xmm9, rax
+	mov	QWORD PTR [rsp+128], rsi
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	punpcklqdq xmm9, xmm0	/* xmm9 (ctx0): low = [80] xor [64], high = [88] xor [72] */
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+88]
+	xor	rcx, QWORD PTR [rdx+72]
+	movq	xmm8, rax
+	mov	QWORD PTR [rsp+136], rdi
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm8, xmm0	/* xmm8 (ctx1): low = [48] xor [16], high = [56] xor [24] */
+	and	r8d, 2097136	/* r8d = ctx1 offset: r15 masked with 0x1FFFF0 */
+	movq	xmm0, rcx
+	mov	r11d, 524288	/* loop counter: 524288 (0x80000) iterations */
+	movq	xmm10, rax
+	punpcklqdq xmm10, xmm0	/* xmm10 (ctx1): low = [80] xor [64], high = [88] xor [72] */
+	
+	movq xmm14, QWORD PTR [rsp+128]	/* xmm14 / xmm15 = the two base pointers (rsi / rdi values stored above) */
+	movq xmm15, QWORD PTR [rsp+136]
+
+	ALIGN(64)
+FN_PREFIX(CryptonightWOW_template_double_mainloop):	/* Loop head: AES round + block rotation for ctx0 (base rsi) and ctx1 (base rdi), then operand setup for part2. */
+	movdqu	xmm6, XMMWORD PTR [rbx+rsi]	/* ctx0: current 16-byte block */
+	movq	xmm0, r12
+	mov	ecx, ebx	/* ecx keeps the unmodified offset for the write-back below */
+	movq	xmm3, r14
+	punpcklqdq xmm3, xmm0	/* xmm3: low qword = r14, high qword = r12 */
+	xor	ebx, 16
+	aesenc	xmm6, xmm3	/* one AES encryption round with xmm3 as the round key */
+	movq	rdx, xmm6	/* rdx = low qword of the result (multiplier in part2) */
+	movq	xmm4, r15
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	xor	ebx, 48	/* ebx steps through off^16, off^32, off^48; rax = off^16 again */
+	paddq	xmm0, xmm7
+	movdqu	xmm1, XMMWORD PTR [rbx+rsi]
+	movdqu	XMMWORD PTR [rbx+rsi], xmm0	/* [off^32] = old [off^16] + xmm7 */
+	paddq	xmm1, xmm3
+	xor	ebx, 16
+	mov	eax, ebx
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	movdqu	XMMWORD PTR [rbx+rsi], xmm1	/* [off^48] = old [off^32] + xmm3 */
+	paddq	xmm0, xmm9
+	movdqu	XMMWORD PTR [rax+rsi], xmm0	/* [off^16] = old [off^48] + xmm9 */
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [rcx+rsi], xmm0	/* current block = AES result xor xmm7 */
+	mov	esi, edx	/* rsi stops being the base pointer here (it stays available in xmm14) */
+	movdqu	xmm5, XMMWORD PTR [r8+rdi]	/* ctx1: same sequence with r8d / rdi / xmm4 / xmm8 / xmm10 */
+	and	esi, 2097136	/* esi = ctx0 next offset (0x1FFFF0 mask) */
+	mov	ecx, r8d
+	movq	xmm0, r13
+	punpcklqdq xmm4, xmm0	/* xmm4: low qword = r15, high qword = r13 */
+	xor	r8d, 16
+	aesenc	xmm5, xmm4
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	xor	r8d, 48
+	paddq	xmm0, xmm8
+	movdqu	xmm1, XMMWORD PTR [r8+rdi]
+	movdqu	XMMWORD PTR [r8+rdi], xmm0
+	paddq	xmm1, xmm4
+	xor	r8d, 16
+	mov	eax, r8d
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	movdqu	XMMWORD PTR [r8+rdi], xmm1
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rdi], xmm0
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm8
+	movdqu	XMMWORD PTR [rcx+rdi], xmm0
+	movq	rdi, xmm5	/* rdi = low qword of the ctx1 result (multiplier in part3) */
+	movq	rcx, xmm14	/* rcx = ctx0 base pointer */
+	mov	ebp, edi
+	mov	r8, QWORD PTR [rcx+rsi]
+	mov	r10, QWORD PTR [rcx+rsi+8]
+	lea	r9, QWORD PTR [rcx+rsi]	/* r9 = address part2 writes the ctx0 result to */
+	xor	esi, 16
+
+	movq xmm0, rsp	/* park rsp and five registers in xmm0/1/2/11/12/13 because they are overwritten below */
+	movq xmm1, rsi
+	movq xmm2, rdi
+	movq xmm11, rbp
+	movq xmm12, r15
+	movq xmm13, rdx
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	mov ebx, DWORD PTR [rsp+16]	/* ebx, esi, edi, ebp = the four ctx0 dwords kept at [rsp+16..+28] */
+	mov esi, DWORD PTR [rsp+20]
+	mov edi, DWORD PTR [rsp+24]
+	mov ebp, DWORD PTR [rsp+28]
+
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+	xor r8, rax	/* r8 ^= (ebx + esi) | ((edi + ebp) << 32) */
+
+	movd esp, xmm3	/* esp, r15d, eax, edx, r9d = dwords 0 and 2 of xmm3, dword 0 of xmm7, dwords 0 and 2 of xmm9 */
+	pextrd r15d, xmm3, 2
+	movd eax, xmm7
+	movd edx, xmm9
+	pextrd r9d, xmm9, 2	/* NOTE(review): presumably inputs for code spliced in before part2 - confirm against the user of these PUBLIC labels */
+
+FN_PREFIX(CryptonightWOW_template_double_part2):	/* Finish the ctx0 half-iteration (multiply, rotate, write back), then set up the ctx1 operands for part3. */
+
+	movq rsp, xmm0	/* restore the stack pointer parked in xmm0 */
+	mov DWORD PTR [rsp+16], ebx	/* persist ebx/esi/edi/ebp to the ctx0 dwords at [rsp+16..+28] */
+	mov DWORD PTR [rsp+20], esi
+	mov DWORD PTR [rsp+24], edi
+	mov DWORD PTR [rsp+28], ebp
+
+	movq rsi, xmm1	/* reload the registers parked at the end of the mainloop section */
+	movq rdi, xmm2
+	movq rbp, xmm11
+	movq r15, xmm12
+	movq rdx, xmm13
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rbx, r8
+	mov	rax, r8
+	mul	rdx	/* rdx:rax = r8 * rdx (unsigned 128-bit product) */
+	and	ebp, 2097136	/* ebp = ctx1 next offset (low dword of xmm5, 0x1FFFF0 mask) */
+	mov	r8, rax
+	movq	xmm1, rdx
+	movq	xmm0, r8
+	punpcklqdq xmm1, xmm0	/* xmm1: low qword = product high half, high qword = product low half */
+	pxor	xmm1, XMMWORD PTR [rcx+rsi]	/* esi already holds off^16 here */
+	xor	esi, 48
+	paddq	xmm1, xmm7
+	movdqu	xmm2, XMMWORD PTR [rsi+rcx]
+	xor	rdx, QWORD PTR [rsi+rcx]	/* product halves are xored with the block at off^32 */
+	paddq	xmm2, xmm3
+	xor	r8, QWORD PTR [rsi+rcx+8]
+	movdqu	XMMWORD PTR [rsi+rcx], xmm1	/* [off^32] = (old [off^16] xor product) + xmm7 */
+	xor	esi, 16
+	mov	eax, esi
+	mov	rsi, rcx	/* rsi is the ctx0 base pointer again */
+	movdqu	xmm0, XMMWORD PTR [rax+rcx]
+	movdqu	XMMWORD PTR [rax+rcx], xmm2	/* [off^48] = old [off^32] + xmm3 */
+	paddq	xmm0, xmm9
+	add	r12, r8
+	xor	rax, 32
+	add	r14, rdx
+	movdqa	xmm9, xmm7	/* shift ctx0 history: xmm9 <- xmm7 <- xmm6 */
+	movdqa	xmm7, xmm6
+	movdqu	XMMWORD PTR [rax+rcx], xmm0	/* [off^16] = old [off^48] + old xmm9 */
+	mov	QWORD PTR [r9+8], r12	/* write (r12:r14) to the ctx0 block at r9 */
+	xor	r12, r10
+	mov	QWORD PTR [r9], r14
+	movq rcx, xmm15	/* rcx = ctx1 base pointer from here on */
+	xor	r14, rbx
+	mov	r10d, ebp
+	mov	ebx, r14d
+	xor	ebp, 16
+	and	ebx, 2097136	/* ebx = ctx0 offset for the next iteration */
+	mov	r8, QWORD PTR [r10+rcx]
+	mov	r9, QWORD PTR [r10+rcx+8]
+
+	movq xmm0, rsp	/* park rsp and registers again; part3 reloads them */
+	movq xmm1, rbx
+	movq xmm2, rsi
+	movq xmm11, rdi
+	movq xmm12, rbp
+	movq xmm13, r15
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	mov ebx, DWORD PTR [rsp]	/* ebx, esi, edi, ebp = the four ctx1 dwords kept at [rsp..+12] */
+	mov esi, DWORD PTR [rsp+4]
+	mov edi, DWORD PTR [rsp+8]
+	mov ebp, DWORD PTR [rsp+12]
+
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+
+	xor r8, rax	/* r8 ^= (ebx + esi) | ((edi + ebp) << 32) */
+	movq xmm3, r8	/* keep a copy of r8 in xmm3; part3 reads it back into rax */
+
+	movd esp, xmm4	/* esp, r15d, eax, edx, r9d = dwords 0 and 2 of xmm4, dword 0 of xmm8, dwords 0 and 2 of xmm10 */
+	pextrd r15d, xmm4, 2
+	movd eax, xmm8
+	movd edx, xmm10
+	pextrd r9d, xmm10, 2	/* NOTE(review): presumably inputs for code spliced in before part3 - confirm against the user of these PUBLIC labels */
+
+FN_PREFIX(CryptonightWOW_template_double_part3):	/* Finish the ctx1 half-iteration (multiply, rotate, write back), compute its next offset, loop. */
+
+	movq rsp, xmm0	/* restore the stack pointer parked in xmm0 */
+	mov DWORD PTR [rsp], ebx	/* persist ebx/esi/edi/ebp to the ctx1 dwords at [rsp..+12] */
+	mov DWORD PTR [rsp+4], esi
+	mov DWORD PTR [rsp+8], edi
+	mov DWORD PTR [rsp+12], ebp
+
+	movq rbx, xmm1	/* reload the registers parked at the end of part2 */
+	movq rsi, xmm2
+	movq rdi, xmm11
+	movq rbp, xmm12
+	movq r15, xmm13
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rax, r8
+	mul	rdi	/* rdx:rax = r8 * rdi (unsigned 128-bit product) */
+	movq	xmm1, rdx
+	movq	xmm0, rax
+	punpcklqdq xmm1, xmm0	/* xmm1: low qword = product high half, high qword = product low half */
+	mov	rdi, rcx	/* rdi is the ctx1 base pointer again */
+	mov	r8, rax
+	pxor	xmm1, XMMWORD PTR [rbp+rcx]	/* ebp already holds off^16 here */
+	xor	ebp, 48
+	paddq	xmm1, xmm8
+	xor	r8, QWORD PTR [rbp+rcx+8]	/* product halves are xored with the block at off^32 */
+	xor	rdx, QWORD PTR [rbp+rcx]
+	add	r13, r8
+	movdqu	xmm2, XMMWORD PTR [rbp+rcx]
+	add	r15, rdx
+	movdqu	XMMWORD PTR [rbp+rcx], xmm1	/* [off^32] = (old [off^16] xor product) + xmm8 */
+	paddq	xmm2, xmm4
+	xor	ebp, 16
+	mov	eax, ebp
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbp+rcx]
+	movdqu	XMMWORD PTR [rbp+rcx], xmm2	/* [off^48] = old [off^32] + xmm4 */
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rcx], xmm0	/* [off^16] = old [off^48] + xmm10 */
+	movq rax, xmm3	/* rax = the r8 value saved in xmm3 by part2 */
+	movdqa	xmm10, xmm8	/* shift ctx1 history: xmm10 <- xmm8 <- xmm5 */
+	mov	QWORD PTR [r10+rcx], r15	/* write (r13:r15) to the ctx1 block at [r10+rcx] */
+	movdqa	xmm8, xmm5
+	xor	r15, rax
+	mov	QWORD PTR [r10+rcx+8], r13
+	mov	r8d, r15d
+	xor	r13, r9
+	and	r8d, 2097136	/* r8d = ctx1 offset for the next iteration (0x1FFFF0 mask) */
+	dec r11d	/* r11d is the loop counter set in part1 */
+	jnz	FN_PREFIX(CryptonightWOW_template_double_mainloop)
+
+FN_PREFIX(CryptonightWOW_template_double_part4):	/* Epilogue: undo double_part1 - restore rbx and xmm6-xmm15, release the frame, pop saved registers, return. */
+
+	mov	rbx, QWORD PTR [rsp+400]	/* slot written as [rsp+24] at entry (24 + 7 pushes * 8 + 320) */
+	movaps	xmm6, XMMWORD PTR [rsp+160]	/* same slots double_part1 saved them to */
+	movaps	xmm7, XMMWORD PTR [rsp+176]
+	movaps	xmm8, XMMWORD PTR [rsp+192]
+	movaps	xmm9, XMMWORD PTR [rsp+208]
+	movaps	xmm10, XMMWORD PTR [rsp+224]
+	movaps	xmm11, XMMWORD PTR [rsp+240]
+	movaps	xmm12, XMMWORD PTR [rsp+256]
+	movaps	xmm13, XMMWORD PTR [rsp+272]
+	movaps	xmm14, XMMWORD PTR [rsp+288]
+	movaps	xmm15, XMMWORD PTR [rsp+304]
+	add	rsp, 320	/* matches "sub rsp, 320" in double_part1 */
+	pop	r15	/* pops mirror the push order of double_part1 */
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	ret	0
+FN_PREFIX(CryptonightWOW_template_double_end):	/* end marker label for the two-context template */
--- a/src/crypto/cn/asm/CryptonightWOW_template_win.inc
+++ b/src/crypto/cn/asm/CryptonightWOW_template_win.inc
@@ -0,0 +1,491 @@
+PUBLIC CryptonightWOW_template_part1
+PUBLIC CryptonightWOW_template_mainloop
+PUBLIC CryptonightWOW_template_part2
+PUBLIC CryptonightWOW_template_part3
+PUBLIC CryptonightWOW_template_end
+PUBLIC CryptonightWOW_template_double_part1
+PUBLIC CryptonightWOW_template_double_mainloop
+PUBLIC CryptonightWOW_template_double_part2
+PUBLIC CryptonightWOW_template_double_part3
+PUBLIC CryptonightWOW_template_double_part4
+PUBLIC CryptonightWOW_template_double_end
+
+ALIGN(64)
+CryptonightWOW_template_part1:	; Prologue of the single-context loop: save registers, load the initial state, park rsp in xmm9 so rsp can carry data.
+	mov	rcx, [rcx]	; rcx = first qword of the block rcx pointed to; used below as the context base
+
+	mov	QWORD PTR [rsp+16], rbx	; rbx/rbp/rsi go to slots above the return address; part3 reads them at [rsp+136..+152] (16 + 7 pushes * 8 + 64)
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	r10
+	push	r11
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	push	rdi
+	sub	rsp, 64	; room for xmm6-xmm9
+	mov	r12, rcx
+	mov	r8, QWORD PTR [r12+32]
+	mov	rdx, r12	; rdx keeps the context base from here on
+	xor	r8, QWORD PTR [r12]	; r8 = [ctx+32] xor [ctx]
+	mov	r15, QWORD PTR [r12+40]
+	mov	r9, r8
+	xor	r15, QWORD PTR [r12+8]	; r15 = [ctx+40] xor [ctx+8]
+	mov	r11, QWORD PTR [r12+224]	; r11 = pointer stored at ctx+224; base of every [..+r11] access in the loop
+	mov	r12, QWORD PTR [r12+56]
+	xor	r12, QWORD PTR [rdx+24]
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	movaps	XMMWORD PTR [rsp+48], xmm6	; save xmm6-xmm9; part3 restores them
+	movq	xmm0, r12
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	movaps	XMMWORD PTR [rsp], xmm9
+	mov	r12, QWORD PTR [rdx+88]
+	xor	r12, QWORD PTR [rdx+72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm6, xmm0	; xmm6: low qword = [ctx+48] xor [ctx+16], high qword = [ctx+56] xor [ctx+24]
+	and	r9d, 2097136	; 2097136 = 0x1FFFF0: keep bits 4..20 of r8 as a 16-byte-aligned offset
+	movq	xmm0, r12
+	movq	xmm7, rax
+	punpcklqdq xmm7, xmm0	; xmm7: low qword = [ctx+80] xor [ctx+64], high qword = [ctx+88] xor [ctx+72]
+	mov r10d, r9d
+	movq	xmm9, rsp	; park the real stack pointer in xmm9 ...
+	mov rsp, r8	; ... and use rsp as a data register (it holds r8) until part3
+	mov	r8d, 524288	; loop counter: 524288 (0x80000) iterations
+
+	mov	ebx, [rdx+96]	; ebx, esi, edi, ebp = the four dwords at ctx+96..+108
+	mov	esi, [rdx+100]
+	mov	edi, [rdx+104]
+	mov	ebp, [rdx+108]
+
+	ALIGN(64)
+CryptonightWOW_template_mainloop:	; Loop head: AES round on the block at [r9+r11], rotation of the three sibling blocks, then operand setup for part2.
+	movdqa	xmm5, XMMWORD PTR [r9+r11]	; aligned load of the current 16-byte block
+	movq	xmm0, r15
+	movq	xmm4, rsp	; rsp carries data here (real stack pointer is parked in xmm9)
+	punpcklqdq xmm4, xmm0	; xmm4: low qword = rsp value, high qword = r15
+	lea	rdx, QWORD PTR [r9+r11]	; rdx = address of the current block
+
+	aesenc	xmm5, xmm4	; one AES encryption round with xmm4 as the round key
+	movd	r10d, xmm5
+	and	r10d, 2097136	; r10d = next offset: low dword of the result masked with 0x1FFFF0
+
+	mov	r12d, r9d
+	mov	eax, r9d
+	xor	r9d, 48	; r12d, eax, r9d = offset xor 16, 32, 48
+	xor	r12d, 16
+	xor	eax, 32
+	movdqu	xmm0, XMMWORD PTR [r9+r11]
+	movdqu	xmm2, XMMWORD PTR [r12+r11]
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	paddq	xmm0, xmm7
+	paddq	xmm2, xmm6
+	paddq	xmm1, xmm4
+	movdqu	XMMWORD PTR [r12+r11], xmm0	; [off^16] = old [off^48] + xmm7
+	movq	r12, xmm5	; r12 = low qword of the AES result (multiplier in part2)
+	movdqu	XMMWORD PTR [rax+r11], xmm2	; [off^32] = old [off^16] + xmm6
+	movdqu	XMMWORD PTR [r9+r11], xmm1	; [off^48] = old [off^32] + xmm4
+
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [rdx], xmm0	; current block = AES result xor xmm6
+
+	lea	r13d, [ebx+esi]	; r13 = (ebx + esi) | ((edi + ebp) << 32), 32-bit sums
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	r13, rdx
+
+	xor	r13, QWORD PTR [r10+r11]
+	mov	r14, QWORD PTR [r10+r11+8]
+
+	movd eax, xmm6	; eax, edx, r9d = dword 0 of xmm6, dword 0 of xmm7, dword 2 of xmm7
+	movd edx, xmm7
+	pextrd r9d, xmm7, 2	; NOTE(review): presumably inputs for code spliced in before part2 - confirm against the user of these PUBLIC labels
+
+CryptonightWOW_template_part2:	; Loop tail: 64x64 multiply, rotate three blocks, write back, compute the next offset, loop.
+	mov	rax, r13
+	mul	r12	; rdx:rax = r13 * r12 (unsigned 128-bit product)
+	movq	xmm0, rax
+	movq	xmm3, rdx
+	punpcklqdq xmm3, xmm0	; xmm3: low qword = product high half, high qword = product low half
+
+	mov	r9d, r10d
+	mov	r12d, r10d
+	xor	r9d, 16	; r9d, r12d, r10d = offset xor 16, 32, 48
+	xor	r12d, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [r12+r11]
+	xor	rdx, QWORD PTR [r12+r11]	; product halves are xored with the block at off^32
+	xor	rax, QWORD PTR [r11+r12+8]
+	movdqa	xmm2, XMMWORD PTR [r9+r11]
+	pxor	xmm3, xmm2
+	paddq	xmm7, XMMWORD PTR [r10+r11]
+	paddq	xmm1, xmm4
+	paddq	xmm3, xmm6
+	movdqu	XMMWORD PTR [r9+r11], xmm7	; [off^16] = old [off^48] + xmm7
+	movdqu	XMMWORD PTR [r12+r11], xmm3	; [off^32] = (old [off^16] xor product) + xmm6
+	movdqu	XMMWORD PTR [r10+r11], xmm1	; [off^48] = old [off^32] + xmm4
+
+	movdqa	xmm7, xmm6	; shift history: xmm7 <- xmm6 <- xmm5 (below)
+	add	r15, rax
+	add	rsp, rdx	; rsp still carries data, not the stack pointer
+	xor	r10, 48	; undo the xor 48 above: r10 is the block offset again
+	mov	QWORD PTR [r10+r11], rsp	; write (r15:rsp) to the block at the offset
+	xor	rsp, r13
+	mov	r9d, esp
+	mov	QWORD PTR [r10+r11+8], r15
+	and	r9d, 2097136	; next offset = low dword of rsp masked with 0x1FFFF0
+	xor	r15, r14
+	movdqa	xmm6, xmm5
+	dec	r8d	; r8d is the loop counter set in part1
+	jnz	CryptonightWOW_template_mainloop
+
+CryptonightWOW_template_part3:	; Epilogue: recover the real stack pointer, restore saved registers, return.
+	movq	rsp, xmm9	; real stack pointer was parked in xmm9 by part1
+
+	mov	rbx, QWORD PTR [rsp+136]	; rbx/rbp/rsi from the slots part1 wrote at [rsp+16..+32] before its pushes
+	mov	rbp, QWORD PTR [rsp+144]
+	mov	rsi, QWORD PTR [rsp+152]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+16]
+	movaps	xmm9, XMMWORD PTR [rsp]	; xmm9 itself is restored last, after rsp has been taken from it
+	add	rsp, 64	; matches "sub rsp, 64" in part1
+	pop	rdi	; pops mirror the push order of part1
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	r11
+	pop	r10
+	ret	0
+CryptonightWOW_template_end:	; end marker label for the single-context template
+
+ALIGN(64)
+CryptonightWOW_template_double_part1:	; Prologue of the two-context loop: save registers and load the initial state of both contexts (ctx0, ctx1).
+	mov	rdx, [rcx+8]	; rdx = second pointer of the pair rcx points to (ctx1)
+	mov	rcx, [rcx]	; rcx = first pointer (ctx0)
+
+	mov	QWORD PTR [rsp+24], rbx	; rbx goes to a slot above the return address; part4 reads it at [rsp+400] (24 + 7 pushes * 8 + 320)
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 320
+	mov	r14, QWORD PTR [rcx+32]
+	mov	r8, rcx	; r8 keeps ctx0 while rcx is reused as scratch
+	xor	r14, QWORD PTR [rcx]	; r14 = [ctx0+32] xor [ctx0]
+	mov	r12, QWORD PTR [rcx+40]
+	mov	ebx, r14d
+	mov	rsi, QWORD PTR [rcx+224]	; rsi = pointer stored at ctx0+224 (base for ctx0 memory accesses)
+	and	ebx, 2097136	; ebx = ctx0 offset: r14 masked with 0x1FFFF0 (bits 4..20)
+	xor	r12, QWORD PTR [rcx+8]	; r12 = [ctx0+40] xor [ctx0+8]
+	mov	rcx, QWORD PTR [rcx+56]
+	xor	rcx, QWORD PTR [r8+24]
+	mov	rax, QWORD PTR [r8+48]
+	xor	rax, QWORD PTR [r8+16]
+	mov	r15, QWORD PTR [rdx+32]
+	xor	r15, QWORD PTR [rdx]	; r15 = [ctx1+32] xor [ctx1]
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r8+88]
+	xor	rcx, QWORD PTR [r8+72]
+	mov	r13, QWORD PTR [rdx+40]
+	mov	rdi, QWORD PTR [rdx+224]	; rdi = pointer stored at ctx1+224 (base for ctx1 memory accesses)
+	xor	r13, QWORD PTR [rdx+8]	; r13 = [ctx1+40] xor [ctx1+8]
+	movaps	XMMWORD PTR [rsp+160], xmm6	; save xmm6-xmm15; part4 restores them
+	movaps	XMMWORD PTR [rsp+176], xmm7
+	movaps	XMMWORD PTR [rsp+192], xmm8
+	movaps	XMMWORD PTR [rsp+208], xmm9
+	movaps	XMMWORD PTR [rsp+224], xmm10
+	movaps	XMMWORD PTR [rsp+240], xmm11
+	movaps	XMMWORD PTR [rsp+256], xmm12
+	movaps	XMMWORD PTR [rsp+272], xmm13
+	movaps	XMMWORD PTR [rsp+288], xmm14
+	movaps	XMMWORD PTR [rsp+304], xmm15
+	movq	xmm7, rax
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+
+	movaps xmm1, XMMWORD PTR [rdx+96]	; copy 16 bytes from ctx1+96 to [rsp] and from ctx0+96 to [rsp+16]
+	movaps xmm2, XMMWORD PTR [r8+96]
+	movaps XMMWORD PTR [rsp], xmm1
+	movaps XMMWORD PTR [rsp+16], xmm2
+
+	mov	r8d, r15d
+	punpcklqdq xmm7, xmm0	; xmm7 (ctx0): low = [48] xor [16], high = [56] xor [24]
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+56]
+	xor	rcx, QWORD PTR [rdx+24]
+	movq	xmm9, rax
+	mov	QWORD PTR [rsp+128], rsi
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	punpcklqdq xmm9, xmm0	; xmm9 (ctx0): low = [80] xor [64], high = [88] xor [72]
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+88]
+	xor	rcx, QWORD PTR [rdx+72]
+	movq	xmm8, rax
+	mov	QWORD PTR [rsp+136], rdi
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm8, xmm0	; xmm8 (ctx1): low = [48] xor [16], high = [56] xor [24]
+	and	r8d, 2097136	; r8d = ctx1 offset: r15 masked with 0x1FFFF0
+	movq	xmm0, rcx
+	mov	r11d, 524288	; loop counter: 524288 (0x80000) iterations
+	movq	xmm10, rax
+	punpcklqdq xmm10, xmm0	; xmm10 (ctx1): low = [80] xor [64], high = [88] xor [72]
+	
+	movq xmm14, QWORD PTR [rsp+128]	; xmm14 / xmm15 = the two base pointers (rsi / rdi values stored above)
+	movq xmm15, QWORD PTR [rsp+136]
+
+	ALIGN(64)
+CryptonightWOW_template_double_mainloop:	; Loop head: AES round + block rotation for ctx0 (base rsi) and ctx1 (base rdi), then operand setup for part2.
+	movdqu	xmm6, XMMWORD PTR [rbx+rsi]	; ctx0: current 16-byte block
+	movq	xmm0, r12
+	mov	ecx, ebx	; ecx keeps the unmodified offset for the write-back below
+	movq	xmm3, r14
+	punpcklqdq xmm3, xmm0	; xmm3: low qword = r14, high qword = r12
+	xor	ebx, 16
+	aesenc	xmm6, xmm3	; one AES encryption round with xmm3 as the round key
+	movq	rdx, xmm6	; rdx = low qword of the result (multiplier in part2)
+	movq	xmm4, r15
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	xor	ebx, 48	; ebx steps through off^16, off^32, off^48; rax = off^16 again
+	paddq	xmm0, xmm7
+	movdqu	xmm1, XMMWORD PTR [rbx+rsi]
+	movdqu	XMMWORD PTR [rbx+rsi], xmm0	; [off^32] = old [off^16] + xmm7
+	paddq	xmm1, xmm3
+	xor	ebx, 16
+	mov	eax, ebx
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	movdqu	XMMWORD PTR [rbx+rsi], xmm1	; [off^48] = old [off^32] + xmm3
+	paddq	xmm0, xmm9
+	movdqu	XMMWORD PTR [rax+rsi], xmm0	; [off^16] = old [off^48] + xmm9
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [rcx+rsi], xmm0	; current block = AES result xor xmm7
+	mov	esi, edx	; rsi stops being the base pointer here (it stays available in xmm14)
+	movdqu	xmm5, XMMWORD PTR [r8+rdi]	; ctx1: same sequence with r8d / rdi / xmm4 / xmm8 / xmm10
+	and	esi, 2097136	; esi = ctx0 next offset (0x1FFFF0 mask)
+	mov	ecx, r8d
+	movq	xmm0, r13
+	punpcklqdq xmm4, xmm0	; xmm4: low qword = r15, high qword = r13
+	xor	r8d, 16
+	aesenc	xmm5, xmm4
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	xor	r8d, 48
+	paddq	xmm0, xmm8
+	movdqu	xmm1, XMMWORD PTR [r8+rdi]
+	movdqu	XMMWORD PTR [r8+rdi], xmm0
+	paddq	xmm1, xmm4
+	xor	r8d, 16
+	mov	eax, r8d
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	movdqu	XMMWORD PTR [r8+rdi], xmm1
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rdi], xmm0
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm8
+	movdqu	XMMWORD PTR [rcx+rdi], xmm0
+	movq	rdi, xmm5	; rdi = low qword of the ctx1 result (multiplier in part3)
+	movq	rcx, xmm14	; rcx = ctx0 base pointer
+	mov	ebp, edi
+	mov	r8, QWORD PTR [rcx+rsi]
+	mov	r10, QWORD PTR [rcx+rsi+8]
+	lea	r9, QWORD PTR [rcx+rsi]	; r9 = address part2 writes the ctx0 result to
+	xor	esi, 16
+
+	movq xmm0, rsp	; park rsp and five registers in xmm0/1/2/11/12/13 because they are overwritten below
+	movq xmm1, rsi
+	movq xmm2, rdi
+	movq xmm11, rbp
+	movq xmm12, r15
+	movq xmm13, rdx
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	mov ebx, DWORD PTR [rsp+16]	; ebx, esi, edi, ebp = the four ctx0 dwords kept at [rsp+16..+28]
+	mov esi, DWORD PTR [rsp+20]
+	mov edi, DWORD PTR [rsp+24]
+	mov ebp, DWORD PTR [rsp+28]
+
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+	xor r8, rax	; r8 ^= (ebx + esi) | ((edi + ebp) << 32)
+
+	movd esp, xmm3	; esp, r15d, eax, edx, r9d = dwords 0 and 2 of xmm3, dword 0 of xmm7, dwords 0 and 2 of xmm9
+	pextrd r15d, xmm3, 2
+	movd eax, xmm7
+	movd edx, xmm9
+	pextrd r9d, xmm9, 2	; NOTE(review): presumably inputs for code spliced in before part2 - confirm against the user of these PUBLIC labels
+
+CryptonightWOW_template_double_part2:	; Finish the ctx0 half-iteration (multiply, rotate, write back), then set up the ctx1 operands for part3.
+
+	movq rsp, xmm0	; restore the stack pointer parked in xmm0
+	mov DWORD PTR [rsp+16], ebx	; persist ebx/esi/edi/ebp to the ctx0 dwords at [rsp+16..+28]
+	mov DWORD PTR [rsp+20], esi
+	mov DWORD PTR [rsp+24], edi
+	mov DWORD PTR [rsp+28], ebp
+
+	movq rsi, xmm1	; reload the registers parked at the end of the mainloop section
+	movq rdi, xmm2
+	movq rbp, xmm11
+	movq r15, xmm12
+	movq rdx, xmm13
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rbx, r8
+	mov	rax, r8
+	mul	rdx	; rdx:rax = r8 * rdx (unsigned 128-bit product)
+	and	ebp, 2097136	; ebp = ctx1 next offset (low dword of xmm5, 0x1FFFF0 mask)
+	mov	r8, rax
+	movq	xmm1, rdx
+	movq	xmm0, r8
+	punpcklqdq xmm1, xmm0	; xmm1: low qword = product high half, high qword = product low half
+	pxor	xmm1, XMMWORD PTR [rcx+rsi]	; esi already holds off^16 here
+	xor	esi, 48
+	paddq	xmm1, xmm7
+	movdqu	xmm2, XMMWORD PTR [rsi+rcx]
+	xor	rdx, QWORD PTR [rsi+rcx]	; product halves are xored with the block at off^32
+	paddq	xmm2, xmm3
+	xor	r8, QWORD PTR [rsi+rcx+8]
+	movdqu	XMMWORD PTR [rsi+rcx], xmm1	; [off^32] = (old [off^16] xor product) + xmm7
+	xor	esi, 16
+	mov	eax, esi
+	mov	rsi, rcx	; rsi is the ctx0 base pointer again
+	movdqu	xmm0, XMMWORD PTR [rax+rcx]
+	movdqu	XMMWORD PTR [rax+rcx], xmm2	; [off^48] = old [off^32] + xmm3
+	paddq	xmm0, xmm9
+	add	r12, r8
+	xor	rax, 32
+	add	r14, rdx
+	movdqa	xmm9, xmm7	; shift ctx0 history: xmm9 <- xmm7 <- xmm6
+	movdqa	xmm7, xmm6
+	movdqu	XMMWORD PTR [rax+rcx], xmm0	; [off^16] = old [off^48] + old xmm9
+	mov	QWORD PTR [r9+8], r12	; write (r12:r14) to the ctx0 block at r9
+	xor	r12, r10
+	mov	QWORD PTR [r9], r14
+	movq rcx, xmm15	; rcx = ctx1 base pointer from here on
+	xor	r14, rbx
+	mov	r10d, ebp
+	mov	ebx, r14d
+	xor	ebp, 16
+	and	ebx, 2097136	; ebx = ctx0 offset for the next iteration
+	mov	r8, QWORD PTR [r10+rcx]
+	mov	r9, QWORD PTR [r10+rcx+8]
+
+	movq xmm0, rsp	; park rsp and registers again; part3 reloads them
+	movq xmm1, rbx
+	movq xmm2, rsi
+	movq xmm11, rdi
+	movq xmm12, rbp
+	movq xmm13, r15
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	mov ebx, DWORD PTR [rsp]	; ebx, esi, edi, ebp = the four ctx1 dwords kept at [rsp..+12]
+	mov esi, DWORD PTR [rsp+4]
+	mov edi, DWORD PTR [rsp+8]
+	mov ebp, DWORD PTR [rsp+12]
+
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+
+	xor r8, rax	; r8 ^= (ebx + esi) | ((edi + ebp) << 32)
+	movq xmm3, r8	; keep a copy of r8 in xmm3; part3 reads it back into rax
+
+	movd esp, xmm4	; esp, r15d, eax, edx, r9d = dwords 0 and 2 of xmm4, dword 0 of xmm8, dwords 0 and 2 of xmm10
+	pextrd r15d, xmm4, 2
+	movd eax, xmm8
+	movd edx, xmm10
+	pextrd r9d, xmm10, 2	; NOTE(review): presumably inputs for code spliced in before part3 - confirm against the user of these PUBLIC labels
+
+CryptonightWOW_template_double_part3:	; Finish the ctx1 half-iteration (multiply, rotate, write back), compute its next offset, loop.
+
+	movq rsp, xmm0	; restore the stack pointer parked in xmm0
+	mov DWORD PTR [rsp], ebx	; persist ebx/esi/edi/ebp to the ctx1 dwords at [rsp..+12]
+	mov DWORD PTR [rsp+4], esi
+	mov DWORD PTR [rsp+8], edi
+	mov DWORD PTR [rsp+12], ebp
+
+	movq rbx, xmm1	; reload the registers parked at the end of part2
+	movq rsi, xmm2
+	movq rdi, xmm11
+	movq rbp, xmm12
+	movq r15, xmm13
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rax, r8
+	mul	rdi	; rdx:rax = r8 * rdi (unsigned 128-bit product)
+	movq	xmm1, rdx
+	movq	xmm0, rax
+	punpcklqdq xmm1, xmm0	; xmm1: low qword = product high half, high qword = product low half
+	mov	rdi, rcx	; rdi is the ctx1 base pointer again
+	mov	r8, rax
+	pxor	xmm1, XMMWORD PTR [rbp+rcx]	; ebp already holds off^16 here
+	xor	ebp, 48
+	paddq	xmm1, xmm8
+	xor	r8, QWORD PTR [rbp+rcx+8]	; product halves are xored with the block at off^32
+	xor	rdx, QWORD PTR [rbp+rcx]
+	add	r13, r8
+	movdqu	xmm2, XMMWORD PTR [rbp+rcx]
+	add	r15, rdx
+	movdqu	XMMWORD PTR [rbp+rcx], xmm1	; [off^32] = (old [off^16] xor product) + xmm8
+	paddq	xmm2, xmm4
+	xor	ebp, 16
+	mov	eax, ebp
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbp+rcx]
+	movdqu	XMMWORD PTR [rbp+rcx], xmm2	; [off^48] = old [off^32] + xmm4
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rcx], xmm0	; [off^16] = old [off^48] + xmm10
+	movq rax, xmm3	; rax = the r8 value saved in xmm3 by part2
+	movdqa	xmm10, xmm8	; shift ctx1 history: xmm10 <- xmm8 <- xmm5
+	mov	QWORD PTR [r10+rcx], r15	; write (r13:r15) to the ctx1 block at [r10+rcx]
+	movdqa	xmm8, xmm5
+	xor	r15, rax
+	mov	QWORD PTR [r10+rcx+8], r13
+	mov	r8d, r15d
+	xor	r13, r9
+	and	r8d, 2097136	; r8d = ctx1 offset for the next iteration (0x1FFFF0 mask)
+	dec r11d	; r11d is the loop counter set in part1
+	jnz	CryptonightWOW_template_double_mainloop
+
+CryptonightWOW_template_double_part4:	; Epilogue: undo double_part1 - restore rbx and xmm6-xmm15, release the frame, pop saved registers, return.
+
+	mov	rbx, QWORD PTR [rsp+400]	; slot written as [rsp+24] at entry (24 + 7 pushes * 8 + 320)
+	movaps	xmm6, XMMWORD PTR [rsp+160]	; same slots double_part1 saved them to
+	movaps	xmm7, XMMWORD PTR [rsp+176]
+	movaps	xmm8, XMMWORD PTR [rsp+192]
+	movaps	xmm9, XMMWORD PTR [rsp+208]
+	movaps	xmm10, XMMWORD PTR [rsp+224]
+	movaps	xmm11, XMMWORD PTR [rsp+240]
+	movaps	xmm12, XMMWORD PTR [rsp+256]
+	movaps	xmm13, XMMWORD PTR [rsp+272]
+	movaps	xmm14, XMMWORD PTR [rsp+288]
+	movaps	xmm15, XMMWORD PTR [rsp+304]
+	add	rsp, 320	; matches "sub rsp, 320" in double_part1
+	pop	r15	; pops mirror the push order of double_part1
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	ret	0
+CryptonightWOW_template_double_end:	; end marker label for the two-context template
--- a/src/crypto/cn/asm/cn2/cnv2_double_main_loop_sandybridge.inc
+++ b/src/crypto/cn/asm/cn2/cnv2_double_main_loop_sandybridge.inc
@@ -0,0 +1,413 @@
+	mov	rdx, [rcx+8]
+	mov	rcx, [rcx]
+
+	mov	rax, rsp
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184
+
+	stmxcsr DWORD PTR [rsp+272]
+	mov DWORD PTR [rsp+276], 24448
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]
+	mov	r9, rdx
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx
+	xor	r10, QWORD PTR [rcx]
+	mov	r14d, 524288
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rsi, QWORD PTR [rdx+224]
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]
+	movq	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movq	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movq	xmm5, QWORD PTR [r8+104]
+	movq	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movq xmm14, rax
+	punpcklqdq xmm14, xmm14
+
+	mov eax, 1023
+	shl rax, 52
+	movq xmm12, rax
+	punpcklqdq xmm12, xmm12
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movq	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp], r13
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movq	xmm8, rax
+	and	ecx, 2097136
+	punpcklqdq xmm8, xmm0
+	movq	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movq	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]
+	movdqu	xmm11, XMMWORD PTR [r8]
+	punpcklqdq xmm5, xmm0
+	lea	r9, QWORD PTR [rdx+r13]
+	movdqu	xmm15, XMMWORD PTR [r9]
+
+	ALIGN(64)
+main_loop_double_sandybridge:
+	movdqu	xmm9, xmm15
+	mov eax, edx
+	mov ebx, edx
+	xor eax, 16
+	xor ebx, 32
+	xor edx, 48
+
+	movq	xmm0, r11
+	movq	xmm2, r10
+	punpcklqdq xmm2, xmm0
+	aesenc	xmm9, xmm2
+
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0
+
+	movq	r11, xmm9
+	mov	edx, r11d
+	and	edx, 2097136
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0
+
+	lea	rbx, QWORD PTR [rdx+r13]
+	mov	r10, QWORD PTR [rdx+r13]
+
+	movdqu	xmm10, xmm11
+	movq	xmm0, rbp
+	movq	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movq	rcx, xmm10
+	and	ecx, 2097136
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov r12, QWORD PTR [rcx+rsi]
+
+	mov	r9, QWORD PTR [rbx+8]
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx
+
+	movq	rdx, xmm5
+	shl	rdx, 32
+	movq	rax, xmm4
+	xor	rdx, rax
+	xor	r10, rdx
+	mov	rax, r10
+	mul	r11
+	mov r11d, r8d
+	xor r11d, 48
+	movq xmm0, rdx
+	xor rdx, [r11+r13]
+	movq xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1
+
+	pxor xmm0, XMMWORD PTR [r8+r13]
+	xor	r8d, 32
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [r15+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11
+	xor	r11, r10
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11
+	lea	r15, QWORD PTR [r11+r13]
+	movdqu xmm15, XMMWORD PTR [r11+r13]
+	lea	r13, QWORD PTR [rsi+rcx]
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movq	r10, xmm0
+	psllq	xmm5, 1
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movq	r11, xmm0
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movq	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10
+	paddq	xmm5, xmm3
+	movq	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax
+	or	edx, -2147483647
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movq	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14
+	cvttsd2si rax, xmm2
+	psrldq	xmm2, 8
+	mov	rbx, rax
+	imul	rax, rdx
+	sub	r11, rax
+	js	div_fix_1_sandybridge
+div_fix_1_ret_sandybridge:
+
+	cvttsd2si rdx, xmm2
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2
+	paddq	xmm3, xmm4
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12
+	sqrtpd	xmm1, xmm0
+	movq	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19
+	test	r9, 524287
+	je	sqrt_fix_1_sandybridge
+sqrt_fix_1_ret_sandybridge:
+
+	movq r9, xmm10
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	test	r8, 524287
+	je	sqrt_fix_2_sandybridge
+sqrt_fix_2_ret_sandybridge:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9
+	movq xmm0, rax
+	movq xmm3, rdx
+	punpcklqdq xmm3, xmm0
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm3, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm0
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm3
+
+	add	rdi, rdx
+	mov	QWORD PTR [r13], rdi
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]	
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]
+	movdqa	xmm3, xmm7
+	mov	rdx, QWORD PTR [rsp+8]
+	movdqa	xmm8, xmm6
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d
+	jne	main_loop_double_sandybridge
+
+	ldmxcsr DWORD PTR [rsp+272]
+	movaps	xmm13, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+184]
+	movaps	xmm6, XMMWORD PTR [r11-24]
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp cnv2_double_mainloop_asm_sandybridge_endp
+
+div_fix_1_sandybridge:
+	dec	rbx
+	add	r11, rdx
+	jmp	div_fix_1_ret_sandybridge
+
+div_fix_2_sandybridge:
+	dec	rdx
+	add	r8, r9
+	jmp	div_fix_2_ret_sandybridge
+
+sqrt_fix_1_sandybridge:
+	movq	r8, xmm3
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8
+	dec	r9
+	mov r11d, -1022
+	shl r11, 32
+	mov	rax, r9
+	shr	r9, 19
+	shr	rax, 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1]
+	add	rax, r11
+	imul	rdx, rax
+	sub	rdx, r8
+	adc	r9, 0
+	movq xmm5, r9
+	punpcklqdq xmm5, xmm0
+	jmp	sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge:
+	psrldq	xmm3, 8
+	movq	r11, xmm3
+	dec	r8
+	mov ebx, -1022
+	shl rbx, 32
+	mov	rax, r8
+	shr	r8, 19
+	shr	rax, 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1]
+	add	rax, rbx
+	imul	rdx, rax
+	sub	rdx, r11
+	adc	r8, 0
+	movq xmm0, r8
+	punpcklqdq xmm5, xmm0
+	jmp	sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
--- a/src/crypto/cn/asm/cn2/cnv2_main_loop_bulldozer.inc
+++ b/src/crypto/cn/asm/cn2/cnv2_main_loop_bulldozer.inc
@@ -0,0 +1,182 @@
+	mov	rcx, [rcx]
+
+	mov	QWORD PTR [rsp+16], rbx
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movq	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rbx, QWORD PTR [rcx+224]
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movq	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movq	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8
+	mov ax, 1023
+	shl rax, 52
+	movq xmm7, rax
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0
+	movq	xmm0, rcx
+	punpcklqdq xmm4, xmm0
+
+	ALIGN(64)
+cnv2_main_loop_bulldozer:
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]
+	movq xmm6, r8
+	pinsrq xmm6, r11, 1
+	lea	rdx, QWORD PTR [r10+rbx]
+	lea	r9, QWORD PTR [rdi+rdi]
+	shl	rdi, 32
+
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0
+	movdqa	XMMWORD PTR [rax+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi
+
+	mov edi, 1023
+	shl rdi, 52
+
+	movq	r14, xmm5
+	pextrq rax, xmm5, 1
+
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136
+	movdqa	XMMWORD PTR [rdx], xmm0
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647
+	xor	edx, edx
+	div	r9
+	mov	eax, eax
+	shl	rdx, 32
+	lea	r15, [rax+rdx]
+	lea	rax, [r14+r15]
+	shr	rax, 12
+	add	rax, rdi
+	movq	xmm0, rax
+	sqrtsd	xmm1, xmm0
+	movq	rdi, xmm1
+	test	rdi, 524287
+	je	sqrt_fixup_bulldozer
+	shr	rdi, 19
+
+sqrt_fixup_bulldozer_ret:
+	mov	rax, rsi
+	mul	r14
+	movq xmm1, rax
+	movq xmm0, rdx
+	punpcklqdq xmm0, xmm1
+
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	xor rdx, [rcx+rbx]
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0
+	paddq xmm4, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136
+	movdqa	xmm3, xmm5
+	dec	ebp
+	jne	cnv2_main_loop_bulldozer
+
+	ldmxcsr DWORD PTR [rsp]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]
+	mov	rbx, QWORD PTR [r11+56]
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_bulldozer_endp
+
+sqrt_fixup_bulldozer:
+	movq r9, xmm5
+	add r9, r15
+	dec	rdi
+	mov edx, -1022
+	shl rdx, 32
+	mov	rax, rdi
+	shr	rdi, 19
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	lea	rcx, [rcx+rdx+1]
+	add	rax, rdx
+	imul	rcx, rax
+	sub	rcx, r9
+	adc	rdi, 0
+	jmp	sqrt_fixup_bulldozer_ret
+
+cnv2_main_loop_bulldozer_endp:
--- a/src/crypto/cn/asm/cn2/cnv2_main_loop_ivybridge.inc
+++ b/src/crypto/cn/asm/cn2/cnv2_main_loop_ivybridge.inc
@@ -0,0 +1,188 @@
+	mov	rcx, [rcx]
+
+	mov	 QWORD PTR [rsp+24], rbx
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 524288
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647
+	xor	 r8, QWORD PTR [rcx]
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movq	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]
+	mov	 rbx, QWORD PTR [rcx+224]
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movq	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movq	 xmm3, QWORD PTR [r9+104]
+	movaps	 XMMWORD PTR [rsp+64], xmm6
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136
+	movq	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+
+	mov ax, 1023
+	shl rax, 52
+	movq xmm8, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movq	 xmm0, rcx
+	punpcklqdq xmm5, xmm0
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
+
+	ALIGN(64)
+main_loop_ivybridge:
+	lea	 rdx, QWORD PTR [r10+rbx]
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16
+	xor	 eax, 32
+	xor	 r10d, 48
+	movq	 xmm0, r11
+	movq	 xmm7, r8
+	punpcklqdq xmm7, xmm0
+	aesenc	 xmm6, xmm7
+	movq	 rbp, xmm6
+	mov	 r9, rbp
+	and	 r9d, 2097136
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm1, xmm7
+	paddq	 xmm0, xmm5
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov r10, r9
+	xor r10d, 32
+	movq	 rcx, xmm3
+	mov	 rax, rcx
+	shl	 rax, 32
+	xor	 rdi, rax
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	movdqu	 XMMWORD PTR [rdx], xmm0
+	xor	 rdi, QWORD PTR [r9+rbx]
+	lea	 r14, QWORD PTR [r9+rbx]
+	mov	 r12, QWORD PTR [r14+8]
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d
+	movq	 rax, xmm0
+	div	 r9
+	xorps xmm3, xmm3
+	mov	 eax, eax
+	shl	 rdx, 32
+	add	 rdx, rax
+	lea	 r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov	 rax, r9
+	shr	 rax, 12
+	movq	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+	movq	 rdx, xmm3
+	test	 edx, 524287
+	je	 sqrt_fixup_ivybridge
+	psrlq	 xmm3, 19
+sqrt_fixup_ivybridge_ret:
+
+	mov	 ecx, r10d
+	mov	 rax, rdi
+	mul	 rbp
+	movq xmm2, rdx
+	xor rdx, [rcx+rbx]
+	add	 r8, rdx
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov edi, r8d
+	and edi, 2097136
+	movq xmm0, rax
+	xor rax, [rcx+rbx+8]
+	add	 r11, rax
+	mov	 QWORD PTR [r14+8], r11
+	punpcklqdq xmm2, xmm0
+
+	mov	 r9d, r10d
+	xor	 r9d, 48
+	xor	 r10d, 16
+	pxor	 xmm2, XMMWORD PTR [r9+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm0, xmm5
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm4
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4
+	movdqu	 XMMWORD PTR [r9+rbx], xmm0
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	movdqu xmm6, [rdi+rbx]
+	mov	 r10d, edi
+	xor	 r11, r12
+	dec rsi
+	jne	 main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp]
+	mov	 rbx, QWORD PTR [rsp+160]
+	movaps	 xmm6, XMMWORD PTR [rsp+64]
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80
+	pop	 r15
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_main_loop_ivybridge_endp
+
+sqrt_fixup_ivybridge:
+	dec	 rdx
+	mov r13d, -1022
+	shl r13, 32
+	mov	 rax, rdx
+	shr	 rdx, 19
+	shr	 rax, 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	not r13
+	sub	 rcx, r13
+	mov	 r13d, -2147483647
+	imul	 rcx, rax
+	sub	 rcx, r9
+	adc	 rdx, 0
+	movq	 xmm3, rdx
+	jmp	 sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:
--- a/src/crypto/cn/asm/cn2/cnv2_main_loop_ryzen.inc
+++ b/src/crypto/cn/asm/cn2/cnv2_main_loop_ryzen.inc
@@ -0,0 +1,181 @@
+	mov	rcx, [rcx]
+
+	mov	QWORD PTR [rsp+16], rbx
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movq	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rbx, QWORD PTR [rcx+224]
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movq	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movq	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8
+	mov ax, 1023
+	shl rax, 52
+	movq xmm7, rax
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0
+	movq	xmm0, rcx
+	punpcklqdq xmm4, xmm0
+
+	ALIGN(64)
+main_loop_ryzen:
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]
+	movq	xmm0, r11
+	movq	xmm6, r8
+	punpcklqdq xmm6, xmm0
+	lea	rdx, QWORD PTR [r10+rbx]
+	lea	r9, QWORD PTR [rdi+rdi]
+	shl	rdi, 32
+
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0
+	movdqa	XMMWORD PTR [rax+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi
+	movq	r14, xmm5
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136
+	movdqa	XMMWORD PTR [rdx], xmm0
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647
+	xor	edx, edx
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movq	rax, xmm0
+
+	div	r9
+	movq xmm0, rax
+	movq xmm1, rdx
+	punpckldq xmm0, xmm1
+	movq r15, xmm0
+	paddq xmm0, xmm5
+	movdqa xmm2, xmm0
+	psrlq xmm0, 12
+	paddq	xmm0, xmm7
+	sqrtsd	xmm1, xmm0
+	movq	rdi, xmm1
+	test	rdi, 524287
+	je	sqrt_fixup_ryzen
+	shr	rdi, 19
+
+sqrt_fixup_ryzen_ret:
+	mov	rax, rsi
+	mul	r14
+	movq xmm1, rax
+	movq xmm0, rdx
+	punpcklqdq xmm0, xmm1
+
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	xor rdx, [rcx+rbx]
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0
+	paddq xmm4, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136
+	movdqa	xmm3, xmm5
+	dec	ebp
+	jne	main_loop_ryzen
+
+	ldmxcsr DWORD PTR [rsp]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]
+	mov	rbx, QWORD PTR [r11+56]
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_ryzen_endp
+
+sqrt_fixup_ryzen:
+	movq r9, xmm2
+	dec	rdi
+	mov edx, -1022
+	shl rdx, 32
+	mov	rax, rdi
+	shr	rdi, 19
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	lea	rcx, [rcx+rdx+1]
+	add	rax, rdx
+	imul	rcx, rax
+	sub	rcx, r9
+	adc	rdi, 0
+	jmp	sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp:
--- a/src/crypto/cn/asm/cn2/cnv2_rwz_double_main_loop.inc
+++ b/src/crypto/cn/asm/cn2/cnv2_rwz_double_main_loop.inc
@@ -0,0 +1,413 @@
+	mov	rdx, [rcx+8]
+	mov	rcx, [rcx]
+
+	mov	rax, rsp
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184
+
+	stmxcsr DWORD PTR [rsp+272]
+	mov DWORD PTR [rsp+276], 24448
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]
+	mov	r9, rdx
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx
+	xor	r10, QWORD PTR [rcx]
+	mov	r14d, 393216
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rsi, QWORD PTR [rdx+224]
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]
+	movq	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movq	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movq	xmm5, QWORD PTR [r8+104]
+	movq	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movq xmm14, rax
+	punpcklqdq xmm14, xmm14
+
+	mov eax, 1023
+	shl rax, 52
+	movq xmm12, rax
+	punpcklqdq xmm12, xmm12
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movq	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp], r13
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movq	xmm8, rax
+	and	ecx, 2097136
+	punpcklqdq xmm8, xmm0
+	movq	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movq	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]
+	movdqu	xmm11, XMMWORD PTR [r8]
+	punpcklqdq xmm5, xmm0
+	lea	r9, QWORD PTR [rdx+r13]
+	movdqu	xmm15, XMMWORD PTR [r9]
+
+	ALIGN(64)
+rwz_main_loop_double:
+	movdqu	xmm9, xmm15
+	mov eax, edx
+	mov ebx, edx
+	xor eax, 16
+	xor ebx, 32
+	xor edx, 48
+
+	movq	xmm0, r11
+	movq	xmm2, r10
+	punpcklqdq xmm2, xmm0
+	aesenc	xmm9, xmm2
+
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0
+
+	movq	r11, xmm9
+	mov	edx, r11d
+	and	edx, 2097136
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0
+
+	lea	rbx, QWORD PTR [rdx+r13]
+	mov	r10, QWORD PTR [rdx+r13]
+
+	movdqu	xmm10, xmm11
+	movq	xmm0, rbp
+	movq	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movq	rcx, xmm10
+	and	ecx, 2097136
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov r12, QWORD PTR [rcx+rsi]
+
+	mov	r9, QWORD PTR [rbx+8]
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx
+
+	movq	rdx, xmm5
+	shl	rdx, 32
+	movq	rax, xmm4
+	xor	rdx, rax
+	xor	r10, rdx
+	mov	rax, r10
+	mul	r11
+	mov r11d, r8d
+	xor r11d, 48
+	movq xmm0, rdx
+	xor rdx, [r11+r13]
+	movq xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1
+
+	pxor xmm0, XMMWORD PTR [r8+r13]
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm3
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r8+r13], xmm0
+	xor	r8d, 32
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm7
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11
+	xor	r11, r10
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11
+	lea	r15, QWORD PTR [r11+r13]
+	movdqu xmm15, XMMWORD PTR [r11+r13]
+	lea	r13, QWORD PTR [rsi+rcx]
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movq	r10, xmm0
+	psllq	xmm5, 1
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movq	r11, xmm0
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movq	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10
+	paddq	xmm5, xmm3
+	movq	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax
+	or	edx, -2147483647
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movq	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14
+	cvttsd2si rax, xmm2
+	psrldq	xmm2, 8
+	mov	rbx, rax
+	imul	rax, rdx
+	sub	r11, rax
+	js	rwz_div_fix_1
+rwz_div_fix_1_ret:
+
+	cvttsd2si rdx, xmm2
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	rwz_div_fix_2
+rwz_div_fix_2_ret:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2
+	paddq	xmm3, xmm4
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12
+	sqrtpd	xmm1, xmm0
+	movq	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19
+	test	r9, 524287
+	je	rwz_sqrt_fix_1
+rwz_sqrt_fix_1_ret:
+
+	movq r9, xmm10
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	test	r8, 524287
+	je	rwz_sqrt_fix_2
+rwz_sqrt_fix_2_ret:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9
+	movq xmm0, rax
+	movq xmm3, rdx
+	punpcklqdq xmm3, xmm0
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm3, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm3
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+
+	add	rdi, rdx
+	mov	QWORD PTR [r13], rdi
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]	
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]
+	movdqa	xmm3, xmm7
+	mov	rdx, QWORD PTR [rsp+8]
+	movdqa	xmm8, xmm6
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d
+	jne	rwz_main_loop_double
+
+	ldmxcsr DWORD PTR [rsp+272]
+	movaps	xmm13, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+184]
+	movaps	xmm6, XMMWORD PTR [r11-24]
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp rwz_cnv2_double_mainloop_asm_endp
+
+rwz_div_fix_1:
+	dec	rbx
+	add	r11, rdx
+	jmp	rwz_div_fix_1_ret
+
+rwz_div_fix_2:
+	dec	rdx
+	add	r8, r9
+	jmp	rwz_div_fix_2_ret
+
+rwz_sqrt_fix_1:
+	movq	r8, xmm3
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8
+	dec	r9
+	mov r11d, -1022
+	shl r11, 32
+	mov	rax, r9
+	shr	r9, 19
+	shr	rax, 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1]
+	add	rax, r11
+	imul	rdx, rax
+	sub	rdx, r8
+	adc	r9, 0
+	movq xmm5, r9
+	punpcklqdq xmm5, xmm0
+	jmp	rwz_sqrt_fix_1_ret
+
+rwz_sqrt_fix_2:
+	psrldq	xmm3, 8
+	movq	r11, xmm3
+	dec	r8
+	mov ebx, -1022
+	shl rbx, 32
+	mov	rax, r8
+	shr	r8, 19
+	shr	rax, 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1]
+	add	rax, rbx
+	imul	rdx, rax
+	sub	rdx, r11
+	adc	r8, 0
+	movq xmm0, r8
+	punpcklqdq xmm5, xmm0
+	jmp	rwz_sqrt_fix_2_ret
+
+rwz_cnv2_double_mainloop_asm_endp:
--- a/src/crypto/cn/asm/cn2/cnv2_rwz_main_loop.inc
+++ b/src/crypto/cn/asm/cn2/cnv2_rwz_main_loop.inc
@@ -0,0 +1,188 @@
+	mov	rcx, [rcx]
+
+	mov	 QWORD PTR [rsp+24], rbx
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 393216
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647
+	xor	 r8, QWORD PTR [rcx]
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movq	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]
+	mov	 rbx, QWORD PTR [rcx+224]
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movq	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movq	 xmm3, QWORD PTR [r9+104]
+	movaps	 XMMWORD PTR [rsp+64], xmm6
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136
+	movq	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+
+	mov ax, 1023
+	shl rax, 52
+	movq xmm8, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movq	 xmm0, rcx
+	punpcklqdq xmm5, xmm0
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
+
+	ALIGN(64)
+rwz_main_loop:
+	lea	 rdx, QWORD PTR [r10+rbx]
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16
+	xor	 eax, 32
+	xor	 r10d, 48
+	movq	 xmm0, r11
+	movq	 xmm7, r8
+	punpcklqdq xmm7, xmm0
+	aesenc	 xmm6, xmm7
+	movq	 rbp, xmm6
+	mov	 r9, rbp
+	and	 r9d, 2097136
+	movdqu	 xmm0, XMMWORD PTR [rcx+rbx]
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm2, XMMWORD PTR [r10+rbx]
+	paddq	 xmm0, xmm5
+	paddq	 xmm1, xmm7
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov r10, r9
+	xor r10d, 32
+	movq	 rcx, xmm3
+	mov	 rax, rcx
+	shl	 rax, 32
+	xor	 rdi, rax
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	movdqu	 XMMWORD PTR [rdx], xmm0
+	xor	 rdi, QWORD PTR [r9+rbx]
+	lea	 r14, QWORD PTR [r9+rbx]
+	mov	 r12, QWORD PTR [r14+8]
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d
+	movq	 rax, xmm0
+	div	 r9
+	xorps xmm3, xmm3
+	mov	 eax, eax
+	shl	 rdx, 32
+	add	 rdx, rax
+	lea	 r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov	 rax, r9
+	shr	 rax, 12
+	movq	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+	movq	 rdx, xmm3
+	test	 edx, 524287
+	je	 rwz_sqrt_fixup
+	psrlq	 xmm3, 19
+rwz_sqrt_fixup_ret:
+
+	mov	 ecx, r10d
+	mov	 rax, rdi
+	mul	 rbp
+	movq xmm2, rdx
+	xor rdx, [rcx+rbx]
+	add	 r8, rdx
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov edi, r8d
+	and edi, 2097136
+	movq xmm0, rax
+	xor rax, [rcx+rbx+8]
+	add	 r11, rax
+	mov	 QWORD PTR [r14+8], r11
+	punpcklqdq xmm2, xmm0
+
+	mov	 r9d, r10d
+	xor	 r9d, 48
+	xor	 r10d, 16
+	pxor	 xmm2, XMMWORD PTR [r9+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm0, xmm4
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm5
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4
+	movdqu	 XMMWORD PTR [r9+rbx], xmm2
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	movdqu xmm6, [rdi+rbx]
+	mov	 r10d, edi
+	xor	 r11, r12
+	dec rsi
+	jne	 rwz_main_loop
+
+	ldmxcsr DWORD PTR [rsp]
+	mov	 rbx, QWORD PTR [rsp+160]
+	movaps	 xmm6, XMMWORD PTR [rsp+64]
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80
+	pop	 r15
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_rwz_main_loop_endp
+
+rwz_sqrt_fixup:
+	dec	 rdx
+	mov r13d, -1022
+	shl r13, 32
+	mov	 rax, rdx
+	shr	 rdx, 19
+	shr	 rax, 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	not r13
+	sub	 rcx, r13
+	mov	 r13d, -2147483647
+	imul	 rcx, rax
+	sub	 rcx, r9
+	adc	 rdx, 0
+	movq	 xmm3, rdx
+	jmp	 rwz_sqrt_fixup_ret
+
+cnv2_rwz_main_loop_endp:
--- a/src/crypto/cn/asm/cn_main_loop.S
+++ b/src/crypto/cn/asm/cn_main_loop.S
@@ -0,0 +1,73 @@
+/*
+ * ALIGN(x): alignment directive placed before every entry point in this file.
+ * The macro argument is ignored: `.align 6` is emitted under __APPLE__ and
+ * `.align 64` otherwise. NOTE(review): presumably the Apple assembler reads
+ * the operand as a power-of-two exponent (2^6 = 64) -- confirm against the
+ * toolchain documentation.
+ */
+#ifdef __APPLE__
+#   define ALIGN(x) .align 6
+#else
+#   define ALIGN(x) .align 64
+#endif
+/* Everything below uses Intel operand order and unprefixed register names. */
+.intel_syntax noprefix
+/*
+ * FN_PREFIX(fn): spelling of an exported symbol. Under __APPLE__ an underscore
+ * is pasted in front of the name; elsewhere the name is used unchanged.
+ */
+#ifdef __APPLE__
+#   define FN_PREFIX(fn) _ ## fn
+.text
+#else
+#   define FN_PREFIX(fn) fn
+.section .text
+#endif
+/* Entry points exported from this file. */
+.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
+.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
+.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
+.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
+.global FN_PREFIX(cnv2_rwz_mainloop_asm)
+.global FN_PREFIX(cnv2_rwz_double_mainloop_asm)
+
+/*
+ * cnv2_mainloop_ivybridge_asm: thin wrapper around the loop body kept in
+ * cn2/cnv2_main_loop_ivybridge.inc.
+ */
+ALIGN(64)
+FN_PREFIX(cnv2_mainloop_ivybridge_asm):
+	/* Reserve 48 bytes of stack for the duration of the included body. */
+	sub rsp, 48
+	/* Copy rdi into rcx. NOTE(review): presumably the included body expects its argument in rcx -- confirm. */
+	mov rcx, rdi
+	#include "cn2/cnv2_main_loop_ivybridge.inc"
+	add rsp, 48
+	ret 0
+	/* Never executed (it follows ret). 3735929054 == 0xDEADC0DE; presumably an end-of-code marker for tooling that scans this function -- confirm. */
+	mov eax, 3735929054
+
+/*
+ * cnv2_mainloop_ryzen_asm: thin wrapper around the loop body kept in
+ * cn2/cnv2_main_loop_ryzen.inc.
+ */
+ALIGN(64)
+FN_PREFIX(cnv2_mainloop_ryzen_asm):
+	/* Reserve 48 bytes of stack for the duration of the included body. */
+	sub rsp, 48
+	/* Copy rdi into rcx. NOTE(review): presumably the included body expects its argument in rcx -- confirm. */
+	mov rcx, rdi
+	#include "cn2/cnv2_main_loop_ryzen.inc"
+	add rsp, 48
+	ret 0
+	/* Never executed (it follows ret). 3735929054 == 0xDEADC0DE; presumably an end-of-code marker for tooling that scans this function -- confirm. */
+	mov eax, 3735929054
+
+/*
+ * cnv2_mainloop_bulldozer_asm: thin wrapper around the loop body kept in
+ * cn2/cnv2_main_loop_bulldozer.inc.
+ */
+ALIGN(64)
+FN_PREFIX(cnv2_mainloop_bulldozer_asm):
+	/* Reserve 48 bytes of stack for the duration of the included body. */
+	sub rsp, 48
+	/* Copy rdi into rcx. NOTE(review): presumably the included body expects its argument in rcx -- confirm. */
+	mov rcx, rdi
+	#include "cn2/cnv2_main_loop_bulldozer.inc"
+	add rsp, 48
+	ret 0
+	/* Never executed (it follows ret). 3735929054 == 0xDEADC0DE; presumably an end-of-code marker for tooling that scans this function -- confirm. */
+	mov eax, 3735929054
+
+/*
+ * cnv2_double_mainloop_sandybridge_asm: thin wrapper around the loop body kept
+ * in cn2/cnv2_double_main_loop_sandybridge.inc.
+ */
+ALIGN(64)
+FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
+	/* Reserve 48 bytes of stack for the duration of the included body. */
+	sub rsp, 48
+	/* Copy rdi into rcx. NOTE(review): presumably the included body expects its argument in rcx -- confirm. */
+	mov rcx, rdi
+	#include "cn2/cnv2_double_main_loop_sandybridge.inc"
+	add rsp, 48
+	ret 0
+	/* Never executed (it follows ret). 3735929054 == 0xDEADC0DE; presumably an end-of-code marker for tooling that scans this function -- confirm. */
+	mov eax, 3735929054
+
+/*
+ * cnv2_rwz_mainloop_asm: thin wrapper around the loop body kept in
+ * cn2/cnv2_rwz_main_loop.inc.
+ */
+ALIGN(64)
+FN_PREFIX(cnv2_rwz_mainloop_asm):
+	/* Reserve 48 bytes of stack for the duration of the included body. */
+	sub rsp, 48
+	/* Copy rdi into rcx. NOTE(review): presumably the included body expects its argument in rcx -- confirm. */
+	mov rcx, rdi
+	#include "cn2/cnv2_rwz_main_loop.inc"
+	add rsp, 48
+	ret 0
+	/* Never executed (it follows ret). 3735929054 == 0xDEADC0DE; presumably an end-of-code marker for tooling that scans this function -- confirm. */
+	mov eax, 3735929054
+
+/*
+ * cnv2_rwz_double_mainloop_asm: thin wrapper around the loop body kept in
+ * cn2/cnv2_rwz_double_main_loop.inc.
+ */
+ALIGN(64)
+FN_PREFIX(cnv2_rwz_double_mainloop_asm):
+	/* Reserve 48 bytes of stack for the duration of the included body. */
+	sub rsp, 48
+	/* Copy rdi into rcx. NOTE(review): presumably the included body expects its argument in rcx -- confirm. */
+	mov rcx, rdi
+	#include "cn2/cnv2_rwz_double_main_loop.inc"
+	add rsp, 48
+	ret 0
+	/* Never executed (it follows ret). 3735929054 == 0xDEADC0DE; presumably an end-of-code marker for tooling that scans this function -- confirm. */
+	mov eax, 3735929054
--- a/src/crypto/cn/asm/cn_main_loop.asm
+++ b/src/crypto/cn/asm/cn_main_loop.asm
@@ -0,0 +1,52 @@
+; MASM segment holding the cnv2_* main-loop entry points.
+; The segment is declared with PAGE alignment and READ/EXECUTE attributes.
+_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
+PUBLIC cnv2_mainloop_ivybridge_asm
+PUBLIC cnv2_mainloop_ryzen_asm
+PUBLIC cnv2_mainloop_bulldozer_asm
+PUBLIC cnv2_double_mainloop_sandybridge_asm
+PUBLIC cnv2_rwz_mainloop_asm
+PUBLIC cnv2_rwz_double_mainloop_asm
+
+; Every procedure below has the same shape: the body comes from an include
+; file and is followed by `ret 0` and a `mov eax, 3735929054` that is never
+; executed. 3735929054 == 0xDEADC0DE.
+; NOTE(review): presumably an end-of-code marker that external tooling scans
+; for -- confirm against the callers.
+
+; Body: cn2/cnv2_main_loop_ivybridge.inc
+ALIGN(64)
+cnv2_mainloop_ivybridge_asm PROC
+	INCLUDE cn2/cnv2_main_loop_ivybridge.inc
+	ret 0
+	mov eax, 3735929054
+cnv2_mainloop_ivybridge_asm ENDP
+
+; Body: cn2/cnv2_main_loop_ryzen.inc
+ALIGN(64)
+cnv2_mainloop_ryzen_asm PROC
+	INCLUDE cn2/cnv2_main_loop_ryzen.inc
+	ret 0
+	mov eax, 3735929054
+cnv2_mainloop_ryzen_asm ENDP
+
+; Body: cn2/cnv2_main_loop_bulldozer.inc
+ALIGN(64)
+cnv2_mainloop_bulldozer_asm PROC
+	INCLUDE cn2/cnv2_main_loop_bulldozer.inc
+	ret 0
+	mov eax, 3735929054
+cnv2_mainloop_bulldozer_asm ENDP
+
+; Body: cn2/cnv2_double_main_loop_sandybridge.inc
+ALIGN(64)
+cnv2_double_mainloop_sandybridge_asm PROC
+	INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
+	ret 0
+	mov eax, 3735929054
+cnv2_double_mainloop_sandybridge_asm ENDP
+
+; Body: cn2/cnv2_rwz_main_loop.inc
+ALIGN(64)
+cnv2_rwz_mainloop_asm PROC
+	INCLUDE cn2/cnv2_rwz_main_loop.inc
+	ret 0
+	mov eax, 3735929054
+cnv2_rwz_mainloop_asm ENDP
+
+; Body: cn2/cnv2_rwz_double_main_loop.inc
+ALIGN(64)
+cnv2_rwz_double_mainloop_asm PROC
+	INCLUDE cn2/cnv2_rwz_double_main_loop.inc
+	ret 0
+	mov eax, 3735929054
+cnv2_rwz_double_mainloop_asm ENDP
+
+_TEXT_CNV2_MAINLOOP ENDS
+END
--- a/src/crypto/cn/asm/win64/CryptonightR_soft_aes_template_win.inc
+++ b/src/crypto/cn/asm/win64/CryptonightR_soft_aes_template_win.inc
@@ -0,0 +1,281 @@
+; Labels that delimit the sections of the CryptonightR soft-AES code template.
+PUBLIC CryptonightR_soft_aes_template_part1
+PUBLIC CryptonightR_soft_aes_template_mainloop
+PUBLIC CryptonightR_soft_aes_template_part2
+PUBLIC CryptonightR_soft_aes_template_part3
+PUBLIC CryptonightR_soft_aes_template_end
+
+; part1: prologue. Saves the registers the routine clobbers, copies four
+; dwords of state onto the stack and derives the initial loop values from the
+; structure reached through rcx (called "ctx" in the comments below).
+ALIGN(64)
+CryptonightR_soft_aes_template_part1:
+	; rcx points at a pointer; replace it with the pointer itself (ctx).
+	mov	rcx, [rcx]
+
+	; Keep ctx in the slot just above the return address. After the 8 pushes
+	; and the 232-byte frame below, that slot is addressed as [rsp+304].
+	mov	QWORD PTR [rsp+8], rcx
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	; Frame use visible in this template: [rsp+16..143] xmm6-xmm13 save area,
+	; [rsp+144..159] four state dwords, [rsp+160..191] spill slots.
+	sub	rsp, 232
+
+	; Copy the four dwords at ctx+96..ctx+111 to [rsp+144..159].
+	mov	eax, [rcx+96]
+	mov	ebx, [rcx+100]
+	mov	esi, [rcx+104]
+	mov	edx, [rcx+108]
+	mov [rsp+144], eax
+	mov [rsp+148], ebx
+	mov [rsp+152], esi
+	mov [rsp+156], edx
+
+	; Initial values (q[N] = qword at ctx+N):
+	;   r8   = q[32] ^ q[0]        r9 = q[40] ^ q[8]
+	;   xmm4 = { q[48] ^ q[16], q[56] ^ q[24] }
+	;   xmm5 = { q[80] ^ q[64], q[88] ^ q[72] }
+	;   r11  = q[224], used as the base address of every indexed access below
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r10, rcx
+	xor	rax, QWORD PTR [rcx+16]
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r9, QWORD PTR [rcx+40]
+	xor	r9, QWORD PTR [rcx+8]
+	movd	xmm4, rax
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	r11, QWORD PTR [rcx+224]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r10+72]
+	mov	rax, QWORD PTR [r10+80]
+	movd	xmm0, rdx
+	xor	rax, QWORD PTR [r10+64]
+
+	; Save xmm6-xmm13 (non-volatile in the Win64 calling convention).
+	movaps	XMMWORD PTR [rsp+16], xmm6
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+48], xmm8
+	movaps	XMMWORD PTR [rsp+64], xmm9
+	movaps	XMMWORD PTR [rsp+80], xmm10
+	movaps	XMMWORD PTR [rsp+96], xmm11
+	movaps	XMMWORD PTR [rsp+112], xmm12
+	movaps	XMMWORD PTR [rsp+128], xmm13
+
+	movd	xmm5, rax
+
+	mov	rax, r8
+	punpcklqdq xmm4, xmm0
+	; 2097136 == 0x1FFFF0: keep bits 4..20, i.e. a 16-byte-aligned offset below 2 MiB.
+	and	eax, 2097136
+	movd	xmm10, QWORD PTR [r10+96]
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [r10+104]
+	xorps	xmm9, xmm9
+	; [rsp+328] and [rsp+320] lie above the return address, i.e. in stack space
+	; owned by the caller. NOTE(review): presumably the Win64 home area -- confirm.
+	; They hold the current offset and the running r9 value between iterations.
+	mov	QWORD PTR [rsp+328], rax
+	; xmm12 keeps a copy of the base address so r11 can be used as scratch.
+	movd	xmm12, r11
+	mov	QWORD PTR [rsp+320], r9
+	punpcklqdq xmm5, xmm0
+	movd xmm13, rcx
+	; Loop counter: 524288 (0x80000) iterations.
+	mov r12d, 524288
+
+	; mainloop: first half of one iteration.
+	; On entry: rax = current offset, r11 = base address, r8/r9 = running
+	; 64-bit values, r10 = ctx, r12d = remaining iterations.
+	ALIGN(64)
+CryptonightR_soft_aes_template_mainloop:
+	; Park the loop counter in xmm11; r12 is reused as the lookup-table pointer.
+	movd xmm11, r12d
+	; r12 = pointer stored at ctx+272. Four consecutive 256-entry dword tables
+	; are read through it (offsets 0, 1024, 2048 and 3072).
+	; NOTE(review): presumably the software AES tables, going by the file name.
+	mov	r12, QWORD PTR [r10+272]
+	; r13 = address of the 16-byte chunk processed in this iteration.
+	lea	r13, QWORD PTR [rax+r11]
+	mov	esi, DWORD PTR [r13]
+	movd	xmm0, r9
+	mov	r10d, DWORD PTR [r13+4]
+	movd	xmm7, r8
+	mov	ebp, DWORD PTR [r13+12]
+	mov	r14d, DWORD PTR [r13+8]
+	mov	rdx, QWORD PTR [rsp+328]
+	; Table lookups: each byte of the four dwords just loaded selects one
+	; table entry; the entries are XOR-combined into r15d, edi, ebx, r9d and
+	; then into eax, r9d, r10d, r11d.
+	movzx	ecx, sil
+	shr	esi, 8
+	; xmm7 = { r8, r9 }
+	punpcklqdq xmm7, xmm0
+	mov	r15d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	mov	edi, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	ebx, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	shr	ebp, 8
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	xor	r15d, DWORD PTR [r12+rcx*4+1024]
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	eax, r14d
+	shr	eax, 8
+	xor	edi, DWORD PTR [r12+rcx*4+1024]
+	; Adding 256 to an index selects the table 1024 bytes further on.
+	add	eax, 256
+	movzx	ecx, bpl
+	shr	ebp, 8
+	xor	ebx, DWORD PTR [r12+rcx*4+1024]
+	movzx	ecx, sil
+	shr	esi, 8
+	xor	r9d, DWORD PTR [r12+rcx*4+1024]
+	; Advance to the third table.
+	add	r12, 2048
+	movzx	ecx, r10b
+	shr	r10d, 8
+	add	r10d, 256
+	mov	r11d, DWORD PTR [r12+rax*4]
+	xor	r11d, DWORD PTR [r12+rcx*4]
+	xor	r11d, r9d
+	movzx	ecx, sil
+	mov	r10d, DWORD PTR [r12+r10*4]
+	shr	esi, 8
+	add	esi, 256
+	xor	r10d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	xor	r10d, ebx
+	shr	ebp, 8
+	movd	xmm1, r11d
+	add	ebp, 256
+	; r11 was used as scratch above; restore the base address from xmm12.
+	movd	r11, xmm12
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	xor	r9d, DWORD PTR [r12+rsi*4]
+	mov	eax, DWORD PTR [r12+rbp*4]
+	xor	r9d, edi
+	movzx	ecx, r14b
+	movd	xmm0, r10d
+	movd	xmm2, r9d
+	xor	eax, DWORD PTR [r12+rcx*4]
+	; rcx, rax, rdx = offset ^ 16, offset ^ 32, offset ^ 48: the three other
+	; 16-byte chunks of the same 64-byte group.
+	mov	rcx, rdx
+	xor	eax, r15d
+	punpckldq xmm2, xmm1
+	xor	rcx, 16
+	movd	xmm6, eax
+	mov	rax, rdx
+	punpckldq xmm6, xmm0
+	xor	rax, 32
+	; xmm6 = the four combined dwords { eax, r9d, r10d, r11d }.
+	punpckldq xmm6, xmm2
+	xor	rdx, 48
+	; xmm6 is XORed with xmm7 and with all three neighbouring chunks; each
+	; chunk is then written back to a different neighbour position after a
+	; 64-bit lane addition (paddq) with xmm4, xmm5 or xmm7.
+	movdqu	xmm2, XMMWORD PTR [rcx+r11]
+	pxor xmm6, xmm2
+	pxor	xmm6, xmm7
+	paddq	xmm2, xmm4
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	movdqu	xmm0, XMMWORD PTR [rdx+r11]
+	pxor xmm6, xmm1
+	pxor xmm6, xmm0
+	paddq	xmm0, xmm5
+	movdqu	XMMWORD PTR [rcx+r11], xmm0
+	movdqu	XMMWORD PTR [rax+r11], xmm2
+	movd rcx, xmm13
+	paddq	xmm1, xmm7
+	movdqu	XMMWORD PTR [rdx+r11], xmm1
+	; rdi = low qword of xmm6; r10 = next offset (same 0x1FFFF0 mask).
+	movd	rdi, xmm6
+	mov	r10, rdi
+	and	r10d, 2097136
+	; Write xmm6 ^ xmm4 back to the chunk that was read at the top.
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm4
+	movdqu	XMMWORD PTR [r13], xmm0
+
+	; rbx = (d0 + d1) | ((d2 + d3) << 32) for the four state dwords at
+	; [rsp+144..159], XORed with the qword at the next offset.
+	mov ebx, [rsp+144]
+	mov ebp, [rsp+152]
+	add ebx, [rsp+148]
+	add ebp, [rsp+156]
+	shl rbp, 32
+	or rbx, rbp
+
+	xor rbx, QWORD PTR [r10+r11]
+	lea	r14, QWORD PTR [r10+r11]
+	mov	rbp, QWORD PTR [r14+8]
+
+	; Spill the values part2 needs, then keep the real stack pointer in r10:
+	; rsp is about to be overwritten with data.
+	mov [rsp+160], rbx
+	mov [rsp+168], rdi
+	mov [rsp+176], rbp
+	mov [rsp+184], r10
+	mov r10, rsp
+
+	; Load the four state dwords into ebx, esi, edi, ebp.
+	mov ebx, [rsp+144]
+	mov esi, [rsp+148]
+	mov edi, [rsp+152]
+	mov ebp, [rsp+156]
+
+	; Load esp, r15d, eax, edx, r9d with dwords taken from xmm7, xmm4 and xmm5.
+	; part2 overwrites rsp, rax, rdx and r9 before reading them, so these
+	; values are only visible to code located between here and part2.
+	; NOTE(review): presumably generated code is spliced in at that point --
+	; confirm. Nothing may touch the stack while rsp holds data.
+	movd esp, xmm7
+	movaps xmm0, xmm7
+	psrldq xmm0, 8
+	movd r15d, xmm0
+	movd eax, xmm4
+	movd edx, xmm5
+	movaps xmm0, xmm5
+	psrldq xmm0, 8
+	movd r9d, xmm0
+
+; part2: second half of one iteration. Expects r10 = real stack pointer and
+; ebx, esi, edi, ebp = the four state dwords.
+CryptonightR_soft_aes_template_part2:
+	; Restore the stack pointer, then store the state dwords back.
+	mov rsp, r10
+	mov [rsp+144], ebx
+	mov [rsp+148], esi
+	mov [rsp+152], edi
+	mov [rsp+156], ebp
+
+	; r8 ^= (rbp << 32) | edi
+	mov edi, edi
+	shl rbp, 32
+	or rbp, rdi
+	xor r8, rbp
+
+	; q[rsp+320] ^= (rsi << 32) | ebx
+	mov ebx, ebx
+	shl rsi, 32
+	or rsi, rbx
+	xor QWORD PTR [rsp+320], rsi
+
+	; Reload the values spilled at the end of the mainloop section.
+	mov rbx, [rsp+160]
+	mov rdi, [rsp+168]
+	mov rbp, [rsp+176]
+	mov r10, [rsp+184]
+
+	; r9, rcx, r10 = offset ^ 16, offset ^ 32, offset ^ 48.
+	mov	r9, r10
+	xor	r9, 16
+	mov	rcx, r10
+	xor	rcx, 32
+	xor	r10, 48
+	; Unsigned 64x64 -> 128-bit product of rbx and rdi in rdx:rax.
+	mov	rax, rbx
+	mul	rdi
+	; XOR the three neighbouring chunks into xmm6 and write each back to a
+	; different neighbour position after a paddq with xmm4, xmm5 or xmm7.
+	movdqu	xmm2, XMMWORD PTR [r9+r11]
+	movdqu	xmm1, XMMWORD PTR [rcx+r11]
+	pxor xmm6, xmm2
+	pxor xmm6, xmm1
+	paddq	xmm1, xmm7
+	add	r8, rdx
+	movdqu	xmm0, XMMWORD PTR [r10+r11]
+	pxor xmm6, xmm0
+	paddq	xmm0, xmm5
+	paddq	xmm2, xmm4
+	movdqu	XMMWORD PTR [r9+r11], xmm0
+	; Rotate the carried vectors for the next iteration: xmm5 <- xmm4 <- xmm6.
+	movdqa	xmm5, xmm4
+	mov	r9, QWORD PTR [rsp+320]
+	movdqa	xmm4, xmm6
+	add	r9, rax
+	movdqu	XMMWORD PTR [rcx+r11], xmm2
+	movdqu	XMMWORD PTR [r10+r11], xmm1
+	; [rsp+304] is the slot part1 wrote as [rsp+8] on entry
+	; (8 pushes + 232-byte frame + return address = 304): r10 = ctx again.
+	mov	r10, QWORD PTR [rsp+304]
+	; Bring the loop counter back from xmm11.
+	movd r12d, xmm11
+	; Store { r8, r9 } at r14 (the address computed in the mainloop section),
+	; then fold the previous values in to form the next r8/r9 and next offset.
+	mov	QWORD PTR [r14], r8
+	xor	r8, rbx
+	mov	rax, r8
+	mov	QWORD PTR [r14+8], r9
+	; 2097136 == 0x1FFFF0
+	and	eax, 2097136
+	xor	r9, rbp
+	mov	QWORD PTR [rsp+320], r9
+	mov	QWORD PTR [rsp+328], rax
+	sub	r12d, 1
+	jne	CryptonightR_soft_aes_template_mainloop
+
+; part3: epilogue. Restores xmm6-xmm13 from [rsp+16..143], releases the
+; 232-byte frame, pops the general-purpose registers in reverse push order
+; and returns.
+CryptonightR_soft_aes_template_part3:
+	movaps	xmm6, XMMWORD PTR [rsp+16]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+48]
+	movaps	xmm9, XMMWORD PTR [rsp+64]
+	movaps	xmm10, XMMWORD PTR [rsp+80]
+	movaps	xmm11, XMMWORD PTR [rsp+96]
+	movaps	xmm12, XMMWORD PTR [rsp+112]
+	movaps	xmm13, XMMWORD PTR [rsp+128]
+
+	add	rsp, 232
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	ret
+; End-of-template label; no code follows it in this file.
+CryptonightR_soft_aes_template_end:
--- a/src/crypto/cn/asm/win64/CryptonightR_template.asm
+++ b/src/crypto/cn/asm/win64/CryptonightR_template.asm
--- a/src/crypto/cn/asm/win64/CryptonightR_template_win.inc
+++ b/src/crypto/cn/asm/win64/CryptonightR_template_win.inc
@@ -0,0 +1,536 @@
+; Labels that delimit the sections of the CryptonightR code templates
+; (single-hash and double-hash variants).
+PUBLIC CryptonightR_template_part1
+PUBLIC CryptonightR_template_mainloop
+PUBLIC CryptonightR_template_part2
+PUBLIC CryptonightR_template_part3
+PUBLIC CryptonightR_template_end
+PUBLIC CryptonightR_template_double_part1
+PUBLIC CryptonightR_template_double_mainloop
+PUBLIC CryptonightR_template_double_part2
+PUBLIC CryptonightR_template_double_part3
+PUBLIC CryptonightR_template_double_part4
+PUBLIC CryptonightR_template_double_end
+
+; part1: prologue of the single-hash template. Saves registers, derives the
+; initial loop values from the structure reached through rcx ("ctx" below) and
+; then repurposes rsp as a data register for the whole loop.
+ALIGN(64)
+CryptonightR_template_part1:
+	; rcx points at a pointer; replace it with the pointer itself (ctx).
+	mov	rcx, [rcx]
+
+	; rbx, rbp and rsi are saved in the three slots above the return address
+	; ([rsp+16..39] at entry); the remaining registers are pushed.
+	mov	QWORD PTR [rsp+16], rbx
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	r10
+	push	r11
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	push	rdi
+	; 64-byte frame: save area for xmm6-xmm9.
+	sub	rsp, 64
+	; Initial values (q[N] = qword at ctx+N):
+	;   r8   = q[32] ^ q[0]        r15 = q[40] ^ q[8]
+	;   xmm6 = { q[48] ^ q[16], q[56] ^ q[24] }
+	;   xmm7 = { q[80] ^ q[64], q[88] ^ q[72] }
+	;   r11  = q[224], used as the base address of every indexed access below
+	mov	r12, rcx
+	mov	r8, QWORD PTR [r12+32]
+	mov	rdx, r12
+	xor	r8, QWORD PTR [r12]
+	mov	r15, QWORD PTR [r12+40]
+	mov	r9, r8
+	xor	r15, QWORD PTR [r12+8]
+	mov	r11, QWORD PTR [r12+224]
+	mov	r12, QWORD PTR [r12+56]
+	xor	r12, QWORD PTR [rdx+24]
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movd	xmm0, r12
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	movaps	XMMWORD PTR [rsp], xmm9
+	mov	r12, QWORD PTR [rdx+88]
+	xor	r12, QWORD PTR [rdx+72]
+	movd	xmm6, rax
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm6, xmm0
+	; 2097136 == 0x1FFFF0: keep bits 4..20, i.e. a 16-byte-aligned offset below 2 MiB.
+	and	r9d, 2097136
+	movd	xmm0, r12
+	movd	xmm7, rax
+	punpcklqdq xmm7, xmm0
+	mov r10d, r9d
+	; Keep the real stack pointer in xmm9 and load rsp with the r8 value.
+	; From here until part3 rsp does not point at the stack, so nothing may
+	; push, pop or call.
+	movd	xmm9, rsp
+	mov rsp, r8
+	; Loop counter: 524288 (0x80000) iterations.
+	mov	r8d, 524288
+
+	; Four state dwords from ctx+96..ctx+111, kept in ebx, esi, edi, ebp.
+	mov	ebx, [rdx+96]
+	mov	esi, [rdx+100]
+	mov	edi, [rdx+104]
+	mov	ebp, [rdx+108]
+
+	; mainloop: first half of one iteration of the single-hash template.
+	; On entry: r9d = current offset, r11 = base address, rsp/r15 = running
+	; 64-bit values (rsp is used as a data register), xmm6/xmm7 = vectors
+	; carried between iterations, r8d = remaining iterations.
+	ALIGN(64)
+CryptonightR_template_mainloop:
+	movdqa	xmm5, XMMWORD PTR [r9+r11]
+	; xmm4 = { rsp, r15 }, used as the round key for aesenc.
+	movd	xmm0, r15
+	movd	xmm4, rsp
+	punpcklqdq xmm4, xmm0
+	lea	rdx, QWORD PTR [r9+r11]
+
+	aesenc	xmm5, xmm4
+
+	; r13d, eax, r9d = offset ^ 16, offset ^ 32, offset ^ 48: the three other
+	; 16-byte chunks of the same 64-byte group.
+	mov	r13d, r9d
+	mov	eax, r9d
+	xor	r9d, 48
+	xor	r13d, 16
+	xor	eax, 32
+	; XOR all three neighbouring chunks into xmm5 (xmm3 keeps an unmodified
+	; copy of the offset ^ 48 chunk for the write-back below).
+	movdqu	xmm0, XMMWORD PTR [r9+r11]
+	movaps xmm3, xmm0
+	movdqu	xmm2, XMMWORD PTR [r13+r11]
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	pxor xmm0, xmm2
+	pxor xmm5, xmm1
+	pxor xmm5, xmm0
+
+	; r12 = low qword of xmm5; r10d = next offset (mask 0x1FFFF0).
+	movd	r12, xmm5
+	movd	r10d, xmm5
+	and	r10d, 2097136
+
+	; Each chunk is written back to a different neighbour position after a
+	; 64-bit lane addition with xmm7, xmm6 or xmm4.
+	paddq	xmm3, xmm7
+	paddq	xmm2, xmm6
+	paddq	xmm1, xmm4
+	movdqu	XMMWORD PTR [r13+r11], xmm3
+	movdqu	XMMWORD PTR [rax+r11], xmm2
+	movdqu	XMMWORD PTR [r9+r11], xmm1
+
+	; Write xmm5 ^ xmm6 back to the chunk that was read at the top.
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [rdx], xmm0
+
+	; r13 = (ebx + esi) | ((edi + ebp) << 32), later XORed with the qword at
+	; the next offset.
+	lea	r13d, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	r13, rdx
+
+	; eax, edx, r9d are loaded with dwords from xmm6 and xmm7. part2
+	; overwrites all three before reading them, so they are only visible to
+	; code located between here and part2.
+	; NOTE(review): presumably generated code is spliced in at that point -- confirm.
+	movd eax, xmm6
+	movd edx, xmm7
+	pextrd r9d, xmm7, 2
+
+	xor	r13, QWORD PTR [r10+r11]
+	mov	r14, QWORD PTR [r10+r11+8]
+
+; part2: second half of one iteration of the single-hash template.
+; rsp still holds data here, not the stack pointer.
+CryptonightR_template_part2:
+	; rcx = address of the chunk at the next offset.
+	lea	rcx, [r10+r11]
+
+	; rsp ^= edi | (ebp << 32)
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor rsp, rax
+
+	; r15 ^= ebx | (esi << 32)
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r15, rax
+
+	; Unsigned 64x64 -> 128-bit product of r13 and r12 in rdx:rax;
+	; low half is added to r15, high half to rsp.
+	mov	rax, r13
+	mul	r12
+	add	r15, rax
+	add	rsp, rdx
+
+	; r9d, r12d, r10d = offset ^ 16, offset ^ 32, offset ^ 48.
+	mov	r9d, r10d
+	mov	r12d, r10d
+	xor	r9d, 16
+	xor	r12d, 32
+	xor	r10d, 48
+	; XOR the three neighbouring chunks into xmm5 and write each back to a
+	; different neighbour position after a paddq with xmm4, xmm6 or xmm7.
+	movdqa	xmm1, XMMWORD PTR [r12+r11]
+	movaps xmm3, xmm1
+	movdqa	xmm2, XMMWORD PTR [r9+r11]
+	movdqa	xmm0, XMMWORD PTR [r10+r11]
+	pxor xmm1, xmm2
+	pxor xmm5, xmm0
+	pxor xmm5, xmm1
+	paddq	xmm3, xmm4
+	paddq	xmm2, xmm6
+	paddq	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9+r11], xmm0
+	movdqu	XMMWORD PTR [r12+r11], xmm2
+	movdqu	XMMWORD PTR [r10+r11], xmm3
+
+	; Rotate the carried vectors (xmm7 <- xmm6 <- xmm5), store { rsp, r15 } at
+	; rcx, and fold r13/r14 in to form the next running values and offset.
+	movdqa	xmm7, xmm6
+	mov	QWORD PTR [rcx], rsp
+	xor	rsp, r13
+	mov	r9d, esp
+	mov	QWORD PTR [rcx+8], r15
+	; 2097136 == 0x1FFFF0
+	and	r9d, 2097136
+	xor	r15, r14
+	movdqa	xmm6, xmm5
+	dec	r8d
+	jnz	CryptonightR_template_mainloop
+
+; part3: epilogue of the single-hash template.
+CryptonightR_template_part3:
+	; Recover the real stack pointer that part1 parked in xmm9.
+	movd	rsp, xmm9
+
+	; rbx/rbp/rsi were stored at entry [rsp+16..39]; with 7 pushes (56 bytes)
+	; and the 64-byte frame those slots are now at [rsp+136..159].
+	mov	rbx, QWORD PTR [rsp+136]
+	mov	rbp, QWORD PTR [rsp+144]
+	mov	rsi, QWORD PTR [rsp+152]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+16]
+	movaps	xmm9, XMMWORD PTR [rsp]
+	add	rsp, 64
+	; Pop in reverse order of the pushes in part1.
+	pop	rdi
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	r11
+	pop	r10
+	ret	0
+; End-of-template label for the single-hash variant.
+CryptonightR_template_end:
+
+; double_part1: prologue of the double-hash template. rcx points at two
+; consecutive pointers; the first is called ctx0 and the second ctx1 below.
+ALIGN(64)
+CryptonightR_template_double_part1:
+	mov	rdx, [rcx+8]
+	mov	rcx, [rcx]
+
+	; rbx is saved above the return address; after 7 pushes and the 320-byte
+	; frame that slot is addressed as [rsp+400].
+	mov	QWORD PTR [rsp+24], rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	; Frame use visible in this template: [rsp+0..31] two sets of four state
+	; dwords, [rsp+104..119] spill slots, [rsp+128..143] the two base
+	; addresses, [rsp+160..319] xmm6-xmm15 save area.
+	sub	rsp, 320
+	; Initial values (q0[N] / q1[N] = qword at ctx0+N / ctx1+N):
+	;   r14 = q0[32] ^ q0[0]      r12 = q0[40] ^ q0[8]      rsi = q0[224]
+	;   r15 = q1[32] ^ q1[0]      r13 = q1[40] ^ q1[8]      rdi = q1[224]
+	;   xmm7  = { q0[48] ^ q0[16], q0[56] ^ q0[24] }
+	;   xmm9  = { q0[80] ^ q0[64], q0[88] ^ q0[72] }
+	;   xmm8  = { q1[48] ^ q1[16], q1[56] ^ q1[24] }
+	;   xmm10 = { q1[80] ^ q1[64], q1[88] ^ q1[72] }
+	mov	r14, QWORD PTR [rcx+32]
+	mov	r8, rcx
+	xor	r14, QWORD PTR [rcx]
+	mov	r12, QWORD PTR [rcx+40]
+	mov	ebx, r14d
+	mov	rsi, QWORD PTR [rcx+224]
+	; 2097136 == 0x1FFFF0: keep bits 4..20, i.e. a 16-byte-aligned offset below 2 MiB.
+	and	ebx, 2097136
+	xor	r12, QWORD PTR [rcx+8]
+	mov	rcx, QWORD PTR [rcx+56]
+	xor	rcx, QWORD PTR [r8+24]
+	mov	rax, QWORD PTR [r8+48]
+	xor	rax, QWORD PTR [r8+16]
+	mov	r15, QWORD PTR [rdx+32]
+	xor	r15, QWORD PTR [rdx]
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [r8+88]
+	xor	rcx, QWORD PTR [r8+72]
+	mov	r13, QWORD PTR [rdx+40]
+	mov	rdi, QWORD PTR [rdx+224]
+	xor	r13, QWORD PTR [rdx+8]
+	; Save xmm6-xmm15 (non-volatile in the Win64 calling convention).
+	movaps	XMMWORD PTR [rsp+160], xmm6
+	movaps	XMMWORD PTR [rsp+176], xmm7
+	movaps	XMMWORD PTR [rsp+192], xmm8
+	movaps	XMMWORD PTR [rsp+208], xmm9
+	movaps	XMMWORD PTR [rsp+224], xmm10
+	movaps	XMMWORD PTR [rsp+240], xmm11
+	movaps	XMMWORD PTR [rsp+256], xmm12
+	movaps	XMMWORD PTR [rsp+272], xmm13
+	movaps	XMMWORD PTR [rsp+288], xmm14
+	movaps	XMMWORD PTR [rsp+304], xmm15
+	movd	xmm7, rax
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+
+	; Copy the 16 bytes at ctx1+96 to [rsp] and the 16 bytes at ctx0+96 to
+	; [rsp+16]. movaps requires both source addresses to be 16-byte aligned.
+	movaps xmm1, XMMWORD PTR [rdx+96]
+	movaps xmm2, XMMWORD PTR [r8+96]
+	movaps XMMWORD PTR [rsp], xmm1
+	movaps XMMWORD PTR [rsp+16], xmm2
+
+	mov	r8d, r15d
+	punpcklqdq xmm7, xmm0
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+56]
+	xor	rcx, QWORD PTR [rdx+24]
+	movd	xmm9, rax
+	mov	QWORD PTR [rsp+128], rsi
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	punpcklqdq xmm9, xmm0
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+88]
+	xor	rcx, QWORD PTR [rdx+72]
+	movd	xmm8, rax
+	mov	QWORD PTR [rsp+136], rdi
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm8, xmm0
+	; r8d = initial offset for the second hash (same 0x1FFFF0 mask).
+	and	r8d, 2097136
+	movd	xmm0, rcx
+	; Loop counter: 524288 (0x80000) iterations.
+	mov	r11d, 524288
+	movd	xmm10, rax
+	punpcklqdq xmm10, xmm0
+	
+	; Keep copies of the two base addresses in xmm14 (first) and xmm15 (second).
+	movd xmm14, QWORD PTR [rsp+128]
+	movd xmm15, QWORD PTR [rsp+136]
+
+	; double_mainloop: AES step for both hashes, followed by the set-up for
+	; the first hash's second half.
+	; On entry: ebx / r8d = current offsets, rsi / rdi = base addresses,
+	; r14,r12 / r15,r13 = running 64-bit values, r11d = remaining iterations.
+	ALIGN(64)
+CryptonightR_template_double_mainloop:
+	; ---- first hash: xmm6 = aesenc(chunk at [rsi+ebx], { r14, r12 }) ----
+	movdqu	xmm6, XMMWORD PTR [rbx+rsi]
+	movd	xmm0, r12
+	mov	ecx, ebx
+	movd	xmm3, r14
+	punpcklqdq xmm3, xmm0
+	xor	ebx, 16
+	aesenc	xmm6, xmm3
+	movd	xmm4, r15
+	; The three neighbouring chunks (offset ^ 16, ^ 32, ^ 48) are XORed into
+	; xmm6 and written back to a different neighbour position after a paddq
+	; with xmm7, xmm3 or xmm9. ebx steps through the positions by XOR.
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm0
+	xor	ebx, 48
+	paddq	xmm0, xmm7
+	movdqu	xmm1, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm1
+	movdqu	XMMWORD PTR [rbx+rsi], xmm0
+	paddq	xmm1, xmm3
+	xor	ebx, 16
+	mov	eax, ebx
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm0
+	; rdx = low qword of xmm6 (kept for the multiply in double_part2).
+	movd	rdx, xmm6
+	movdqu	XMMWORD PTR [rbx+rsi], xmm1
+	paddq	xmm0, xmm9
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+	; Write xmm6 ^ xmm7 back to the chunk read at the top.
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [rcx+rsi], xmm0
+	; esi = next offset of the first hash (mask 0x1FFFF0); rsi no longer
+	; holds the base address after this point.
+	mov	esi, edx
+	; ---- second hash: xmm5 = aesenc(chunk at [rdi+r8], { r15, r13 }) ----
+	movdqu	xmm5, XMMWORD PTR [r8+rdi]
+	and	esi, 2097136
+	mov	ecx, r8d
+	movd	xmm0, r13
+	punpcklqdq xmm4, xmm0
+	xor	r8d, 16
+	aesenc	xmm5, xmm4
+	; Same neighbour handling as above, using xmm8, xmm4 and xmm10.
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm0
+	xor	r8d, 48
+	paddq	xmm0, xmm8
+	movdqu	xmm1, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm1
+	movdqu	XMMWORD PTR [r8+rdi], xmm0
+	paddq	xmm1, xmm4
+	xor	r8d, 16
+	mov	eax, r8d
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm0
+	movdqu	XMMWORD PTR [r8+rdi], xmm1
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rdi], xmm0
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm8
+	movdqu	XMMWORD PTR [rcx+rdi], xmm0
+	; rdi = low qword of xmm5; rcx = first base address (from xmm14).
+	movd	rdi, xmm5
+	movd	rcx, xmm14
+	mov	ebp, edi
+	; Read the 16 bytes at the first hash's next offset: r8 = low, r10 = high;
+	; r9 = their address.
+	mov	r8, QWORD PTR [rcx+rsi]
+	mov	r10, QWORD PTR [rcx+rsi+8]
+	lea	r9, QWORD PTR [rcx+rsi]
+	xor	esi, 16
+
+	; Park the live registers: rsp, rsi, rdi, rbp, r15, rdx go to xmm0, xmm1,
+	; xmm2, xmm11, xmm12, xmm13; rcx and r9 go to the frame.
+	movd xmm0, rsp
+	movd xmm1, rsi
+	movd xmm2, rdi
+	movd xmm11, rbp
+	movd xmm12, r15
+	movd xmm13, rdx
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	; Load the first hash's four state dwords from [rsp+16..31].
+	mov ebx, DWORD PTR [rsp+16]
+	mov esi, DWORD PTR [rsp+20]
+	mov edi, DWORD PTR [rsp+24]
+	mov ebp, DWORD PTR [rsp+28]
+
+	; r8 ^= (ebx + esi) | ((edi + ebp) << 32)
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+	xor r8, rax
+
+	; esp, r15d, eax, edx, r9d are loaded with dwords from xmm3, xmm7 and
+	; xmm9. double_part2 overwrites or restores all of them before reading,
+	; so they are only visible to code located between here and double_part2.
+	; NOTE(review): presumably generated code is spliced in at that point --
+	; confirm. Nothing may touch the stack while rsp holds data.
+	movd esp, xmm3
+	pextrd r15d, xmm3, 2
+	movd eax, xmm7
+	movd edx, xmm9
+	pextrd r9d, xmm9, 2
+
+; double_part2: finishes the first hash's iteration, then sets up the second
+; hash's second half. Expects ebx, esi, edi, ebp = the first hash's state
+; dwords and xmm0 = real stack pointer.
+CryptonightR_template_double_part2:
+
+	; r14 ^= edi | (ebp << 32)
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor r14, rax
+
+	; r12 ^= ebx | (esi << 32)
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r12, rax
+
+	; Restore the stack pointer and store the state dwords back to [rsp+16..31].
+	movd rsp, xmm0
+	mov DWORD PTR [rsp+16], ebx
+	mov DWORD PTR [rsp+20], esi
+	mov DWORD PTR [rsp+24], edi
+	mov DWORD PTR [rsp+28], ebp
+
+	; Restore the registers parked at the end of the mainloop section.
+	movd rsi, xmm1
+	movd rdi, xmm2
+	movd rbp, xmm11
+	movd r15, xmm12
+	movd rdx, xmm13
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	; Unsigned 64x64 -> 128-bit product of r8 and rdx in rdx:rax
+	; (rbx keeps the pre-multiply r8).
+	mov rbx, r8
+	mov	rax, r8
+	mul	rdx
+	; ebp = next offset of the second hash (mask 0x1FFFF0).
+	and	ebp, 2097136
+	mov	r8, rax
+	; XOR the three neighbouring chunks of the first hash into xmm6 and write
+	; each back to a different neighbour position after a paddq with xmm7,
+	; xmm3 or xmm9.
+	movdqu	xmm1, XMMWORD PTR [rcx+rsi]
+	pxor	xmm6, xmm1
+	xor	esi, 48
+	paddq	xmm1, xmm7
+	movdqu	xmm2, XMMWORD PTR [rsi+rcx]
+	pxor	xmm6, xmm2
+	paddq	xmm2, xmm3
+	movdqu	XMMWORD PTR [rsi+rcx], xmm1
+	xor	esi, 16
+	mov	eax, esi
+	; rsi holds the first base address again from here on.
+	mov	rsi, rcx
+	movdqu	xmm0, XMMWORD PTR [rax+rcx]
+	pxor	xmm6, xmm0
+	movdqu	XMMWORD PTR [rax+rcx], xmm2
+	paddq	xmm0, xmm9
+	; Low half of the product goes into r12, high half into r14.
+	add	r12, r8
+	xor	rax, 32
+	add	r14, rdx
+	; Rotate the first hash's carried vectors: xmm9 <- xmm7 <- xmm6.
+	movdqa	xmm9, xmm7
+	movdqa	xmm7, xmm6
+	movdqu	XMMWORD PTR [rax+rcx], xmm0
+	; Store { r14, r12 } at r9, then fold rbx/r10 in to form the next values.
+	mov	QWORD PTR [r9+8], r12
+	xor	r12, r10
+	mov	QWORD PTR [r9], r14
+	; rcx = second base address (from xmm15).
+	movd rcx, xmm15
+	xor	r14, rbx
+	mov	r10d, ebp
+	; ebx = first hash's offset for the next iteration.
+	mov	ebx, r14d
+	xor	ebp, 16
+	and	ebx, 2097136
+	; Read the 16 bytes at the second hash's next offset: r8 = low, r9 = high.
+	mov	r8, QWORD PTR [r10+rcx]
+	mov	r9, QWORD PTR [r10+rcx+8]
+
+	; Park the live registers: rsp, rbx, rsi, rdi, rbp, r15 go to xmm0, xmm1,
+	; xmm2, xmm11, xmm12, xmm13; rcx and r9 go to the frame.
+	movd xmm0, rsp
+	movd xmm1, rbx
+	movd xmm2, rsi
+	movd xmm11, rdi
+	movd xmm12, rbp
+	movd xmm13, r15
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	; Load the second hash's four state dwords from [rsp+0..15].
+	mov ebx, DWORD PTR [rsp]
+	mov esi, DWORD PTR [rsp+4]
+	mov edi, DWORD PTR [rsp+8]
+	mov ebp, DWORD PTR [rsp+12]
+
+	; r8 ^= (ebx + esi) | ((edi + ebp) << 32); xmm3 keeps a copy of the result.
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+
+	xor r8, rax
+	movd xmm3, r8
+
+	; esp, r15d, eax, edx, r9d are loaded with dwords from xmm4, xmm8 and
+	; xmm10; double_part3 overwrites or restores all of them before reading.
+	; NOTE(review): presumably inputs for generated code spliced in before
+	; double_part3 -- confirm. Nothing may touch the stack while rsp holds data.
+	movd esp, xmm4
+	pextrd r15d, xmm4, 2
+	movd eax, xmm8
+	movd edx, xmm10
+	pextrd r9d, xmm10, 2
+
+; double_part3: finishes the second hash's iteration and closes the loop.
+; Expects ebx, esi, edi, ebp = the second hash's state dwords, xmm0 = real
+; stack pointer, xmm13 = saved r15.
+CryptonightR_template_double_part3:
+
+	; Restore r15 first; it is updated below.
+	movd r15, xmm13
+
+	; r15 ^= edi | (ebp << 32)
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor r15, rax
+
+	; r13 ^= ebx | (esi << 32)
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r13, rax
+
+	; Restore the stack pointer and store the state dwords back to [rsp+0..15].
+	movd rsp, xmm0
+	mov DWORD PTR [rsp], ebx
+	mov DWORD PTR [rsp+4], esi
+	mov DWORD PTR [rsp+8], edi
+	mov DWORD PTR [rsp+12], ebp
+
+	; Restore the registers parked at the end of double_part2.
+	movd rbx, xmm1
+	movd rsi, xmm2
+	movd rdi, xmm11
+	movd rbp, xmm12
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	; Unsigned 64x64 -> 128-bit product of r8 and rdi in rdx:rax.
+	mov rax, r8
+	mul	rdi
+	; rdi holds the second base address again from here on.
+	mov	rdi, rcx
+	mov	r8, rax
+	; XOR the three neighbouring chunks of the second hash into xmm5 and
+	; write each back to a different neighbour position after a paddq with
+	; xmm8, xmm4 or xmm10.
+	movdqu	xmm1, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm1
+	xor	ebp, 48
+	paddq	xmm1, xmm8
+	; Low half of the product goes into r13, high half into r15.
+	add	r13, r8
+	movdqu	xmm2, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm2
+	add	r15, rdx
+	movdqu	XMMWORD PTR [rbp+rcx], xmm1
+	paddq	xmm2, xmm4
+	xor	ebp, 16
+	mov	eax, ebp
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm0
+	movdqu	XMMWORD PTR [rbp+rcx], xmm2
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rcx], xmm0
+	; rax = the value double_part2 saved in xmm3.
+	movd rax, xmm3
+	; Rotate the second hash's carried vectors: xmm10 <- xmm8 <- xmm5.
+	movdqa	xmm10, xmm8
+	; Store { r15, r13 } at [rcx+r10], then fold rax/r9 in to form the next
+	; running values and the next offset (r8d, mask 0x1FFFF0).
+	mov	QWORD PTR [r10+rcx], r15
+	movdqa	xmm8, xmm5
+	xor	r15, rax
+	mov	QWORD PTR [r10+rcx+8], r13
+	mov	r8d, r15d
+	xor	r13, r9
+	and	r8d, 2097136
+	dec r11d
+	jnz	CryptonightR_template_double_mainloop
+
+; double_part4: epilogue of the double-hash template.
+CryptonightR_template_double_part4:
+
+	; rbx was stored at entry [rsp+24]; with 7 pushes (56 bytes) and the
+	; 320-byte frame that slot is now at [rsp+400].
+	mov	rbx, QWORD PTR [rsp+400]
+	movaps	xmm6, XMMWORD PTR [rsp+160]
+	movaps	xmm7, XMMWORD PTR [rsp+176]
+	movaps	xmm8, XMMWORD PTR [rsp+192]
+	movaps	xmm9, XMMWORD PTR [rsp+208]
+	movaps	xmm10, XMMWORD PTR [rsp+224]
+	movaps	xmm11, XMMWORD PTR [rsp+240]
+	movaps	xmm12, XMMWORD PTR [rsp+256]
+	movaps	xmm13, XMMWORD PTR [rsp+272]
+	movaps	xmm14, XMMWORD PTR [rsp+288]
+	movaps	xmm15, XMMWORD PTR [rsp+304]
+	add	rsp, 320
+	; Pop in reverse order of the pushes in double_part1.
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	ret	0
+; End-of-template label for the double-hash variant.
+CryptonightR_template_double_end:
--- a/src/crypto/cn/asm/win64/CryptonightWOW_soft_aes_template_win.inc
+++ b/src/crypto/cn/asm/win64/CryptonightWOW_soft_aes_template_win.inc
@@ -0,0 +1,268 @@
+; Labels that delimit the sections of the CryptonightWOW soft-AES code template.
+PUBLIC CryptonightWOW_soft_aes_template_part1
+PUBLIC CryptonightWOW_soft_aes_template_mainloop
+PUBLIC CryptonightWOW_soft_aes_template_part2
+PUBLIC CryptonightWOW_soft_aes_template_part3
+PUBLIC CryptonightWOW_soft_aes_template_end
+
+; part1: prologue. Saves the registers the routine clobbers, copies four
+; dwords of state onto the stack and derives the initial loop values from the
+; structure reached through rcx (called "ctx" in the comments below).
+ALIGN(64)
+CryptonightWOW_soft_aes_template_part1:
+	; rcx points at a pointer; replace it with the pointer itself (ctx).
+	mov	rcx, [rcx]
+
+	; Keep ctx in the slot just above the return address. After the 8 pushes
+	; and the 232-byte frame below, that slot is addressed as [rsp+304].
+	mov	QWORD PTR [rsp+8], rcx
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	; Frame use visible in this template: [rsp+16..143] xmm6-xmm13 save area,
+	; [rsp+144..159] four state dwords, [rsp+160..191] spill slots.
+	sub	rsp, 232
+
+	; Copy the four dwords at ctx+96..ctx+111 to [rsp+144..159].
+	mov	eax, [rcx+96]
+	mov	ebx, [rcx+100]
+	mov	esi, [rcx+104]
+	mov	edx, [rcx+108]
+	mov [rsp+144], eax
+	mov [rsp+148], ebx
+	mov [rsp+152], esi
+	mov [rsp+156], edx
+
+	; Initial values (q[N] = qword at ctx+N):
+	;   r8   = q[32] ^ q[0]        r9 = q[40] ^ q[8]
+	;   xmm4 = { q[48] ^ q[16], q[56] ^ q[24] }
+	;   xmm5 = { q[80] ^ q[64], q[88] ^ q[72] }
+	;   r11  = q[224], used as the base address of every indexed access below
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r10, rcx
+	xor	rax, QWORD PTR [rcx+16]
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r9, QWORD PTR [rcx+40]
+	xor	r9, QWORD PTR [rcx+8]
+	movd	xmm4, rax
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	r11, QWORD PTR [rcx+224]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r10+72]
+	mov	rax, QWORD PTR [r10+80]
+	movd	xmm0, rdx
+	xor	rax, QWORD PTR [r10+64]
+
+	; Save xmm6-xmm13 (non-volatile in the Win64 calling convention).
+	movaps	XMMWORD PTR [rsp+16], xmm6
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+48], xmm8
+	movaps	XMMWORD PTR [rsp+64], xmm9
+	movaps	XMMWORD PTR [rsp+80], xmm10
+	movaps	XMMWORD PTR [rsp+96], xmm11
+	movaps	XMMWORD PTR [rsp+112], xmm12
+	movaps	XMMWORD PTR [rsp+128], xmm13
+
+	movd	xmm5, rax
+
+	mov	rax, r8
+	punpcklqdq xmm4, xmm0
+	; 2097136 == 0x1FFFF0: keep bits 4..20, i.e. a 16-byte-aligned offset below 2 MiB.
+	and	eax, 2097136
+	movd	xmm10, QWORD PTR [r10+96]
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [r10+104]
+	xorps	xmm9, xmm9
+	; [rsp+328] and [rsp+320] lie above the return address, i.e. in stack space
+	; owned by the caller. NOTE(review): presumably the Win64 home area -- confirm.
+	; They hold the current offset and the running r9 value between iterations.
+	mov	QWORD PTR [rsp+328], rax
+	; xmm12 keeps a copy of the base address so r11 can be used as scratch.
+	movd	xmm12, r11
+	mov	QWORD PTR [rsp+320], r9
+	punpcklqdq xmm5, xmm0
+	movd xmm13, rcx
+	; Loop counter: 524288 (0x80000) iterations.
+	mov r12d, 524288
+
+	; mainloop: first half of one iteration.
+	; On entry: rax = current offset, r11 = base address, r8/r9 = running
+	; 64-bit values, r10 = ctx, r12d = remaining iterations.
+	ALIGN(64)
+CryptonightWOW_soft_aes_template_mainloop:
+	; Park the loop counter in xmm11; r12 is reused as the lookup-table pointer.
+	movd xmm11, r12d
+	; r12 = pointer stored at ctx+272. Four consecutive 256-entry dword tables
+	; are read through it (offsets 0, 1024, 2048 and 3072).
+	; NOTE(review): presumably the software AES tables, going by the file name.
+	mov	r12, QWORD PTR [r10+272]
+	; r13 = address of the 16-byte chunk processed in this iteration.
+	lea	r13, QWORD PTR [rax+r11]
+	mov	esi, DWORD PTR [r13]
+	movd	xmm0, r9
+	mov	r10d, DWORD PTR [r13+4]
+	movd	xmm7, r8
+	mov	ebp, DWORD PTR [r13+12]
+	mov	r14d, DWORD PTR [r13+8]
+	mov	rdx, QWORD PTR [rsp+328]
+	; Table lookups: each byte of the four dwords just loaded selects one
+	; table entry; the entries are XOR-combined into r15d, edi, ebx, r9d and
+	; then into eax, r9d, r10d, r11d.
+	movzx	ecx, sil
+	shr	esi, 8
+	; xmm7 = { r8, r9 }
+	punpcklqdq xmm7, xmm0
+	mov	r15d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	mov	edi, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	ebx, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	shr	ebp, 8
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	xor	r15d, DWORD PTR [r12+rcx*4+1024]
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	eax, r14d
+	shr	eax, 8
+	xor	edi, DWORD PTR [r12+rcx*4+1024]
+	; Adding 256 to an index selects the table 1024 bytes further on.
+	add	eax, 256
+	movzx	ecx, bpl
+	shr	ebp, 8
+	xor	ebx, DWORD PTR [r12+rcx*4+1024]
+	movzx	ecx, sil
+	shr	esi, 8
+	xor	r9d, DWORD PTR [r12+rcx*4+1024]
+	; Advance to the third table.
+	add	r12, 2048
+	movzx	ecx, r10b
+	shr	r10d, 8
+	add	r10d, 256
+	mov	r11d, DWORD PTR [r12+rax*4]
+	xor	r11d, DWORD PTR [r12+rcx*4]
+	xor	r11d, r9d
+	movzx	ecx, sil
+	mov	r10d, DWORD PTR [r12+r10*4]
+	shr	esi, 8
+	add	esi, 256
+	xor	r10d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	xor	r10d, ebx
+	shr	ebp, 8
+	movd	xmm1, r11d
+	add	ebp, 256
+	; r11 was used as scratch above; restore the base address from xmm12.
+	movd	r11, xmm12
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	xor	r9d, DWORD PTR [r12+rsi*4]
+	mov	eax, DWORD PTR [r12+rbp*4]
+	xor	r9d, edi
+	movzx	ecx, r14b
+	movd	xmm0, r10d
+	movd	xmm2, r9d
+	xor	eax, DWORD PTR [r12+rcx*4]
+	; rcx, rax, rdx = offset ^ 16, offset ^ 32, offset ^ 48: the three other
+	; 16-byte chunks of the same 64-byte group.
+	mov	rcx, rdx
+	xor	eax, r15d
+	punpckldq xmm2, xmm1
+	xor	rcx, 16
+	movd	xmm6, eax
+	mov	rax, rdx
+	punpckldq xmm6, xmm0
+	xor	rax, 32
+	; xmm6 = the four combined dwords { eax, r9d, r10d, r11d }.
+	punpckldq xmm6, xmm2
+	xor	rdx, 48
+	; xmm6 ^= xmm7. The three neighbouring chunks are written back to a
+	; different neighbour position after a 64-bit lane addition (paddq) with
+	; xmm4, xmm5 or xmm7; here they are not XORed into xmm6.
+	movdqu	xmm2, XMMWORD PTR [rcx+r11]
+	pxor	xmm6, xmm7
+	paddq	xmm2, xmm4
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	movdqu	xmm0, XMMWORD PTR [rdx+r11]
+	paddq	xmm0, xmm5
+	movdqu	XMMWORD PTR [rcx+r11], xmm0
+	movdqu	XMMWORD PTR [rax+r11], xmm2
+	movd rcx, xmm13
+	paddq	xmm1, xmm7
+	movdqu	XMMWORD PTR [rdx+r11], xmm1
+	; rdi = low qword of xmm6; r10 = next offset (same 0x1FFFF0 mask).
+	movd	rdi, xmm6
+	mov	r10, rdi
+	and	r10d, 2097136
+	; Write xmm6 ^ xmm4 back to the chunk that was read at the top.
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm4
+	movdqu	XMMWORD PTR [r13], xmm0
+
+	; rbx = (d0 + d1) | ((d2 + d3) << 32) for the four state dwords at
+	; [rsp+144..159], XORed with the qword at the next offset.
+	mov ebx, [rsp+144]
+	mov ebp, [rsp+152]
+	add ebx, [rsp+148]
+	add ebp, [rsp+156]
+	shl rbp, 32
+	or rbx, rbp
+
+	xor rbx, QWORD PTR [r10+r11]
+	lea	r14, QWORD PTR [r10+r11]
+	mov	rbp, QWORD PTR [r14+8]
+
+	; Spill the values part2 needs, then keep the real stack pointer in r10:
+	; rsp is about to be overwritten with data.
+	mov [rsp+160], rbx
+	mov [rsp+168], rdi
+	mov [rsp+176], rbp
+	mov [rsp+184], r10
+	mov r10, rsp
+
+	; Load the four state dwords into ebx, esi, edi, ebp.
+	mov ebx, [rsp+144]
+	mov esi, [rsp+148]
+	mov edi, [rsp+152]
+	mov ebp, [rsp+156]
+
+	; Load esp, r15d, eax, edx with dwords taken from xmm7, xmm4 and xmm5.
+	; part2 overwrites rsp, rax and rdx before reading them, so these values
+	; are only visible to code located between here and part2.
+	; NOTE(review): presumably generated code is spliced in at that point --
+	; confirm. Nothing may touch the stack while rsp holds data.
+	movd esp, xmm7
+	movaps xmm0, xmm7
+	psrldq xmm0, 8
+	movd r15d, xmm0
+	movd eax, xmm4
+	movd edx, xmm5
+
+; part2: second half of one iteration. Expects r10 = real stack pointer and
+; ebx, esi, edi, ebp = the four state dwords.
+CryptonightWOW_soft_aes_template_part2:
+	; Restore the stack pointer, then store the state dwords back.
+	mov rsp, r10
+	mov [rsp+144], ebx
+	mov [rsp+148], esi
+	mov [rsp+152], edi
+	mov [rsp+156], ebp
+
+	; Reload the values spilled at the end of the mainloop section.
+	mov rbx, [rsp+160]
+	mov rdi, [rsp+168]
+	mov rbp, [rsp+176]
+	mov r10, [rsp+184]
+
+	; r9, rcx, r10 = offset ^ 16, offset ^ 32, offset ^ 48.
+	mov	r9, r10
+	xor	r9, 16
+	mov	rcx, r10
+	xor	rcx, 32
+	xor	r10, 48
+	; Unsigned 64x64 -> 128-bit product of rbx and rdi in rdx:rax.
+	mov	rax, rbx
+	mul	rdi
+	movdqu	xmm2, XMMWORD PTR [r9+r11]
+	movdqu	xmm1, XMMWORD PTR [rcx+r11]
+	paddq	xmm1, xmm7
+	; xmm3 = { rdx, rax } (high half in the low lane); it is XORed into the
+	; offset ^ 16 chunk, while rax/rdx are XORed with the offset ^ 32 chunk.
+	movd	xmm0, rax
+	movd	xmm3, rdx
+	xor	rax, QWORD PTR [r11+rcx+8]
+	xor	rdx, QWORD PTR [rcx+r11]
+	punpcklqdq xmm3, xmm0
+	add	r8, rdx
+	movdqu	xmm0, XMMWORD PTR [r10+r11]
+	pxor	xmm2, xmm3
+	paddq	xmm0, xmm5
+	paddq	xmm2, xmm4
+	; Each chunk is written back to a different neighbour position.
+	movdqu	XMMWORD PTR [r9+r11], xmm0
+	; Rotate the carried vectors for the next iteration: xmm5 <- xmm4 <- xmm6.
+	movdqa	xmm5, xmm4
+	mov	r9, QWORD PTR [rsp+320]
+	movdqa	xmm4, xmm6
+	add	r9, rax
+	movdqu	XMMWORD PTR [rcx+r11], xmm2
+	movdqu	XMMWORD PTR [r10+r11], xmm1
+	; [rsp+304] is the slot part1 wrote as [rsp+8] on entry
+	; (8 pushes + 232-byte frame + return address = 304): r10 = ctx again.
+	mov	r10, QWORD PTR [rsp+304]
+	; Bring the loop counter back from xmm11.
+	movd r12d, xmm11
+	; Store { r8, r9 } at r14 (the address computed in the mainloop section),
+	; then fold the previous values in to form the next r8/r9 and next offset.
+	mov	QWORD PTR [r14], r8
+	xor	r8, rbx
+	mov	rax, r8
+	mov	QWORD PTR [r14+8], r9
+	; 2097136 == 0x1FFFF0
+	and	eax, 2097136
+	xor	r9, rbp
+	mov	QWORD PTR [rsp+320], r9
+	mov	QWORD PTR [rsp+328], rax
+	sub	r12d, 1
+	jne	CryptonightWOW_soft_aes_template_mainloop
+
+; part3: epilogue. Restores xmm6-xmm13 from [rsp+16..143], releases the
+; 232-byte frame, pops the general-purpose registers in reverse push order
+; and returns.
+CryptonightWOW_soft_aes_template_part3:
+	movaps	xmm6, XMMWORD PTR [rsp+16]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+48]
+	movaps	xmm9, XMMWORD PTR [rsp+64]
+	movaps	xmm10, XMMWORD PTR [rsp+80]
+	movaps	xmm11, XMMWORD PTR [rsp+96]
+	movaps	xmm12, XMMWORD PTR [rsp+112]
+	movaps	xmm13, XMMWORD PTR [rsp+128]
+
+	add	rsp, 232
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	ret
+; End-of-template label; no code follows it in this file.
+CryptonightWOW_soft_aes_template_end:
--- a/src/crypto/cn/asm/win64/CryptonightWOW_template_win.inc
+++ b/src/crypto/cn/asm/win64/CryptonightWOW_template_win.inc
@@ -0,0 +1,491 @@
+; Labels that delimit the sections of the CryptonightWOW code templates
+; (single-hash and double-hash variants).
+PUBLIC CryptonightWOW_template_part1
+PUBLIC CryptonightWOW_template_mainloop
+PUBLIC CryptonightWOW_template_part2
+PUBLIC CryptonightWOW_template_part3
+PUBLIC CryptonightWOW_template_end
+PUBLIC CryptonightWOW_template_double_part1
+PUBLIC CryptonightWOW_template_double_mainloop
+PUBLIC CryptonightWOW_template_double_part2
+PUBLIC CryptonightWOW_template_double_part3
+PUBLIC CryptonightWOW_template_double_part4
+PUBLIC CryptonightWOW_template_double_end
+
+; part1: prologue of the single-hash template. Saves registers, derives the
+; initial loop values from the structure reached through rcx ("ctx" below) and
+; then repurposes rsp as a data register for the whole loop.
+ALIGN(64)
+CryptonightWOW_template_part1:
+	; rcx points at a pointer; replace it with the pointer itself (ctx).
+	mov	rcx, [rcx]
+
+	; rbx, rbp and rsi are saved in the three slots above the return address
+	; ([rsp+16..39] at entry); the remaining registers are pushed.
+	mov	QWORD PTR [rsp+16], rbx
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	r10
+	push	r11
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	push	rdi
+	; 64-byte frame: save area for xmm6-xmm9.
+	sub	rsp, 64
+	; Initial values (q[N] = qword at ctx+N):
+	;   r8   = q[32] ^ q[0]        r15 = q[40] ^ q[8]
+	;   xmm6 = { q[48] ^ q[16], q[56] ^ q[24] }
+	;   xmm7 = { q[80] ^ q[64], q[88] ^ q[72] }
+	;   r11  = q[224], used as the base address of every indexed access below
+	mov	r12, rcx
+	mov	r8, QWORD PTR [r12+32]
+	mov	rdx, r12
+	xor	r8, QWORD PTR [r12]
+	mov	r15, QWORD PTR [r12+40]
+	mov	r9, r8
+	xor	r15, QWORD PTR [r12+8]
+	mov	r11, QWORD PTR [r12+224]
+	mov	r12, QWORD PTR [r12+56]
+	xor	r12, QWORD PTR [rdx+24]
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movd	xmm0, r12
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	movaps	XMMWORD PTR [rsp], xmm9
+	mov	r12, QWORD PTR [rdx+88]
+	xor	r12, QWORD PTR [rdx+72]
+	movd	xmm6, rax
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm6, xmm0
+	; 2097136 == 0x1FFFF0: keep bits 4..20, i.e. a 16-byte-aligned offset below 2 MiB.
+	and	r9d, 2097136
+	movd	xmm0, r12
+	movd	xmm7, rax
+	punpcklqdq xmm7, xmm0
+	mov r10d, r9d
+	; Keep the real stack pointer in xmm9 and load rsp with the r8 value.
+	; From here until part3 rsp does not point at the stack, so nothing may
+	; push, pop or call.
+	movd	xmm9, rsp
+	mov rsp, r8
+	; Loop counter: 524288 (0x80000) iterations.
+	mov	r8d, 524288
+
+	; Four state dwords from ctx+96..ctx+111, kept in ebx, esi, edi, ebp.
+	mov	ebx, [rdx+96]
+	mov	esi, [rdx+100]
+	mov	edi, [rdx+104]
+	mov	ebp, [rdx+108]
+
+	; mainloop: first half of one iteration of the single-hash template.
+	; On entry: r9d = current offset, r11 = base address, rsp/r15 = running
+	; 64-bit values (rsp is used as a data register), xmm6/xmm7 = vectors
+	; carried between iterations, r8d = remaining iterations.
+	ALIGN(64)
+CryptonightWOW_template_mainloop:
+	movdqa	xmm5, XMMWORD PTR [r9+r11]
+	; xmm4 = { rsp, r15 }, used as the round key for aesenc.
+	movd	xmm0, r15
+	movd	xmm4, rsp
+	punpcklqdq xmm4, xmm0
+	lea	rdx, QWORD PTR [r9+r11]
+
+	aesenc	xmm5, xmm4
+	; r10d = next offset, taken from the low dword of the AES result
+	; (2097136 == 0x1FFFF0).
+	movd	r10d, xmm5
+	and	r10d, 2097136
+
+	; r12d, eax, r9d = offset ^ 16, offset ^ 32, offset ^ 48: the three other
+	; 16-byte chunks of the same 64-byte group.
+	mov	r12d, r9d
+	mov	eax, r9d
+	xor	r9d, 48
+	xor	r12d, 16
+	xor	eax, 32
+	; Each neighbouring chunk is written back to a different neighbour
+	; position after a 64-bit lane addition with xmm7, xmm6 or xmm4.
+	movdqu	xmm0, XMMWORD PTR [r9+r11]
+	movdqu	xmm2, XMMWORD PTR [r12+r11]
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	paddq	xmm0, xmm7
+	paddq	xmm2, xmm6
+	paddq	xmm1, xmm4
+	movdqu	XMMWORD PTR [r12+r11], xmm0
+	; r12 = low qword of xmm5 (multiplier used in part2).
+	movd	r12, xmm5
+	movdqu	XMMWORD PTR [rax+r11], xmm2
+	movdqu	XMMWORD PTR [r9+r11], xmm1
+
+	; Write xmm5 ^ xmm6 back to the chunk that was read at the top.
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [rdx], xmm0
+
+	; r13 = ((ebx + esi) | ((edi + ebp) << 32)) ^ qword at the next offset.
+	lea	r13d, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	r13, rdx
+
+	xor	r13, QWORD PTR [r10+r11]
+	mov	r14, QWORD PTR [r10+r11+8]
+
+	; eax, edx, r9d are loaded with dwords from xmm6 and xmm7. part2
+	; overwrites all three before reading them, so they are only visible to
+	; code located between here and part2.
+	; NOTE(review): presumably generated code is spliced in at that point -- confirm.
+	movd eax, xmm6
+	movd edx, xmm7
+	pextrd r9d, xmm7, 2
+
+; part2: second half of one iteration of the single-hash template.
+; rsp still holds data here, not the stack pointer.
+CryptonightWOW_template_part2:
+	; Unsigned 64x64 -> 128-bit product of r13 and r12 in rdx:rax;
+	; xmm3 = { rdx, rax } (high half in the low lane).
+	mov	rax, r13
+	mul	r12
+	movd	xmm0, rax
+	movd	xmm3, rdx
+	punpcklqdq xmm3, xmm0
+
+	; r9d, r12d, r10d = offset ^ 16, offset ^ 32, offset ^ 48.
+	mov	r9d, r10d
+	mov	r12d, r10d
+	xor	r9d, 16
+	xor	r12d, 32
+	xor	r10d, 48
+	; rdx/rax are XORed with the offset ^ 32 chunk; xmm3 is XORed with the
+	; offset ^ 16 chunk.
+	movdqa	xmm1, XMMWORD PTR [r12+r11]
+	xor	rdx, QWORD PTR [r12+r11]
+	xor	rax, QWORD PTR [r11+r12+8]
+	movdqa	xmm2, XMMWORD PTR [r9+r11]
+	pxor	xmm3, xmm2
+	; Each chunk is written back to a different neighbour position after a
+	; paddq with xmm7, xmm4 or xmm6.
+	paddq	xmm7, XMMWORD PTR [r10+r11]
+	paddq	xmm1, xmm4
+	paddq	xmm3, xmm6
+	movdqu	XMMWORD PTR [r9+r11], xmm7
+	movdqu	XMMWORD PTR [r12+r11], xmm3
+	movdqu	XMMWORD PTR [r10+r11], xmm1
+
+	; Rotate the carried vectors (xmm7 <- xmm6 <- xmm5).
+	movdqa	xmm7, xmm6
+	add	r15, rax
+	add	rsp, rdx
+	; Undo the earlier ^ 48 so r10 is the plain offset again, store
+	; { rsp, r15 } there, then fold r13/r14 in to form the next running
+	; values and the next offset (r9d, mask 0x1FFFF0).
+	xor	r10, 48
+	mov	QWORD PTR [r10+r11], rsp
+	xor	rsp, r13
+	mov	r9d, esp
+	mov	QWORD PTR [r10+r11+8], r15
+	and	r9d, 2097136
+	xor	r15, r14
+	movdqa	xmm6, xmm5
+	dec	r8d
+	jnz	CryptonightWOW_template_mainloop
+
+; part3: epilogue of the single-hash template.
+CryptonightWOW_template_part3:
+	; Recover the real stack pointer that part1 parked in xmm9.
+	movd	rsp, xmm9
+
+	; rbx/rbp/rsi were stored at entry [rsp+16..39]; with 7 pushes (56 bytes)
+	; and the 64-byte frame those slots are now at [rsp+136..159].
+	mov	rbx, QWORD PTR [rsp+136]
+	mov	rbp, QWORD PTR [rsp+144]
+	mov	rsi, QWORD PTR [rsp+152]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+16]
+	movaps	xmm9, XMMWORD PTR [rsp]
+	add	rsp, 64
+	; Pop in reverse order of the pushes in part1.
+	pop	rdi
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	r11
+	pop	r10
+	ret	0
+; End-of-template label for the single-hash variant.
+CryptonightWOW_template_end:
+
+ALIGN(64)
+CryptonightWOW_template_double_part1:	; /* two-lane template, part 1: prologue and per-lane register setup. On entry rcx points to two pointers. Comments use a form accepted by MASM and by cpp-preprocessed GAS */
+	mov	rdx, [rcx+8]	; /* rdx = second pointer (lane 1 state) */
+	mov	rcx, [rcx]	; /* rcx = first pointer (lane 0 state) */
+
+	mov	QWORD PTR [rsp+24], rbx	; /* rbx is parked above the return address and reloaded from [rsp+400] in part4 (24 + 7 pushes + 320) */
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 320	; /* frame: scratch at 0..136, xmm6-xmm15 saved at 160..304 */
+	mov	r14, QWORD PTR [rcx+32]
+	mov	r8, rcx	; /* r8 keeps the lane 0 state pointer while rcx is reused as a temporary */
+	xor	r14, QWORD PTR [rcx]	; /* r14 = qword at +32 xor qword at +0 (lane 0) */
+	mov	r12, QWORD PTR [rcx+40]
+	mov	ebx, r14d
+	mov	rsi, QWORD PTR [rcx+224]	; /* rsi = lane 0 base pointer for the masked offsets, presumably the scratchpad - confirm against the ctx layout */
+	and	ebx, 2097136	; /* 0x1FFFF0: 16-byte aligned offset below 2 MiB */
+	xor	r12, QWORD PTR [rcx+8]	; /* r12 = qword at +40 xor qword at +8 (lane 0) */
+	mov	rcx, QWORD PTR [rcx+56]
+	xor	rcx, QWORD PTR [r8+24]
+	mov	rax, QWORD PTR [r8+48]
+	xor	rax, QWORD PTR [r8+16]
+	mov	r15, QWORD PTR [rdx+32]
+	xor	r15, QWORD PTR [rdx]	; /* r15 / r13 are the lane 1 counterparts of r14 / r12 */
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [r8+88]
+	xor	rcx, QWORD PTR [r8+72]
+	mov	r13, QWORD PTR [rdx+40]
+	mov	rdi, QWORD PTR [rdx+224]	; /* rdi = lane 1 base pointer */
+	xor	r13, QWORD PTR [rdx+8]
+	movaps	XMMWORD PTR [rsp+160], xmm6	; /* xmm6-xmm15 are overwritten by the loop and restored in part4 */
+	movaps	XMMWORD PTR [rsp+176], xmm7
+	movaps	XMMWORD PTR [rsp+192], xmm8
+	movaps	XMMWORD PTR [rsp+208], xmm9
+	movaps	XMMWORD PTR [rsp+224], xmm10
+	movaps	XMMWORD PTR [rsp+240], xmm11
+	movaps	XMMWORD PTR [rsp+256], xmm12
+	movaps	XMMWORD PTR [rsp+272], xmm13
+	movaps	XMMWORD PTR [rsp+288], xmm14
+	movaps	XMMWORD PTR [rsp+304], xmm15
+	movd	xmm7, rax
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+
+	movaps xmm1, XMMWORD PTR [rdx+96]	; /* 16 bytes at +96 of each state are copied to the frame: lane 1 to [rsp], lane 0 to [rsp+16] */
+	movaps xmm2, XMMWORD PTR [r8+96]
+	movaps XMMWORD PTR [rsp], xmm1
+	movaps XMMWORD PTR [rsp+16], xmm2	; /* the loop reads and writes these copies as four dwords per lane */
+
+	mov	r8d, r15d
+	punpcklqdq xmm7, xmm0	; /* xmm7 = (q48 xor q16) low, (q56 xor q24) high, lane 0 */
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+56]
+	xor	rcx, QWORD PTR [rdx+24]
+	movd	xmm9, rax
+	mov	QWORD PTR [rsp+128], rsi
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	punpcklqdq xmm9, xmm0	; /* xmm9 = (q80 xor q64) low, (q88 xor q72) high, lane 0 */
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+88]
+	xor	rcx, QWORD PTR [rdx+72]
+	movd	xmm8, rax
+	mov	QWORD PTR [rsp+136], rdi
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm8, xmm0	; /* xmm8 / xmm10 are the lane 1 counterparts of xmm7 / xmm9 */
+	and	r8d, 2097136	; /* lane 1 starting offset, same 0x1FFFF0 mask */
+	movd	xmm0, rcx
+	mov	r11d, 524288	; /* loop counter 0x80000, decremented at the end of part3 */
+	movd	xmm10, rax
+	punpcklqdq xmm10, xmm0
+	
+	movd xmm14, QWORD PTR [rsp+128]	; /* xmm14 = lane 0 base pointer (copy of rsi) */
+	movd xmm15, QWORD PTR [rsp+136]	; /* xmm15 = lane 1 base pointer (copy of rdi) */
+
+	ALIGN(64)
+CryptonightWOW_template_double_mainloop:	; /* loop head: one AES round plus 64-byte line shuffle for lane 0, then lane 1, then GPRs are parked so rsp can carry data */
+	movdqu	xmm6, XMMWORD PTR [rbx+rsi]	; /* lane 0: 16 bytes at the current offset o (ebx) */
+	movd	xmm0, r12
+	mov	ecx, ebx	; /* ecx remembers o for the final store */
+	movd	xmm3, r14
+	punpcklqdq xmm3, xmm0	; /* xmm3 = r14 (low) : r12 (high), used as the round key */
+	xor	ebx, 16
+	aesenc	xmm6, xmm3	; /* one AES round */
+	movd	rdx, xmm6	; /* rdx = low qword of the AES result */
+	movd	xmm4, r15
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	xor	ebx, 48
+	paddq	xmm0, xmm7
+	movdqu	xmm1, XMMWORD PTR [rbx+rsi]
+	movdqu	XMMWORD PTR [rbx+rsi], xmm0	; /* [o^32] = old [o^16] + xmm7 */
+	paddq	xmm1, xmm3
+	xor	ebx, 16
+	mov	eax, ebx
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	movdqu	XMMWORD PTR [rbx+rsi], xmm1	; /* [o^48] = old [o^32] + xmm3 */
+	paddq	xmm0, xmm9
+	movdqu	XMMWORD PTR [rax+rsi], xmm0	; /* [o^16] = old [o^48] + xmm9 */
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [rcx+rsi], xmm0	; /* [o] = AES result xor xmm7 */
+	mov	esi, edx
+	movdqu	xmm5, XMMWORD PTR [r8+rdi]	; /* lane 1: same sequence with r8d / rdi / xmm4 / xmm8 / xmm10 */
+	and	esi, 2097136	; /* lane 0 second offset = low dword of AES result masked with 0x1FFFF0 */
+	mov	ecx, r8d
+	movd	xmm0, r13
+	punpcklqdq xmm4, xmm0	; /* xmm4 = r15 (low) : r13 (high) */
+	xor	r8d, 16
+	aesenc	xmm5, xmm4
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	xor	r8d, 48
+	paddq	xmm0, xmm8
+	movdqu	xmm1, XMMWORD PTR [r8+rdi]
+	movdqu	XMMWORD PTR [r8+rdi], xmm0
+	paddq	xmm1, xmm4
+	xor	r8d, 16
+	mov	eax, r8d
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	movdqu	XMMWORD PTR [r8+rdi], xmm1
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rdi], xmm0
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm8
+	movdqu	XMMWORD PTR [rcx+rdi], xmm0
+	movd	rdi, xmm5	; /* rdi = low qword of the lane 1 AES result */
+	movd	rcx, xmm14	; /* rcx = lane 0 base pointer */
+	mov	ebp, edi
+	mov	r8, QWORD PTR [rcx+rsi]	; /* r8 / r10 = the two qwords at the lane 0 second offset */
+	mov	r10, QWORD PTR [rcx+rsi+8]
+	lea	r9, QWORD PTR [rcx+rsi]	; /* r9 = address of that line, written back in part2 */
+	xor	esi, 16
+
+	movd xmm0, rsp	; /* park rsp and the live GPRs in xmm0-xmm2, xmm11-xmm13 and two frame slots */
+	movd xmm1, rsi
+	movd xmm2, rdi
+	movd xmm11, rbp
+	movd xmm12, r15
+	movd xmm13, rdx
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	mov ebx, DWORD PTR [rsp+16]	; /* four dwords of the lane 0 copy made in part1 */
+	mov esi, DWORD PTR [rsp+20]
+	mov edi, DWORD PTR [rsp+24]
+	mov ebp, DWORD PTR [rsp+28]
+
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+	xor r8, rax	; /* r8 ^= ((edi+ebp) << 32) | (ebx+esi), both sums truncated to 32 bits */
+
+	movd esp, xmm3	; /* rsp now holds data: no stack access is possible until part2 restores it from xmm0 */
+	pextrd r15d, xmm3, 2
+	movd eax, xmm7
+	movd edx, xmm9
+	pextrd r9d, xmm9, 2	; /* esp, r15d, eax, edx, r9d = dwords 0/2 of xmm3, dword 0 of xmm7, dwords 0/2 of xmm9. Presumably inputs for code spliced in before part2 - confirm with the generator */
+
+CryptonightWOW_template_double_part2:	; /* part 2: restore rsp and GPRs, finish lane 0 (multiply, shuffle, state update), then park registers again for lane 1 */
+
+	movd rsp, xmm0	; /* stack pointer is valid again from here */
+	mov DWORD PTR [rsp+16], ebx	; /* write the four lane 0 dwords back to their frame copy */
+	mov DWORD PTR [rsp+20], esi
+	mov DWORD PTR [rsp+24], edi
+	mov DWORD PTR [rsp+28], ebp
+
+	movd rsi, xmm1
+	movd rdi, xmm2
+	movd rbp, xmm11
+	movd r15, xmm12
+	movd rdx, xmm13	; /* rdx = low qword of the lane 0 AES result, parked in the mainloop section */
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rbx, r8	; /* keep r8 for the xor into r14 below */
+	mov	rax, r8
+	mul	rdx	; /* rdx:rax = r8 * rdx (unsigned 64x64) */
+	and	ebp, 2097136	; /* lane 1 second offset, 0x1FFFF0 mask */
+	mov	r8, rax
+	movd	xmm1, rdx
+	movd	xmm0, r8
+	punpcklqdq xmm1, xmm0	; /* xmm1 = product high (low qword) : product low (high qword) */
+	pxor	xmm1, XMMWORD PTR [rcx+rsi]
+	xor	esi, 48
+	paddq	xmm1, xmm7
+	movdqu	xmm2, XMMWORD PTR [rsi+rcx]
+	xor	rdx, QWORD PTR [rsi+rcx]	; /* product halves are xored with the neighbouring 16-byte chunk before the store below overwrites it */
+	paddq	xmm2, xmm3
+	xor	r8, QWORD PTR [rsi+rcx+8]
+	movdqu	XMMWORD PTR [rsi+rcx], xmm1
+	xor	esi, 16
+	mov	eax, esi
+	mov	rsi, rcx	; /* rsi = lane 0 base pointer again for the next iteration */
+	movdqu	xmm0, XMMWORD PTR [rax+rcx]
+	movdqu	XMMWORD PTR [rax+rcx], xmm2
+	paddq	xmm0, xmm9
+	add	r12, r8
+	xor	rax, 32
+	add	r14, rdx
+	movdqa	xmm9, xmm7	; /* rotate history: xmm9 <- xmm7 <- xmm6 (this iteration's AES result) */
+	movdqa	xmm7, xmm6
+	movdqu	XMMWORD PTR [rax+rcx], xmm0
+	mov	QWORD PTR [r9+8], r12	; /* store the updated r14:r12 pair at the address saved in r9 */
+	xor	r12, r10
+	mov	QWORD PTR [r9], r14
+	movd rcx, xmm15	; /* rcx = lane 1 base pointer */
+	xor	r14, rbx
+	mov	r10d, ebp
+	mov	ebx, r14d
+	xor	ebp, 16
+	and	ebx, 2097136	; /* lane 0 offset for the next iteration */
+	mov	r8, QWORD PTR [r10+rcx]
+	mov	r9, QWORD PTR [r10+rcx+8]
+
+	movd xmm0, rsp	; /* park rsp and live GPRs again, this time for lane 1 */
+	movd xmm1, rbx
+	movd xmm2, rsi
+	movd xmm11, rdi
+	movd xmm12, rbp
+	movd xmm13, r15
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	mov ebx, DWORD PTR [rsp]	; /* four dwords of the lane 1 copy made in part1 */
+	mov esi, DWORD PTR [rsp+4]
+	mov edi, DWORD PTR [rsp+8]
+	mov ebp, DWORD PTR [rsp+12]
+
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+
+	xor r8, rax
+	movd xmm3, r8	; /* xmm3 keeps this r8 value; part3 reads it back into rax */
+
+	movd esp, xmm4	; /* rsp holds data again until part3 restores it */
+	pextrd r15d, xmm4, 2
+	movd eax, xmm8
+	movd edx, xmm10
+	pextrd r9d, xmm10, 2
+
+CryptonightWOW_template_double_part3:	; /* part 3: restore rsp and GPRs, finish lane 1, then loop */
+
+	movd rsp, xmm0	; /* stack pointer is valid again from here */
+	mov DWORD PTR [rsp], ebx	; /* write the four lane 1 dwords back to their frame copy */
+	mov DWORD PTR [rsp+4], esi
+	mov DWORD PTR [rsp+8], edi
+	mov DWORD PTR [rsp+12], ebp
+
+	movd rbx, xmm1
+	movd rsi, xmm2
+	movd rdi, xmm11	; /* rdi = low qword of the lane 1 AES result */
+	movd rbp, xmm12
+	movd r15, xmm13
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rax, r8
+	mul	rdi	; /* rdx:rax = r8 * rdi (unsigned 64x64) */
+	movd	xmm1, rdx
+	movd	xmm0, rax
+	punpcklqdq xmm1, xmm0	; /* xmm1 = product high (low qword) : product low (high qword) */
+	mov	rdi, rcx	; /* rdi = lane 1 base pointer again for the next iteration */
+	mov	r8, rax
+	pxor	xmm1, XMMWORD PTR [rbp+rcx]
+	xor	ebp, 48
+	paddq	xmm1, xmm8
+	xor	r8, QWORD PTR [rbp+rcx+8]
+	xor	rdx, QWORD PTR [rbp+rcx]
+	add	r13, r8
+	movdqu	xmm2, XMMWORD PTR [rbp+rcx]
+	add	r15, rdx
+	movdqu	XMMWORD PTR [rbp+rcx], xmm1
+	paddq	xmm2, xmm4
+	xor	ebp, 16
+	mov	eax, ebp
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbp+rcx]
+	movdqu	XMMWORD PTR [rbp+rcx], xmm2
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rcx], xmm0
+	movd rax, xmm3	; /* rax = r8 value stored in xmm3 by part2 */
+	movdqa	xmm10, xmm8	; /* rotate history: xmm10 <- xmm8 <- xmm5 (this iteration's AES result) */
+	mov	QWORD PTR [r10+rcx], r15	; /* store the updated r15:r13 pair at the lane 1 second offset */
+	movdqa	xmm8, xmm5
+	xor	r15, rax
+	mov	QWORD PTR [r10+rcx+8], r13
+	mov	r8d, r15d
+	xor	r13, r9
+	and	r8d, 2097136	; /* lane 1 offset for the next iteration */
+	dec r11d	; /* counter initialised to 524288 in part1 */
+	jnz	CryptonightWOW_template_double_mainloop
+
+CryptonightWOW_template_double_part4:	; /* part 4: epilogue, undoes the part1 prologue */
+
+	mov	rbx, QWORD PTR [rsp+400]	; /* 400 = 24 + 7 pushes (56) + 320: the slot rbx was stored to before the pushes */
+	movaps	xmm6, XMMWORD PTR [rsp+160]	; /* reload xmm6-xmm15 from the same frame slots used in part1 */
+	movaps	xmm7, XMMWORD PTR [rsp+176]
+	movaps	xmm8, XMMWORD PTR [rsp+192]
+	movaps	xmm9, XMMWORD PTR [rsp+208]
+	movaps	xmm10, XMMWORD PTR [rsp+224]
+	movaps	xmm11, XMMWORD PTR [rsp+240]
+	movaps	xmm12, XMMWORD PTR [rsp+256]
+	movaps	xmm13, XMMWORD PTR [rsp+272]
+	movaps	xmm14, XMMWORD PTR [rsp+288]
+	movaps	xmm15, XMMWORD PTR [rsp+304]
+	add	rsp, 320
+	pop	r15	; /* pops mirror the push order in part1 */
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	ret	0
+CryptonightWOW_template_double_end:
--- a/src/crypto/cn/asm/win64/cn2/cnv2_double_main_loop_sandybridge.inc
+++ b/src/crypto/cn/asm/win64/cn2/cnv2_double_main_loop_sandybridge.inc
@@ -0,0 +1,413 @@
+	mov	rdx, [rcx+8]	; /* prologue of the two-lane loop: rcx points to two pointers, rdx = second (lane 1 state). Comments use a form accepted by MASM and by cpp-preprocessed GAS */
+	mov	rcx, [rcx]	; /* rcx = first pointer (lane 0 state) */
+
+	mov	rax, rsp	; /* rax = rsp at entry, used below to address the xmm6-xmm8 save slots */
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184
+
+	stmxcsr DWORD PTR [rsp+272]	; /* save MXCSR; rsp+248 is the return address, so 256..287 lies in the area above it */
+	mov DWORD PTR [rsp+276], 24448	; /* 0x5F80: exception mask bits set, rounding-control field = 10b */
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]	; /* r13 = lane 0 base pointer for the masked offsets, presumably the scratchpad - confirm against the ctx layout */
+	mov	r9, rdx	; /* r9 = lane 1 state pointer */
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx	; /* r8 = lane 0 state pointer */
+	xor	r10, QWORD PTR [rcx]	; /* r10 = qword at +32 xor qword at +0 (lane 0) */
+	mov	r14d, 524288	; /* loop counter 0x80000 */
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]	; /* r11 = qword at +40 xor qword at +8 (lane 0) */
+	mov	rsi, QWORD PTR [rdx+224]	; /* rsi = lane 1 base pointer */
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]	; /* rdi / rbp are the lane 1 counterparts of r10 / r11 */
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]
+	movd	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6	; /* rax-88 = rsp+160; xmm6-xmm15 are saved because the loop overwrites them */
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movd	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136	; /* 0x1FFFF0: 16-byte aligned offset below 2 MiB (lane 0) */
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13	; /* xmm13 stays zero and is copied to clear registers before cvtsi2sd */
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movd	xmm5, QWORD PTR [r8+104]
+	movd	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movd xmm14, rax
+	punpcklqdq xmm14, xmm14	; /* xmm14 = 1 << 52 in both qwords (one unit of the double exponent field) */
+
+	mov eax, 1023
+	shl rax, 52
+	movd xmm12, rax
+	punpcklqdq xmm12, xmm12	; /* xmm12 = 1023 << 52 in both qwords (bit pattern of double 1.0) */
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0	; /* xmm7 = (q48 xor q16) low, (q56 xor q24) high, lane 0 */
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movd	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0	; /* xmm3 = (q80 xor q64) low, (q88 xor q72) high, lane 0 */
+	movd	xmm0, rcx
+	mov	QWORD PTR [rsp], r13	; /* [rsp] keeps the lane 0 base pointer; r13 is reused inside the loop */
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movd	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0	; /* xmm6 / xmm8 are the lane 1 counterparts of xmm7 / xmm3 */
+	movd	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10	; /* lane 0 r10 / r11 also live in memory at [rsp+256] / [rsp+264] */
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movd	xmm8, rax
+	and	ecx, 2097136	; /* lane 1 starting offset */
+	punpcklqdq xmm8, xmm0
+	movd	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0	; /* xmm4 = qword at +96 of lane 0 (low) : lane 1 (high) */
+	movd	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]
+	movdqu	xmm11, XMMWORD PTR [r8]	; /* preload the first lane 1 line */
+	punpcklqdq xmm5, xmm0	; /* xmm5 = qword at +104 of lane 0 (low) : lane 1 (high) */
+	lea	r9, QWORD PTR [rdx+r13]
+	movdqu	xmm15, XMMWORD PTR [r9]	; /* preload the first lane 0 line */
+
+	ALIGN(64)
+main_loop_double_sandybridge:	; /* one iteration for both lanes; xmm15 / xmm11 hold the preloaded 16-byte lines of lane 0 / lane 1 */
+	movdqu	xmm9, xmm15
+	mov eax, edx	; /* with o = edx: eax = o^16, ebx = o^32, edx = o^48 */
+	mov ebx, edx
+	xor eax, 16
+	xor ebx, 32
+	xor edx, 48
+
+	movd	xmm0, r11
+	movd	xmm2, r10
+	punpcklqdq xmm2, xmm0	; /* xmm2 = r10 (low) : r11 (high), used as the round key */
+	aesenc	xmm9, xmm2	; /* one AES round on the lane 0 line */
+
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0	; /* [o^32] = old [o^16] + xmm7 */
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1	; /* [o^48] = old [o^32] + xmm2 */
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0	; /* [o^16] = old [o^48] + xmm3 */
+
+	movd	r11, xmm9	; /* r11 = low qword of the lane 0 AES result */
+	mov	edx, r11d
+	and	edx, 2097136	; /* lane 0 second offset, 0x1FFFF0 mask */
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0	; /* line at the first offset = AES result xor xmm7 */
+
+	lea	rbx, QWORD PTR [rdx+r13]
+	mov	r10, QWORD PTR [rdx+r13]
+
+	movdqu	xmm10, xmm11	; /* lane 1: same sequence with rcx / rsi / xmm11 / xmm6 / xmm8 */
+	movd	xmm0, rbp
+	movd	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movd	rcx, xmm10
+	and	ecx, 2097136	; /* lane 1 second offset */
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov r12, QWORD PTR [rcx+rsi]
+
+	mov	r9, QWORD PTR [rbx+8]
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx
+
+	movd	rdx, xmm5
+	shl	rdx, 32
+	movd	rax, xmm4
+	xor	rdx, rax
+	xor	r10, rdx	; /* r10 ^= (low qword of xmm5 << 32) xor low qword of xmm4 */
+	mov	rax, r10
+	mul	r11	; /* rdx:rax = r10 * r11 (unsigned 64x64) */
+	mov r11d, r8d
+	xor r11d, 48
+	movd xmm0, rdx
+	xor rdx, [r11+r13]
+	movd xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1	; /* xmm0 = product high (low qword) : product low (high qword), taken before the xors above */
+
+	pxor xmm0, XMMWORD PTR [r8+r13]
+	xor	r8d, 32
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [r15+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]	; /* lane 0 register pair is reloaded from its memory home */
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11	; /* store the updated pair at the lane 0 second offset */
+	xor	r11, r10
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11	; /* [rsp+8] = lane 0 offset for the next iteration */
+	lea	r15, QWORD PTR [r11+r13]
+	movdqu xmm15, XMMWORD PTR [r11+r13]	; /* preload the next lane 0 line */
+	lea	r13, QWORD PTR [rsi+rcx]	; /* r13 now addresses the lane 1 second line; the lane 0 base is reloaded from [rsp] at the loop tail */
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movd	r10, xmm0
+	psllq	xmm5, 1
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movd	r11, xmm0	; /* r11 = high qword of the lane 0 AES result (dividend) */
+	psrldq	xmm1, 8
+	movd	r8, xmm1	; /* r8 = high qword of the lane 1 AES result (dividend) */
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movd	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10
+	paddq	xmm5, xmm3
+	movd	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax	; /* both divisions are estimated in double precision, then checked with integer arithmetic */
+	or	edx, -2147483647	; /* 0x80000001: set bit 31 and bit 0 of the 32-bit divisor (lane 0) */
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movd	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647	; /* same for the lane 1 divisor */
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14	; /* add 1 << 52 to each qword, i.e. bump each double's exponent field by one */
+	cvttsd2si rax, xmm2
+	psrldq	xmm2, 8
+	mov	rbx, rax	; /* rbx = lane 0 quotient estimate */
+	imul	rax, rdx
+	sub	r11, rax	; /* r11 = dividend - estimate * divisor */
+	js	div_fix_1_sandybridge	; /* negative remainder: correct the estimate */
+div_fix_1_ret_sandybridge:
+
+	cvttsd2si rdx, xmm2	; /* rdx = lane 1 quotient estimate */
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2	; /* xmm4 dwords = quotient0, remainder0, quotient1, remainder1 */
+	paddq	xmm3, xmm4
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12	; /* add 1023 << 52 so each qword can be used as a double */
+	sqrtpd	xmm1, xmm0
+	movd	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19	; /* xmm5 = sqrt bit patterns >> 19, both lanes */
+	test	r9, 524287	; /* low 19 bits (0x7FFFF) all zero takes the fix-up path */
+	je	sqrt_fix_1_sandybridge
+sqrt_fix_1_ret_sandybridge:
+
+	movd r9, xmm10	; /* r9 = low qword of the lane 1 AES result */
+	psrldq	xmm1, 8
+	movd	r8, xmm1
+	test	r8, 524287
+	je	sqrt_fix_2_sandybridge
+sqrt_fix_2_ret_sandybridge:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9	; /* rdx:rax = r10 * r9 (lane 1 multiply) */
+	movd xmm0, rax
+	movd xmm3, rdx
+	punpcklqdq xmm3, xmm0
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm3, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm0
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm3
+
+	add	rdi, rdx
+	mov	QWORD PTR [r13], rdi	; /* store the updated lane 1 pair at its second offset */
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136	; /* lane 1 offset for the next iteration */
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]	
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]	; /* preload the next lane 1 line */
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]	; /* r13 = lane 0 base pointer again */
+	movdqa	xmm3, xmm7	; /* rotate history: xmm3 <- xmm7 <- xmm9 and xmm8 <- xmm6 <- xmm10 */
+	mov	rdx, QWORD PTR [rsp+8]
+	movdqa	xmm8, xmm6
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d	; /* counter initialised to 524288 in the prologue */
+	jne	main_loop_double_sandybridge
+
+	ldmxcsr DWORD PTR [rsp+272]	; /* epilogue: restore the MXCSR value saved on entry */
+	movaps	xmm13, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+184]	; /* r11 = rsp after the pushes; r11-24 = rsp+160 is the xmm6 slot */
+	movaps	xmm6, XMMWORD PTR [r11-24]
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp cnv2_double_mainloop_asm_sandybridge_endp	; /* jump over the out-of-line fix-up code to the end label */
+
+div_fix_1_sandybridge:	; /* reached when the lane 0 remainder in r11 went negative: the quotient estimate was too large by one */
+	dec	rbx	; /* quotient -= 1 */
+	add	r11, rdx	; /* remainder += divisor */
+	jmp	div_fix_1_ret_sandybridge
+
+div_fix_2_sandybridge:	; /* reached when the lane 1 remainder in r8 went negative: the quotient estimate was too large by one */
+	dec	rdx	; /* quotient -= 1 */
+	add	r8, r9	; /* remainder += divisor */
+	jmp	div_fix_2_ret_sandybridge
+
+sqrt_fix_1_sandybridge:	; /* reached when the low 19 bits of the lane 0 sqrt bit pattern (r9) are zero; recomputes the low qword of xmm5 with integer arithmetic */
+	movd	r8, xmm3	; /* r8 = low qword of xmm3, the value the sqrt input was derived from */
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8	; /* xmm0 = lane 1 half of xmm5, merged back at the end */
+	dec	r9
+	mov r11d, -1022
+	shl r11, 32	; /* r11 = 0xFFFFFC0200000000, i.e. -1022 * 2^32 */
+	mov	rax, r9
+	shr	r9, 19	; /* r9 = (pattern - 1) >> 19, the candidate result */
+	shr	rax, 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1]
+	add	rax, r11
+	imul	rdx, rax	; /* rdx = (r9 - rax + r11 + 1) * (rax + r11) */
+	sub	rdx, r8	; /* only the borrow is used */
+	adc	r9, 0	; /* candidate += 1 when the product is below r8 (unsigned) */
+	movd xmm5, r9
+	punpcklqdq xmm5, xmm0	; /* xmm5 = corrected lane 0 value (low) : untouched lane 1 value (high) */
+	jmp	sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge:	; /* lane 1 counterpart of sqrt_fix_1: r8 holds the sqrt bit pattern; uses rbx as scratch */
+	psrldq	xmm3, 8
+	movd	r11, xmm3	; /* r11 = high qword of xmm3 (lane 1 input) */
+	dec	r8
+	mov ebx, -1022
+	shl rbx, 32	; /* rbx = 0xFFFFFC0200000000, i.e. -1022 * 2^32 */
+	mov	rax, r8
+	shr	r8, 19	; /* r8 = (pattern - 1) >> 19, the candidate result */
+	shr	rax, 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1]
+	add	rax, rbx
+	imul	rdx, rax	; /* rdx = (r8 - rax + rbx + 1) * (rax + rbx) */
+	sub	rdx, r11	; /* only the borrow is used */
+	adc	r8, 0	; /* candidate += 1 when the product is below r11 (unsigned) */
+	movd xmm0, r8
+	punpcklqdq xmm5, xmm0	; /* xmm5 = existing low qword : corrected lane 1 value (high) */
+	jmp	sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
--- a/src/crypto/cn/asm/win64/cn2/cnv2_main_loop_bulldozer.inc
+++ b/src/crypto/cn/asm/win64/cn2/cnv2_main_loop_bulldozer.inc
@@ -0,0 +1,182 @@
+	mov	rcx, [rcx]	; /* prologue of the single-lane loop: rcx points to a pointer, rcx = that pointer (state). Comments use a form accepted by MASM and by cpp-preprocessed GAS */
+
+	mov	QWORD PTR [rsp+16], rbx	; /* rbx, rbp, rsi are parked in the slots above the return address */
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64	; /* frame: MXCSR at 0..7, xmm8 / xmm7 / xmm6 at 16 / 32 / 48 */
+
+	stmxcsr DWORD PTR [rsp]	; /* save MXCSR */
+	mov DWORD PTR [rsp+4], 24448	; /* 0x5F80: exception mask bits set, rounding-control field = 10b */
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx	; /* r9 keeps the state pointer while rcx is reused */
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288	; /* loop counter 0x80000 */
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]	; /* r8 = qword at +32 xor qword at +0 */
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movd	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]	; /* r11 = qword at +40 xor qword at +8 */
+	mov	rbx, QWORD PTR [rcx+224]	; /* rbx = base pointer for the masked offsets, presumably the scratchpad - confirm against the ctx layout */
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movd	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136	; /* 0x1FFFF0: 16-byte aligned offset below 2 MiB */
+	movaps	XMMWORD PTR [rsp+48], xmm6	; /* xmm6-xmm8 are overwritten below and restored in the epilogue */
+	movd	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8	; /* xmm8 stays zero; the loop copies it into xmm1 before sqrtsd */
+	mov ax, 1023	; /* only bits 0..11 of rax survive the shift by 52, so the stale upper bits do not matter */
+	shl rax, 52
+	movd xmm7, rax	; /* xmm7 = 1023 << 52; the loop in this file rebuilds the same constant in rdi instead */
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0	; /* xmm3 = (q48 xor q16) low, (q56 xor q24) high */
+	movd	xmm0, rcx
+	punpcklqdq xmm4, xmm0	; /* xmm4 = (q80 xor q64) low, (q88 xor q72) high */
+
+	ALIGN(64)
+cnv2_main_loop_bulldozer:	; /* one iteration: AES round, line shuffle, integer divide and sqrt, multiply, second shuffle, state update */
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]	; /* aligned 16-byte load at offset o (r10d) */
+	movd xmm6, r8
+	pinsrq xmm6, r11, 1	; /* xmm6 = r8 (low) : r11 (high), used as the round key */
+	lea	rdx, QWORD PTR [r10+rbx]	; /* rdx = address of the line, stored to below */
+	lea	r9, QWORD PTR [rdi+rdi]
+	shl	rdi, 32
+
+	mov	ecx, r10d	; /* ecx = o^16, eax = o^32, r10d = o^48 */
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6	; /* one AES round */
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0	; /* [o^16] = old [o^48] + xmm4 */
+	movdqa	XMMWORD PTR [rax+rbx], xmm2	; /* [o^32] = old [o^16] + xmm3 */
+	movdqa	XMMWORD PTR [r10+rbx], xmm1	; /* [o^48] = old [o^32] + xmm6 */
+
+	movaps	xmm1, xmm8	; /* clear xmm1 before sqrtsd writes its low qword */
+	mov	rsi, r15
+	xor	rsi, rdi	; /* rsi = r15 xor (rdi << 32) */
+
+	mov edi, 1023
+	shl rdi, 52	; /* rdi = 1023 << 52 (bit pattern of double 1.0) */
+
+	movd	r14, xmm5	; /* r14 = low qword of the AES result */
+	pextrq rax, xmm5, 1	; /* rax = high qword of the AES result (dividend) */
+
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136	; /* second offset, 0x1FFFF0 mask */
+	movdqa	XMMWORD PTR [rdx], xmm0	; /* line at the first offset = AES result xor xmm3 */
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647	; /* 0x80000001: set bit 31 and bit 0 of the 32-bit divisor */
+	xor	edx, edx
+	div	r9	; /* rdx:rax / r9 with rdx = 0 */
+	mov	eax, eax	; /* keep the low 32 bits of the quotient */
+	shl	rdx, 32
+	lea	r15, [rax+rdx]	; /* r15 = (remainder << 32) + low dword of quotient */
+	lea	rax, [r14+r15]
+	shr	rax, 12
+	add	rax, rdi
+	movd	xmm0, rax
+	sqrtsd	xmm1, xmm0
+	movd	rdi, xmm1	; /* rdi = bit pattern of the square root */
+	test	rdi, 524287	; /* low 19 bits (0x7FFFF) all zero takes the fix-up path */
+	je	sqrt_fixup_bulldozer
+	shr	rdi, 19
+
+sqrt_fixup_bulldozer_ret:
+	mov	rax, rsi
+	mul	r14	; /* rdx:rax = rsi * r14 (unsigned 64x64) */
+	movd xmm1, rax
+	movd xmm0, rdx
+	punpcklqdq xmm0, xmm1	; /* xmm0 = product high (low qword) : product low (high qword) */
+
+	mov	r9d, r10d	; /* with o = r10d: r9d = o^16, ecx = o^32, r10d = o^48 */
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	xor rdx, [rcx+rbx]
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0
+	paddq xmm4, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3	; /* rotate history: xmm4 <- xmm3 <- xmm5 */
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8	; /* store the updated r8:r11 pair at the second offset */
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136	; /* offset for the next iteration */
+	movdqa	xmm3, xmm5
+	dec	ebp	; /* counter initialised to 524288 in the prologue */
+	jne	cnv2_main_loop_bulldozer
+
+	ldmxcsr DWORD PTR [rsp]	; /* epilogue: restore the MXCSR value saved on entry */
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]	; /* r11 = rsp after the pushes; r11+56 is the slot rbx was stored to (16 + 5 pushes) */
+	mov	rbx, QWORD PTR [r11+56]
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_bulldozer_endp	; /* jump over the out-of-line fix-up code to the end label */
+
+sqrt_fixup_bulldozer:	; /* reached when the low 19 bits of the sqrt bit pattern (rdi) are zero; recomputes rdi with integer arithmetic */
+	movd r9, xmm5
+	add r9, r15	; /* r9 = low qword of AES result + r15, the value the sqrt input was derived from */
+	dec	rdi
+	mov edx, -1022
+	shl rdx, 32	; /* rdx = 0xFFFFFC0200000000, i.e. -1022 * 2^32 */
+	mov	rax, rdi
+	shr	rdi, 19	; /* rdi = (pattern - 1) >> 19, the candidate result */
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	lea	rcx, [rcx+rdx+1]
+	add	rax, rdx
+	imul	rcx, rax	; /* rcx = (rdi - rax + rdx + 1) * (rax + rdx) */
+	sub	rcx, r9	; /* only the borrow is used */
+	adc	rdi, 0	; /* candidate += 1 when the product is below r9 (unsigned) */
+	jmp	sqrt_fixup_bulldozer_ret
+
+cnv2_main_loop_bulldozer_endp:
--- a/src/crypto/cn/asm/win64/cn2/cnv2_main_loop_ivybridge.inc
+++ b/src/crypto/cn/asm/win64/cn2/cnv2_main_loop_ivybridge.inc
@@ -0,0 +1,188 @@
+	mov	rcx, [rcx]	; /* prologue of the single-lane loop: rcx points to a pointer, rcx = that pointer (state). Comments use a form accepted by MASM and by cpp-preprocessed GAS */
+
+	mov	 QWORD PTR [rsp+24], rbx	; /* rbx is parked above the return address and reloaded from [rsp+160] in the epilogue (24 + 7 pushes + 80) */
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80	; /* frame: MXCSR at 0..7, zero qword at 16, xmm8 / xmm7 / xmm6 at 32 / 48 / 64 */
+
+	stmxcsr DWORD PTR [rsp]	; /* save MXCSR */
+	mov DWORD PTR [rsp+4], 24448	; /* 0x5F80: exception mask bits set, rounding-control field = 10b */
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx	; /* r9 keeps the state pointer while rcx is reused */
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 524288	; /* loop counter 0x80000 */
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647	; /* r13d = 0x80000001, ORed into the divisor inside the loop */
+	xor	 r8, QWORD PTR [rcx]	; /* r8 = qword at +32 xor qword at +0 */
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movd	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]	; /* r11 = qword at +40 xor qword at +8 */
+	mov	 rbx, QWORD PTR [rcx+224]	; /* rbx = base pointer for the masked offsets, presumably the scratchpad - confirm against the ctx layout */
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movd	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movd	 xmm3, QWORD PTR [r9+104]
+	movaps	 XMMWORD PTR [rsp+64], xmm6	; /* xmm6-xmm8 are overwritten below and restored in the epilogue */
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136	; /* 0x1FFFF0: 16-byte aligned offset below 2 MiB */
+	movd	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax	; /* zero qword at [rsp+16]; the loop subtracts the 16 bytes at this address from xmm3 */
+
+	mov ax, 1023	; /* rax was just zeroed, so rax = 1023 */
+	shl rax, 52
+	movd xmm8, rax	; /* xmm8 = 1023 << 52 (bit pattern of double 1.0) */
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0	; /* xmm4 = (q48 xor q16) low, (q56 xor q24) high */
+	movd	 xmm0, rcx
+	punpcklqdq xmm5, xmm0	; /* xmm5 = (q80 xor q64) low, (q88 xor q72) high */
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]	; /* preload the first line */
+
+	ALIGN(64)
+main_loop_ivybridge:	; /* one iteration; xmm6 holds the preloaded 16-byte line at offset o (r10d) */
+	lea	 rdx, QWORD PTR [r10+rbx]	; /* rdx = address of the line, stored to below */
+	mov	 ecx, r10d	; /* ecx = o^16, eax = o^32, r10d = o^48 */
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16
+	xor	 eax, 32
+	xor	 r10d, 48
+	movd	 xmm0, r11
+	movd	 xmm7, r8
+	punpcklqdq xmm7, xmm0	; /* xmm7 = r8 (low) : r11 (high), used as the round key */
+	aesenc	 xmm6, xmm7	; /* one AES round */
+	movd	 rbp, xmm6	; /* rbp = low qword of the AES result */
+	mov	 r9, rbp
+	and	 r9d, 2097136	; /* second offset, 0x1FFFF0 mask */
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm1, xmm7
+	paddq	 xmm0, xmm5
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0	; /* [o^16] = old [o^48] + xmm5 */
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2	; /* [o^32] = old [o^16] + xmm4 */
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1	; /* [o^48] = old [o^32] + xmm7 */
+	mov r10, r9
+	xor r10d, 32
+	movd	 rcx, xmm3
+	mov	 rax, rcx
+	shl	 rax, 32
+	xor	 rdi, rax	; /* rdi = r15 xor (low qword of xmm3 << 32) */
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	movdqu	 XMMWORD PTR [rdx], xmm0	; /* line at the first offset = AES result xor xmm4 */
+	xor	 rdi, QWORD PTR [r9+rbx]
+	lea	 r14, QWORD PTR [r9+rbx]
+	mov	 r12, QWORD PTR [r14+8]
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d	; /* r13d = 0x80000001: set bit 31 and bit 0 of the 32-bit divisor */
+	movd	 rax, xmm0	; /* rax = high qword of the AES result (dividend) */
+	div	 r9	; /* rdx:rax / r9 with rdx = 0 */
+	xorps xmm3, xmm3
+	mov	 eax, eax	; /* keep the low 32 bits of the quotient */
+	shl	 rdx, 32
+	add	 rdx, rax
+	lea	 r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx	; /* r15 = (remainder << 32) + low dword of quotient */
+	mov	 rax, r9
+	shr	 rax, 12
+	movd	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	psubq	 xmm3, XMMWORD PTR [rsp+16]	; /* the qword at [rsp+16] was zeroed in the prologue, so the low qword of xmm3 is unchanged */
+	movd	 rdx, xmm3	; /* rdx = bit pattern of the square root */
+	test	 edx, 524287	; /* low 19 bits (0x7FFFF) all zero takes the fix-up path */
+	je	 sqrt_fixup_ivybridge
+	psrlq	 xmm3, 19
+sqrt_fixup_ivybridge_ret:
+
+	mov	 ecx, r10d
+	mov	 rax, rdi
+	mul	 rbp	; /* rdx:rax = rdi * rbp (unsigned 64x64) */
+	movd xmm2, rdx
+	xor rdx, [rcx+rbx]
+	add	 r8, rdx
+	mov	 QWORD PTR [r14], r8	; /* store the updated r8 at the second offset */
+	xor	 r8, rdi
+	mov edi, r8d
+	and edi, 2097136	; /* offset for the next iteration */
+	movd xmm0, rax
+	xor rax, [rcx+rbx+8]
+	add	 r11, rax
+	mov	 QWORD PTR [r14+8], r11
+	punpcklqdq xmm2, xmm0	; /* xmm2 = product high (low qword) : product low (high qword), taken before the xors above */
+
+	mov	 r9d, r10d
+	xor	 r9d, 48
+	xor	 r10d, 16
+	pxor	 xmm2, XMMWORD PTR [r9+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm0, xmm5
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm4
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4	; /* rotate history: xmm5 <- xmm4 <- xmm6 */
+	movdqu	 XMMWORD PTR [r9+rbx], xmm0
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	movdqu xmm6, [rdi+rbx]	; /* preload the next line */
+	mov	 r10d, edi
+	xor	 r11, r12
+	dec rsi	; /* counter initialised to 524288 in the prologue */
+	jne	 main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp]	; /* epilogue: restore the MXCSR value saved on entry */
+	mov	 rbx, QWORD PTR [rsp+160]
+	movaps	 xmm6, XMMWORD PTR [rsp+64]
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80
+	pop	 r15
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_main_loop_ivybridge_endp	; /* jump over the out-of-line fix-up code to the end label */
+
+sqrt_fixup_ivybridge:	; /* reached when the low 19 bits of the sqrt bit pattern (rdx) are zero; recomputes xmm3 with integer arithmetic, r9 is the value the sqrt input was derived from */
+	dec	 rdx
+	mov r13d, -1022	; /* r13 is borrowed as scratch and reset to 0x80000001 below */
+	shl r13, 32	; /* r13 = 0xFFFFFC0200000000, i.e. -1022 * 2^32 */
+	mov	 rax, rdx
+	shr	 rdx, 19	; /* rdx = (pattern - 1) >> 19, the candidate result */
+	shr	 rax, 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	not r13
+	sub	 rcx, r13	; /* subtracting NOT r13 adds r13 + 1 */
+	mov	 r13d, -2147483647	; /* restore the divisor mask expected by the main loop */
+	imul	 rcx, rax
+	sub	 rcx, r9	; /* only the borrow is used */
+	adc	 rdx, 0	; /* candidate += 1 when the product is below r9 (unsigned) */
+	movd	 xmm3, rdx
+	jmp	 sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:
--- a/src/crypto/cn/asm/win64/cn2/cnv2_main_loop_ryzen.inc
+++ b/src/crypto/cn/asm/win64/cn2/cnv2_main_loop_ryzen.inc
@@ -0,0 +1,181 @@
+	mov	rcx, [rcx]	; /* prologue of the single-lane loop: rcx points to a pointer, rcx = that pointer (state). Comments use a form accepted by MASM and by cpp-preprocessed GAS */
+
+	mov	QWORD PTR [rsp+16], rbx	; /* rbx, rbp, rsi are parked in the slots above the return address */
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64	; /* frame: MXCSR at 0..7, xmm8 / xmm7 / xmm6 at 16 / 32 / 48 */
+
+	stmxcsr DWORD PTR [rsp]	; /* save MXCSR */
+	mov DWORD PTR [rsp+4], 24448	; /* 0x5F80: exception mask bits set, rounding-control field = 10b */
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx	; /* r9 keeps the state pointer while rcx is reused */
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288	; /* loop counter 0x80000 */
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]	; /* r8 = qword at +32 xor qword at +0 */
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movd	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]	; /* r11 = qword at +40 xor qword at +8 */
+	mov	rbx, QWORD PTR [rcx+224]	; /* rbx = base pointer for the masked offsets, presumably the scratchpad - confirm against the ctx layout */
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movd	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136	; /* 0x1FFFF0: 16-byte aligned offset below 2 MiB */
+	movaps	XMMWORD PTR [rsp+48], xmm6	; /* xmm6-xmm8 are overwritten below and restored in the epilogue */
+	movd	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8	; /* xmm8 stays zero; the loop copies it into xmm1 */
+	mov ax, 1023	; /* only bits 0..11 of rax survive the shift by 52, so the stale upper bits do not matter */
+	shl rax, 52
+	movd xmm7, rax	; /* xmm7 = 1023 << 52 (bit pattern of double 1.0), added before sqrtsd in the loop */
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0	; /* xmm3 = (q48 xor q16) low, (q56 xor q24) high */
+	movd	xmm0, rcx
+	punpcklqdq xmm4, xmm0	; /* xmm4 = (q80 xor q64) low, (q88 xor q72) high */
+
+	ALIGN(64)
+main_loop_ryzen:	; /* one iteration: AES round, line shuffle, integer divide and sqrt, multiply, second shuffle, state update */
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]	; /* aligned 16-byte load at offset o (r10d) */
+	movd	xmm0, r11
+	movd	xmm6, r8
+	punpcklqdq xmm6, xmm0	; /* xmm6 = r8 (low) : r11 (high), used as the round key */
+	lea	rdx, QWORD PTR [r10+rbx]	; /* rdx = address of the line, stored to below */
+	lea	r9, QWORD PTR [rdi+rdi]
+	shl	rdi, 32
+
+	mov	ecx, r10d	; /* ecx = o^16, eax = o^32, r10d = o^48 */
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6	; /* one AES round */
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0	; /* [o^16] = old [o^48] + xmm4 */
+	movdqa	XMMWORD PTR [rax+rbx], xmm2	; /* [o^32] = old [o^16] + xmm3 */
+	movdqa	XMMWORD PTR [r10+rbx], xmm1	; /* [o^48] = old [o^32] + xmm6 */
+
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi	; /* rsi = r15 xor (rdi << 32) */
+	movd	r14, xmm5	; /* r14 = low qword of the AES result */
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136	; /* second offset, 0x1FFFF0 mask */
+	movdqa	XMMWORD PTR [rdx], xmm0	; /* line at the first offset = AES result xor xmm3 */
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647	; /* 0x80000001: set bit 31 and bit 0 of the 32-bit divisor */
+	xor	edx, edx
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movd	rax, xmm0	; /* rax = high qword of the AES result (dividend) */
+
+	div	r9	; /* rdx:rax / r9 with rdx = 0 */
+	movd xmm0, rax
+	movd xmm1, rdx
+	punpckldq xmm0, xmm1	; /* low qword of xmm0 = low dword of quotient : low dword of remainder */
+	movd r15, xmm0
+	paddq xmm0, xmm5
+	movdqa xmm2, xmm0	; /* xmm2 keeps the sum; its low qword is read by sqrt_fixup_ryzen */
+	psrlq xmm0, 12
+	paddq	xmm0, xmm7
+	sqrtsd	xmm1, xmm0
+	movd	rdi, xmm1	; /* rdi = bit pattern of the square root */
+	test	rdi, 524287	; /* low 19 bits (0x7FFFF) all zero takes the fix-up path */
+	je	sqrt_fixup_ryzen
+	shr	rdi, 19
+
+sqrt_fixup_ryzen_ret:
+	mov	rax, rsi
+	mul	r14	; /* rdx:rax = rsi * r14 (unsigned 64x64) */
+	movd xmm1, rax
+	movd xmm0, rdx
+	punpcklqdq xmm0, xmm1	; /* xmm0 = product high (low qword) : product low (high qword) */
+
+	mov	r9d, r10d	; /* with o = r10d: r9d = o^16, ecx = o^32, r10d = o^48 */
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	xor rdx, [rcx+rbx]
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0
+	paddq xmm4, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3	; /* rotate history: xmm4 <- xmm3 <- xmm5 */
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8	; /* store the updated r8:r11 pair at the second offset */
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136	; /* offset for the next iteration */
+	movdqa	xmm3, xmm5
+	dec	ebp	; /* counter initialised to 524288 in the prologue */
+	jne	main_loop_ryzen
+
+	ldmxcsr DWORD PTR [rsp]	; /* epilogue: restore the MXCSR value saved on entry */
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]	; /* r11 = rsp after the pushes; r11+56 is the slot rbx was stored to (16 + 5 pushes) */
+	mov	rbx, QWORD PTR [r11+56]
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_ryzen_endp	; /* jump over the out-of-line fix-up code to the end label */
+
+sqrt_fixup_ryzen:	; /* reached when the low 19 bits of the sqrt bit pattern (rdi) are zero; recomputes rdi with integer arithmetic */
+	movd r9, xmm2	; /* r9 = low qword of xmm2, the value the sqrt input was derived from */
+	dec	rdi
+	mov edx, -1022
+	shl rdx, 32	; /* rdx = 0xFFFFFC0200000000, i.e. -1022 * 2^32 */
+	mov	rax, rdi
+	shr	rdi, 19	; /* rdi = (pattern - 1) >> 19, the candidate result */
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	lea	rcx, [rcx+rdx+1]
+	add	rax, rdx
+	imul	rcx, rax	; /* rcx = (rdi - rax + rdx + 1) * (rax + rdx) */
+	sub	rcx, r9	; /* only the borrow is used */
+	adc	rdi, 0	; /* candidate += 1 when the product is below r9 (unsigned) */
+	jmp	sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp:
--- a/src/crypto/cn/asm/win64/cn2/cnv2_rwz_double_main_loop.inc
+++ b/src/crypto/cn/asm/win64/cn2/cnv2_rwz_double_main_loop.inc
@@ -0,0 +1,413 @@
+	mov	rdx, [rcx+8]	; /* prologue of the two-lane loop: rcx points to two pointers, rdx = second (lane 1 state). Comments use a form accepted by MASM and by cpp-preprocessed GAS */
+	mov	rcx, [rcx]	; /* rcx = first pointer (lane 0 state) */
+
+	mov	rax, rsp	; /* rax = rsp at entry, used below to address the xmm6-xmm8 save slots */
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184
+
+	stmxcsr DWORD PTR [rsp+272]	; /* save MXCSR; rsp+248 is the return address, so 256..287 lies in the area above it */
+	mov DWORD PTR [rsp+276], 24448	; /* 0x5F80: exception mask bits set, rounding-control field = 10b */
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]	; /* r13 = lane 0 base pointer for the masked offsets, presumably the scratchpad - confirm against the ctx layout */
+	mov	r9, rdx	; /* r9 = lane 1 state pointer */
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx	; /* r8 = lane 0 state pointer */
+	xor	r10, QWORD PTR [rcx]	; /* r10 = qword at +32 xor qword at +0 (lane 0) */
+	mov	r14d, 393216	; /* loop counter 0x60000 */
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]	; /* r11 = qword at +40 xor qword at +8 (lane 0) */
+	mov	rsi, QWORD PTR [rdx+224]	; /* rsi = lane 1 base pointer */
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]	; /* rdi / rbp are the lane 1 counterparts of r10 / r11 */
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]
+	movd	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6	; /* rax-88 = rsp+160; xmm6-xmm15 are saved because the loop overwrites them */
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movd	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136	; /* 0x1FFFF0: 16-byte aligned offset below 2 MiB (lane 0) */
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13	; /* xmm13 stays zero and is copied to clear registers before cvtsi2sd */
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movd	xmm5, QWORD PTR [r8+104]
+	movd	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movd xmm14, rax
+	punpcklqdq xmm14, xmm14	; /* xmm14 = 1 << 52 in both qwords (one unit of the double exponent field) */
+
+	mov eax, 1023
+	shl rax, 52
+	movd xmm12, rax
+	punpcklqdq xmm12, xmm12	; /* xmm12 = 1023 << 52 in both qwords (bit pattern of double 1.0) */
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0	; /* xmm7 = (q48 xor q16) low, (q56 xor q24) high, lane 0 */
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movd	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0	; /* xmm3 = (q80 xor q64) low, (q88 xor q72) high, lane 0 */
+	movd	xmm0, rcx
+	mov	QWORD PTR [rsp], r13	; /* [rsp] keeps the lane 0 base pointer; r13 is reused inside the loop */
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movd	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0	; /* xmm6 / xmm8 are the lane 1 counterparts of xmm7 / xmm3 */
+	movd	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10	; /* lane 0 r10 / r11 also live in memory at [rsp+256] / [rsp+264] */
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movd	xmm8, rax
+	and	ecx, 2097136	; /* lane 1 starting offset */
+	punpcklqdq xmm8, xmm0
+	movd	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0	; /* xmm4 = qword at +96 of lane 0 (low) : lane 1 (high) */
+	movd	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]
+	movdqu	xmm11, XMMWORD PTR [r8]	; /* preload the first lane 1 line */
+	punpcklqdq xmm5, xmm0	; /* xmm5 = qword at +104 of lane 0 (low) : lane 1 (high) */
+	lea	r9, QWORD PTR [rdx+r13]
+	movdqu	xmm15, XMMWORD PTR [r9]	; /* preload the first lane 0 line */
+
+	ALIGN(64)
+rwz_main_loop_double:	; /* one iteration for both lanes; xmm15 / xmm11 hold the preloaded 16-byte lines of lane 0 / lane 1 */
+	movdqu	xmm9, xmm15
+	mov eax, edx	; /* with o = edx: eax = o^16, ebx = o^32, edx = o^48 */
+	mov ebx, edx
+	xor eax, 16
+	xor ebx, 32
+	xor edx, 48
+
+	movd	xmm0, r11
+	movd	xmm2, r10
+	punpcklqdq xmm2, xmm0	; /* xmm2 = r10 (low) : r11 (high), used as the round key */
+	aesenc	xmm9, xmm2	; /* one AES round on the lane 0 line */
+
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0	; /* [o^32] = old [o^48] + xmm7 */
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1	; /* [o^48] = old [o^32] + xmm2 */
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0	; /* [o^16] = old [o^16] + xmm3 */
+
+	movd	r11, xmm9	; /* r11 = low qword of the lane 0 AES result */
+	mov	edx, r11d
+	and	edx, 2097136	; /* lane 0 second offset, 0x1FFFF0 mask */
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0	; /* line at the first offset = AES result xor xmm7 */
+
+	lea	rbx, QWORD PTR [rdx+r13]
+	mov	r10, QWORD PTR [rdx+r13]
+
+	movdqu	xmm10, xmm11	; /* lane 1: same sequence with rcx / rsi / xmm11 / xmm6 / xmm8 */
+	movd	xmm0, rbp
+	movd	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movd	rcx, xmm10
+	and	ecx, 2097136	; /* lane 1 second offset */
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov r12, QWORD PTR [rcx+rsi]
+
+	mov	r9, QWORD PTR [rbx+8]
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx
+
+	movd	rdx, xmm5
+	shl	rdx, 32
+	movd	rax, xmm4
+	xor	rdx, rax
+	xor	r10, rdx	; /* r10 ^= (low qword of xmm5 << 32) xor low qword of xmm4 */
+	mov	rax, r10
+	mul	r11	; /* rdx:rax = r10 * r11 (unsigned 64x64) */
+	mov r11d, r8d
+	xor r11d, 48
+	movd xmm0, rdx
+	xor rdx, [r11+r13]
+	movd xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1	; /* xmm0 = product high (low qword) : product low (high qword), taken before the xors above */
+
+	pxor xmm0, XMMWORD PTR [r8+r13]
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm3
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r8+r13], xmm0
+	xor	r8d, 32
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm7
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]	; /* lane 0 register pair is reloaded from its memory home */
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11	; /* store the updated pair at the lane 0 second offset */
+	xor	r11, r10
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11	; /* [rsp+8] = lane 0 offset for the next iteration */
+	lea	r15, QWORD PTR [r11+r13]
+	movdqu xmm15, XMMWORD PTR [r11+r13]	; /* preload the next lane 0 line */
+	lea	r13, QWORD PTR [rsi+rcx]	; /* r13 now addresses the lane 1 second line; the lane 0 base is reloaded from [rsp] at the loop tail */
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movd	r10, xmm0
+	psllq	xmm5, 1
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movd	r11, xmm0	; /* r11 = high qword of the lane 0 AES result (dividend) */
+	psrldq	xmm1, 8
+	movd	r8, xmm1	; /* r8 = high qword of the lane 1 AES result (dividend) */
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movd	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10
+	paddq	xmm5, xmm3
+	movd	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax	; /* both divisions are estimated in double precision, then checked with integer arithmetic */
+	or	edx, -2147483647	; /* 0x80000001: set bit 31 and bit 0 of the 32-bit divisor (lane 0) */
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movd	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647	; /* same for the lane 1 divisor */
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14	; /* add 1 << 52 to each qword, i.e. bump each double's exponent field by one */
+	cvttsd2si rax, xmm2
+	psrldq	xmm2, 8
+	mov	rbx, rax	; /* rbx = lane 0 quotient estimate */
+	imul	rax, rdx
+	sub	r11, rax	; /* r11 = dividend - estimate * divisor */
+	js	rwz_div_fix_1	; /* negative remainder: correct the estimate */
+rwz_div_fix_1_ret:
+
+	cvttsd2si rdx, xmm2	; /* rdx = lane 1 quotient estimate */
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	rwz_div_fix_2
+rwz_div_fix_2_ret:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2	; /* xmm4 dwords = quotient0, remainder0, quotient1, remainder1 */
+	paddq	xmm3, xmm4
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12	; /* add 1023 << 52 so each qword can be used as a double */
+	sqrtpd	xmm1, xmm0
+	movd	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19	; /* xmm5 = sqrt bit patterns >> 19, both lanes */
+	test	r9, 524287	; /* low 19 bits (0x7FFFF) all zero takes the fix-up path */
+	je	rwz_sqrt_fix_1
+rwz_sqrt_fix_1_ret:
+
+	movd r9, xmm10	; /* r9 = low qword of the lane 1 AES result */
+	psrldq	xmm1, 8
+	movd	r8, xmm1
+	test	r8, 524287
+	je	rwz_sqrt_fix_2
+rwz_sqrt_fix_2_ret:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9	; /* rdx:rax = r10 * r9 (lane 1 multiply) */
+	movd xmm0, rax
+	movd xmm3, rdx
+	punpcklqdq xmm3, xmm0
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm3, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm3
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+
+	add	rdi, rdx
+	mov	QWORD PTR [r13], rdi	; /* store the updated lane 1 pair at its second offset */
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136	; /* lane 1 offset for the next iteration */
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]	
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]	; /* preload the next lane 1 line */
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]	; /* r13 = lane 0 base pointer again */
+	movdqa	xmm3, xmm7	; /* rotate history: xmm3 <- xmm7 <- xmm9 and xmm8 <- xmm6 <- xmm10 */
+	mov	rdx, QWORD PTR [rsp+8]
+	movdqa	xmm8, xmm6
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d	; /* counter initialised to 393216 in the prologue */
+	jne	rwz_main_loop_double
+
+	ldmxcsr DWORD PTR [rsp+272]	; /* epilogue: restore the MXCSR value saved on entry */
+	movaps	xmm13, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+184]	; /* r11 = rsp after the pushes; r11-24 = rsp+160 is the xmm6 slot */
+	movaps	xmm6, XMMWORD PTR [r11-24]
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp rwz_cnv2_double_mainloop_asm_endp	; /* jump over the out-of-line fix-up code to the end label */
+
+rwz_div_fix_1:	; /* reached when the lane 0 remainder in r11 went negative: the quotient estimate was too large by one */
+	dec	rbx	; /* quotient -= 1 */
+	add	r11, rdx	; /* remainder += divisor */
+	jmp	rwz_div_fix_1_ret
+
+rwz_div_fix_2:	; /* reached when the lane 1 remainder in r8 went negative: the quotient estimate was too large by one */
+	dec	rdx	; /* quotient -= 1 */
+	add	r8, r9	; /* remainder += divisor */
+	jmp	rwz_div_fix_2_ret
+
+rwz_sqrt_fix_1:	; /* reached when the low 19 bits of the lane 0 sqrt bit pattern (r9) are zero; recomputes the low qword of xmm5 with integer arithmetic */
+	movd	r8, xmm3	; /* r8 = low qword of xmm3, the value the sqrt input was derived from */
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8	; /* xmm0 = lane 1 half of xmm5, merged back at the end */
+	dec	r9
+	mov r11d, -1022
+	shl r11, 32	; /* r11 = 0xFFFFFC0200000000, i.e. -1022 * 2^32 */
+	mov	rax, r9
+	shr	r9, 19	; /* r9 = (pattern - 1) >> 19, the candidate result */
+	shr	rax, 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1]
+	add	rax, r11
+	imul	rdx, rax	; /* rdx = (r9 - rax + r11 + 1) * (rax + r11) */
+	sub	rdx, r8	; /* only the borrow is used */
+	adc	r9, 0	; /* candidate += 1 when the product is below r8 (unsigned) */
+	movd xmm5, r9
+	punpcklqdq xmm5, xmm0	; /* xmm5 = corrected lane 0 value (low) : untouched lane 1 value (high) */
+	jmp	rwz_sqrt_fix_1_ret
+
+rwz_sqrt_fix_2:	; /* lane 1 counterpart of rwz_sqrt_fix_1: r8 holds the sqrt bit pattern; uses rbx as scratch */
+	psrldq	xmm3, 8
+	movd	r11, xmm3	; /* r11 = high qword of xmm3 (lane 1 input) */
+	dec	r8
+	mov ebx, -1022
+	shl rbx, 32	; /* rbx = 0xFFFFFC0200000000, i.e. -1022 * 2^32 */
+	mov	rax, r8
+	shr	r8, 19	; /* r8 = (pattern - 1) >> 19, the candidate result */
+	shr	rax, 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1]
+	add	rax, rbx
+	imul	rdx, rax	; /* rdx = (r8 - rax + rbx + 1) * (rax + rbx) */
+	sub	rdx, r11	; /* only the borrow is used */
+	adc	r8, 0	; /* candidate += 1 when the product is below r11 (unsigned) */
+	movd xmm0, r8
+	punpcklqdq xmm5, xmm0	; /* xmm5 = existing low qword : corrected lane 1 value (high) */
+	jmp	rwz_sqrt_fix_2_ret
+
+rwz_cnv2_double_mainloop_asm_endp:
--- a/src/crypto/cn/asm/win64/cn2/cnv2_rwz_main_loop.inc
+++ b/src/crypto/cn/asm/win64/cn2/cnv2_rwz_main_loop.inc
@@ -0,0 +1,188 @@
+	mov	rcx, [rcx]
+
+	mov	 QWORD PTR [rsp+24], rbx
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 393216
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647
+	xor	 r8, QWORD PTR [rcx]
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movd	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]
+	mov	 rbx, QWORD PTR [rcx+224]
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movd	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movd	 xmm3, QWORD PTR [r9+104]
+	movaps	 XMMWORD PTR [rsp+64], xmm6
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136
+	movd	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+
+	mov ax, 1023
+	shl rax, 52
+	movd xmm8, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movd	 xmm0, rcx
+	punpcklqdq xmm5, xmm0
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
+
+	ALIGN(64)
+rwz_main_loop:
+	lea	 rdx, QWORD PTR [r10+rbx]
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16
+	xor	 eax, 32
+	xor	 r10d, 48
+	movd	 xmm0, r11
+	movd	 xmm7, r8
+	punpcklqdq xmm7, xmm0
+	aesenc	 xmm6, xmm7
+	movd	 rbp, xmm6
+	mov	 r9, rbp
+	and	 r9d, 2097136
+	movdqu	 xmm0, XMMWORD PTR [rcx+rbx]
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm2, XMMWORD PTR [r10+rbx]
+	paddq	 xmm0, xmm5
+	paddq	 xmm1, xmm7
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov r10, r9
+	xor r10d, 32
+	movd	 rcx, xmm3
+	mov	 rax, rcx
+	shl	 rax, 32
+	xor	 rdi, rax
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	movdqu	 XMMWORD PTR [rdx], xmm0
+	xor	 rdi, QWORD PTR [r9+rbx]
+	lea	 r14, QWORD PTR [r9+rbx]
+	mov	 r12, QWORD PTR [r14+8]
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d
+	movd	 rax, xmm0
+	div	 r9
+	xorps xmm3, xmm3
+	mov	 eax, eax
+	shl	 rdx, 32
+	add	 rdx, rax
+	lea	 r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov	 rax, r9
+	shr	 rax, 12
+	movd	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+	movd	 rdx, xmm3
+	test	 edx, 524287
+	je	 rwz_sqrt_fixup
+	psrlq	 xmm3, 19
+rwz_sqrt_fixup_ret:
+
+	mov	 ecx, r10d
+	mov	 rax, rdi
+	mul	 rbp
+	movd xmm2, rdx
+	xor rdx, [rcx+rbx]
+	add	 r8, rdx
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov edi, r8d
+	and edi, 2097136
+	movd xmm0, rax
+	xor rax, [rcx+rbx+8]
+	add	 r11, rax
+	mov	 QWORD PTR [r14+8], r11
+	punpcklqdq xmm2, xmm0
+
+	mov	 r9d, r10d
+	xor	 r9d, 48
+	xor	 r10d, 16
+	pxor	 xmm2, XMMWORD PTR [r9+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm0, xmm4
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm5
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4
+	movdqu	 XMMWORD PTR [r9+rbx], xmm2
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	movdqu xmm6, [rdi+rbx]
+	mov	 r10d, edi
+	xor	 r11, r12
+	dec rsi
+	jne	 rwz_main_loop
+
+	ldmxcsr DWORD PTR [rsp]
+	mov	 rbx, QWORD PTR [rsp+160]
+	movaps	 xmm6, XMMWORD PTR [rsp+64]
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80
+	pop	 r15
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_rwz_main_loop_endp
+
+rwz_sqrt_fixup:
+	dec	 rdx
+	mov r13d, -1022
+	shl r13, 32
+	mov	 rax, rdx
+	shr	 rdx, 19
+	shr	 rax, 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	not r13
+	sub	 rcx, r13
+	mov	 r13d, -2147483647
+	imul	 rcx, rax
+	sub	 rcx, r9
+	adc	 rdx, 0
+	movd	 xmm3, rdx
+	jmp	 rwz_sqrt_fixup_ret
+
+cnv2_rwz_main_loop_endp:
--- a/src/crypto/cn/asm/win64/cn_main_loop.S
+++ b/src/crypto/cn/asm/win64/cn_main_loop.S
@@ -0,0 +1,45 @@
+/*
+ * CryptoNight v2 main-loop entry points for the GNU assembler in Intel
+ * syntax. Every routine below has the same shape: a global label aligned
+ * to 64 bytes, a body pulled in from a .inc file by the C preprocessor,
+ * `ret 0`, and a trailing `mov eax, 3735929054`.
+ *
+ * 3735929054 is 0xDEADC0DE. It follows `ret 0`, so fall-through never
+ * executes it; presumably it marks the end of each routine for code that
+ * scans them -- TODO confirm against whatever consumes these symbols.
+ *
+ * The first four bodies are included from ../cn2/, the two rwz bodies from
+ * the cn2/ directory next to this file.
+ */
+/* ALIGN(x) discards its argument and always expands to `.align 64`. */
+#define ALIGN(x) .align 64
+.intel_syntax noprefix
+.section .text
+.global cnv2_mainloop_ivybridge_asm
+.global cnv2_mainloop_ryzen_asm
+.global cnv2_mainloop_bulldozer_asm
+.global cnv2_double_mainloop_sandybridge_asm
+.global cnv2_rwz_mainloop_asm
+.global cnv2_rwz_double_mainloop_asm
+
+ALIGN(64)
+cnv2_mainloop_ivybridge_asm:
+	#include "../cn2/cnv2_main_loop_ivybridge.inc"
+	ret 0
+	mov eax, 3735929054
+
+ALIGN(64)
+cnv2_mainloop_ryzen_asm:
+	#include "../cn2/cnv2_main_loop_ryzen.inc"
+	ret 0
+	mov eax, 3735929054
+
+ALIGN(64)
+cnv2_mainloop_bulldozer_asm:
+	#include "../cn2/cnv2_main_loop_bulldozer.inc"
+	ret 0
+	mov eax, 3735929054
+
+ALIGN(64)
+cnv2_double_mainloop_sandybridge_asm:
+	#include "../cn2/cnv2_double_main_loop_sandybridge.inc"
+	ret 0
+	mov eax, 3735929054
+
+ALIGN(64)
+cnv2_rwz_mainloop_asm:
+	#include "cn2/cnv2_rwz_main_loop.inc"
+	ret 0
+	mov eax, 3735929054
+
+ALIGN(64)
+cnv2_rwz_double_mainloop_asm:
+	#include "cn2/cnv2_rwz_double_main_loop.inc"
+	ret 0
+	mov eax, 3735929054
--- a/src/crypto/cn/asm/win64/cn_main_loop.asm
+++ b/src/crypto/cn/asm/win64/cn_main_loop.asm
@@ -0,0 +1,52 @@
+; CryptoNight v2 main-loop entry points for MASM. All six procedures live in
+; one page-aligned, executable segment. Each PROC is aligned to 64 bytes and
+; consists of an INCLUDEd body from cn2/, `ret 0`, and a trailing
+; `mov eax, 3735929054`.
+;
+; 3735929054 is 0DEADC0DEh. It follows `ret 0`, so fall-through never
+; executes it; presumably it marks the end of each routine for code that
+; scans them -- TODO confirm against whatever consumes these symbols.
+_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
+PUBLIC cnv2_mainloop_ivybridge_asm
+PUBLIC cnv2_mainloop_ryzen_asm
+PUBLIC cnv2_mainloop_bulldozer_asm
+PUBLIC cnv2_double_mainloop_sandybridge_asm
+PUBLIC cnv2_rwz_mainloop_asm
+PUBLIC cnv2_rwz_double_mainloop_asm
+
+ALIGN 64
+cnv2_mainloop_ivybridge_asm PROC
+	INCLUDE cn2/cnv2_main_loop_ivybridge.inc
+	ret 0
+	mov eax, 3735929054
+cnv2_mainloop_ivybridge_asm ENDP
+
+ALIGN 64
+cnv2_mainloop_ryzen_asm PROC
+	INCLUDE cn2/cnv2_main_loop_ryzen.inc
+	ret 0
+	mov eax, 3735929054
+cnv2_mainloop_ryzen_asm ENDP
+
+ALIGN 64
+cnv2_mainloop_bulldozer_asm PROC
+	INCLUDE cn2/cnv2_main_loop_bulldozer.inc
+	ret 0
+	mov eax, 3735929054
+cnv2_mainloop_bulldozer_asm ENDP
+
+ALIGN 64
+cnv2_double_mainloop_sandybridge_asm PROC
+	INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
+	ret 0
+	mov eax, 3735929054
+cnv2_double_mainloop_sandybridge_asm ENDP
+
+; NOTE(review): the two rwz procedures spell the directive ALIGN(64) while
+; the four above use ALIGN 64 -- presumably equivalent to MASM; confirm.
+ALIGN(64)
+cnv2_rwz_mainloop_asm PROC
+	INCLUDE cn2/cnv2_rwz_main_loop.inc
+	ret 0
+	mov eax, 3735929054
+cnv2_rwz_mainloop_asm ENDP
+
+ALIGN(64)
+cnv2_rwz_double_mainloop_asm PROC
+	INCLUDE cn2/cnv2_rwz_double_main_loop.inc
+	ret 0
+	mov eax, 3735929054
+cnv2_rwz_double_mainloop_asm ENDP
+
+_TEXT_CNV2_MAINLOOP ENDS
+END
--- a/src/crypto/cn/c_blake256.c
+++ b/src/crypto/cn/c_blake256.c
@@ -0,0 +1,326 @@
+/*
+ * The blake256_* and blake224_* functions are largely copied from
+ * blake256_light.c and blake224_light.c from the BLAKE website:
+ *
+ *     http://131002.net/blake/
+ *
+ * The hmac_* functions implement HMAC-BLAKE-256 and HMAC-BLAKE-224.
+ * HMAC is specified by RFC 2104.
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include "c_blake256.h"
+
+#define U8TO32(p) \
+    (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) |    \
+     ((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])      ))
+#define U32TO8(p, v) \
+    (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
+    (p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      );
+
+/*
+ * Message-word selection table used by the G macro in blake256_compress:
+ * one row per round, indexed as sigma[round][e]. The compression loop runs
+ * 14 rounds (rows 0..13); rows 10..13 repeat rows 0..3.
+ */
+const uint8_t sigma[][16] = {
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15},
+    {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3},
+    {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4},
+    { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8},
+    { 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13},
+    { 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9},
+    {12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11},
+    {13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10},
+    { 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5},
+    {10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0},
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15},
+    {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3},
+    {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4},
+    { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8}
+};
+
+/*
+ * Round constants XORed with the message words inside G. The first eight
+ * values are the same ones blake256_compress writes into v[8..15].
+ */
+const uint32_t cst[16] = {
+    0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344,
+    0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89,
+    0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C,
+    0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917
+};
+
+/*
+ * Padding source for blake256_final_h: a single 0x80 byte followed by
+ * zero bytes. The finaliser reads from offset 0 (leading 0x80) or from
+ * offset 1 (zeros only).
+ */
+static const uint8_t padding[] = {
+    0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
+
+/*
+ * Compression function: mixes one 64-byte block into the chaining value.
+ *
+ *   S     - hash state; h, s, t and nullt are read, only h is written
+ *   block - 64 bytes of input, read as sixteen big-endian 32-bit words
+ *
+ * The helper macros ROT and G are defined inside the function body and are
+ * not #undef'd afterwards, so they remain visible for the rest of the file.
+ */
+void blake256_compress(state *S, const uint8_t *block) {
+    uint32_t v[16], m[16], i;
+
+/* Rotate the 32-bit value x right by n bits (n must be 1..31). */
+#define ROT(x,n) (((x)<<(32-n))|((x)>>(n)))
+/*
+ * One mixing step on v[a], v[b], v[c], v[d]. It consumes the message words
+ * m[sigma[i][e]] and m[sigma[i][e+1]], each XORed with the constant selected
+ * by the other sigma entry. Relies on the locals v, m and i of the enclosing
+ * function.
+ */
+#define G(a,b,c,d,e)                                      \
+    v[a] += (m[sigma[i][e]] ^ cst[sigma[i][e+1]]) + v[b]; \
+    v[d] = ROT(v[d] ^ v[a],16);                           \
+    v[c] += v[d];                                         \
+    v[b] = ROT(v[b] ^ v[c],12);                           \
+    v[a] += (m[sigma[i][e+1]] ^ cst[sigma[i][e]])+v[b];   \
+    v[d] = ROT(v[d] ^ v[a], 8);                           \
+    v[c] += v[d];                                         \
+    v[b] = ROT(v[b] ^ v[c], 7);
+
+    /* Unpack the block and seed the first half of v with the chaining value. */
+    for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4);
+    for (i = 0; i < 8;  ++i) v[i] = S->h[i];
+    /* v[8..11]: salt words XORed with the same values as cst[0..3]. */
+    v[ 8] = S->s[0] ^ 0x243F6A88;
+    v[ 9] = S->s[1] ^ 0x85A308D3;
+    v[10] = S->s[2] ^ 0x13198A2E;
+    v[11] = S->s[3] ^ 0x03707344;
+    /* v[12..15]: same values as cst[4..7]; the counter is mixed in below. */
+    v[12] = 0xA4093822;
+    v[13] = 0x299F31D0;
+    v[14] = 0x082EFA98;
+    v[15] = 0xEC4E6C89;
+
+    /* A non-zero nullt suppresses mixing the bit counter into this block. */
+    if (S->nullt == 0) {
+        v[12] ^= S->t[0];
+        v[13] ^= S->t[0];
+        v[14] ^= S->t[1];
+        v[15] ^= S->t[1];
+    }
+
+    /* 14 rounds: G on the four columns, then on the four diagonals. */
+    for (i = 0; i < 14; ++i) {
+        G(0, 4,  8, 12,  0);
+        G(1, 5,  9, 13,  2);
+        G(2, 6, 10, 14,  4);
+        G(3, 7, 11, 15,  6);
+        G(3, 4,  9, 14, 14);
+        G(2, 7,  8, 13, 12);
+        G(0, 5, 10, 15,  8);
+        G(1, 6, 11, 12, 10);
+    }
+
+    /* Fold both halves of v into h, then XOR the salt into h. */
+    for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i];
+    for (i = 0; i < 8;  ++i) S->h[i] ^= S->s[i % 4];
+}
+
+/*
+ * Reset S to the BLAKE-256 starting state: load the initial chaining value
+ * and clear the salt, the bit counter, the buffered-bit count and the nullt
+ * flag. The data buffer S->buf is left as it is.
+ */
+void blake256_init(state *S) {
+    static const uint32_t initial_h[8] = {
+        0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+        0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+    };
+    int k;
+
+    for (k = 0; k < 8; ++k) {
+        S->h[k] = initial_h[k];
+    }
+    for (k = 0; k < 4; ++k) {
+        S->s[k] = 0;
+    }
+    S->t[0] = 0;
+    S->t[1] = 0;
+    S->buflen = 0;
+    S->nullt = 0;
+}
+
+/*
+ * Reset S to the BLAKE-224 starting state. Identical to blake256_init apart
+ * from the initial chaining value. The data buffer S->buf is left as it is.
+ */
+void blake224_init(state *S) {
+    static const uint32_t initial_h[8] = {
+        0xC1059ED8, 0x367CD507, 0x3070DD17, 0xF70E5939,
+        0xFFC00B31, 0x68581511, 0x64F98FA7, 0xBEFA4FA4
+    };
+    int k;
+
+    for (k = 0; k < 8; ++k) {
+        S->h[k] = initial_h[k];
+    }
+    for (k = 0; k < 4; ++k) {
+        S->s[k] = 0;
+    }
+    S->t[0] = 0;
+    S->t[1] = 0;
+    S->buflen = 0;
+    S->nullt = 0;
+}
+
+// datalen = number of bits
+/*
+ * Absorb datalen bits from data into the hash state.
+ *
+ *   S       - hash state; buf, buflen, t and (through compression) h change
+ *   data    - input bytes
+ *   datalen - input length in BITS
+ *
+ * Steps: (1) if bytes are already buffered and the new input can complete
+ * the 64-byte block, complete and compress it; (2) compress every further
+ * full block straight from data; (3) keep the remaining bits in S->buf.
+ *
+ * The test in step 1 compares the whole input byte count against the free
+ * space. Comparing only the count modulo 64 would skip step 1 for inputs of
+ * 64 bytes or more whose length modulo 64 is smaller than the free space,
+ * and step 2 would then hash data ahead of the buffered bytes and step 3
+ * would discard them.
+ */
+void blake256_update(state *S, const uint8_t *data, uint64_t datalen) {
+    int left = S->buflen >> 3;   /* bytes currently buffered */
+    int fill = 64 - left;        /* bytes needed to complete the block */
+
+    if (left && ((datalen >> 3) >= (unsigned) fill)) {
+        memcpy((void *) (S->buf + left), (void *) data, fill);
+        S->t[0] += 512;
+        if (S->t[0] == 0) S->t[1]++;   /* carry into the high counter word */
+        blake256_compress(S, S->buf);
+        data += fill;
+        datalen -= (fill << 3);
+        left = 0;
+    }
+
+    while (datalen >= 512) {
+        S->t[0] += 512;
+        if (S->t[0] == 0) S->t[1]++;
+        blake256_compress(S, data);
+        data += 64;
+        datalen -= 512;
+    }
+
+    if (datalen > 0) {
+        memcpy((void *) (S->buf + left), (void *) data, datalen >> 3);
+        S->buflen = (left << 3) + (int) datalen;
+    } else {
+        S->buflen = 0;
+    }
+}
+
+// datalen = number of bits
+/* BLAKE-224 absorbs data exactly like BLAKE-256; forwards to blake256_update. */
+void blake224_update(state *S, const uint8_t *data, uint64_t datalen) {
+    blake256_update(S, data, datalen);
+}
+
+/*
+ * Shared finaliser for BLAKE-256 and BLAKE-224: pads the buffered data,
+ * appends the 64-bit message length and writes the result.
+ *
+ *   S      - hash state to finish
+ *   digest - receives all eight state words big-endian, i.e. 32 bytes,
+ *            whichever variant is being finished
+ *   pa     - pad byte used when exactly one padding byte fits (buflen == 440)
+ *   pb     - last pad byte, written after the 0x80/zero padding otherwise
+ *
+ * Each padding update is paired with an equal subtraction from S->t[0].
+ * blake256_update adds 512 to t[0] whenever it compresses a block, so this
+ * appears intended to keep padding bits out of the counter -- NOTE(review):
+ * confirm against the BLAKE specification.
+ */
+void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) {
+    uint8_t msglen[8];
+    /* Total bit count = counter + buffered bits, with carry into hi. */
+    uint32_t lo = S->t[0] + S->buflen, hi = S->t[1];
+    if (lo < (unsigned) S->buflen) hi++;
+    U32TO8(msglen + 0, hi);
+    U32TO8(msglen + 4, lo);
+
+    if (S->buflen == 440) { /* one padding byte */
+        S->t[0] -= 8;
+        blake256_update(S, &pa, 8);
+    } else {
+        if (S->buflen < 440) { /* enough space to fill the block  */
+            /* Empty buffer: blake256_compress will skip the counter. */
+            if (S->buflen == 0) S->nullt = 1;
+            S->t[0] -= 440 - S->buflen;
+            blake256_update(S, padding, 440 - S->buflen);
+        } else { /* need 2 compressions */
+            S->t[0] -= 512 - S->buflen;
+            blake256_update(S, padding, 512 - S->buflen);
+            S->t[0] -= 440;
+            /* padding + 1 skips the 0x80 byte: zeros only. */
+            blake256_update(S, padding + 1, 440);
+            S->nullt = 1;
+        }
+        blake256_update(S, &pb, 8);
+        S->t[0] -= 8;
+    }
+    /* Append the 8-byte length; this completes and compresses the block. */
+    S->t[0] -= 64;
+    blake256_update(S, msglen, 64);
+
+    U32TO8(digest +  0, S->h[0]);
+    U32TO8(digest +  4, S->h[1]);
+    U32TO8(digest +  8, S->h[2]);
+    U32TO8(digest + 12, S->h[3]);
+    U32TO8(digest + 16, S->h[4]);
+    U32TO8(digest + 20, S->h[5]);
+    U32TO8(digest + 24, S->h[6]);
+    U32TO8(digest + 28, S->h[7]);
+}
+
+/* Finish a BLAKE-256 hash (pad bytes 0x81 / 0x01); writes 32 bytes to digest. */
+void blake256_final(state *S, uint8_t *digest) {
+    blake256_final_h(S, digest, 0x81, 0x01);
+}
+
+/*
+ * Finish a BLAKE-224 hash (pad bytes 0x80 / 0x00).
+ * NOTE: blake256_final_h stores all eight state words, so digest must have
+ * room for 32 bytes even though a BLAKE-224 digest is 28 bytes long.
+ */
+void blake224_final(state *S, uint8_t *digest) {
+    blake256_final_h(S, digest, 0x80, 0x00);
+}
+
+// inlen = number of bytes
+/*
+ * One-shot BLAKE-256: hashes inlen bytes of in and writes the 32-byte
+ * digest to out.
+ */
+void blake256_hash(uint8_t *out, const uint8_t *in, uint64_t inlen) {
+    state ctx;
+    const uint64_t bit_count = inlen * 8;   /* the update API counts bits */
+
+    blake256_init(&ctx);
+    blake256_update(&ctx, in, bit_count);
+    blake256_final(&ctx, out);
+}
+
+// inlen = number of bytes
+/*
+ * One-shot BLAKE-224: hashes inlen bytes of in. The finaliser stores eight
+ * state words, so out must have room for 32 bytes.
+ */
+void blake224_hash(uint8_t *out, const uint8_t *in, uint64_t inlen) {
+    state ctx;
+    const uint64_t bit_count = inlen * 8;   /* the update API counts bits */
+
+    blake224_init(&ctx);
+    blake224_update(&ctx, in, bit_count);
+    blake224_final(&ctx, out);
+}
+
+// keylen = number of bytes
+/*
+ * Start an HMAC-BLAKE-256 computation.
+ *
+ *   S      - HMAC state; both the inner and the outer hash are initialised
+ *   _key   - key bytes
+ *   keylen - key length in bytes; keys longer than 64 bytes are replaced by
+ *            their 32-byte BLAKE-256 digest
+ *
+ * The inner hash absorbs the key XORed with 0x36 over a 64-byte block, the
+ * outer hash the key XORed with 0x5c.
+ */
+void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
+    uint8_t hashed_key[32];
+    uint8_t ipad[64];
+    uint8_t opad[64];
+    const uint8_t *k = _key;
+    uint64_t n;
+
+    if (keylen > 64) {
+        blake256_hash(hashed_key, k, keylen);
+        k = hashed_key;
+        keylen = 32;
+    }
+
+    /* Key bytes past keylen count as zero, leaving the bare pad value. */
+    for (n = 0; n < 64; ++n) {
+        const uint8_t kb = (n < keylen) ? k[n] : 0;
+        ipad[n] = (uint8_t) (0x36 ^ kb);
+        opad[n] = (uint8_t) (0x5c ^ kb);
+    }
+
+    blake256_init(&S->inner);
+    blake256_update(&S->inner, ipad, 512);
+
+    blake256_init(&S->outer);
+    blake256_update(&S->outer, opad, 512);
+
+    memset(hashed_key, 0, 32);
+}
+
+// keylen = number of bytes
+/*
+ * Start an HMAC-BLAKE-224 computation.
+ *
+ *   S      - HMAC state; both the inner and the outer hash are initialised
+ *   _key   - key bytes
+ *   keylen - key length in bytes
+ *
+ * Keys longer than the 64-byte block are first reduced with the same hash
+ * the HMAC is built on (RFC 2104: K' = H(K)), i.e. BLAKE-224, and the first
+ * 28 bytes of that digest become the key. blake224_hash stores 32 bytes,
+ * which is why keyhash is 32 bytes long.
+ */
+void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
+    const uint8_t *key = _key;
+    uint8_t keyhash[32];
+    uint8_t pad[64];
+    uint64_t i;
+
+    if (keylen > 64) {
+        blake224_hash(keyhash, key, keylen);
+        key = keyhash;
+        keylen = 28;
+    }
+
+    /* Inner hash absorbs key XOR 0x36, padded to one block. */
+    blake224_init(&S->inner);
+    memset(pad, 0x36, 64);
+    for (i = 0; i < keylen; ++i) {
+        pad[i] ^= key[i];
+    }
+    blake224_update(&S->inner, pad, 512);
+
+    /* Outer hash absorbs key XOR 0x5c, padded to one block. */
+    blake224_init(&S->outer);
+    memset(pad, 0x5c, 64);
+    for (i = 0; i < keylen; ++i) {
+        pad[i] ^= key[i];
+    }
+    blake224_update(&S->outer, pad, 512);
+
+    memset(keyhash, 0, 32);
+}
+
+// datalen = number of bits
+/* Feed message data into the HMAC; only the inner hash sees the message. */
+void hmac_blake256_update(hmac_state *S, const uint8_t *data, uint64_t datalen) {
+  // update the inner state
+  blake256_update(&S->inner, data, datalen);
+}
+
+// datalen = number of bits
+/* Feed message data into the HMAC; only the inner hash sees the message. */
+void hmac_blake224_update(hmac_state *S, const uint8_t *data, uint64_t datalen) {
+  // update the inner state
+  blake224_update(&S->inner, data, datalen);
+}
+
+/*
+ * Finish an HMAC-BLAKE-256: close the inner hash, feed its 32-byte digest
+ * to the outer hash, and write the outer digest (32 bytes) to digest.
+ */
+void hmac_blake256_final(hmac_state *S, uint8_t *digest) {
+    uint8_t inner_digest[32];
+    state *const inner = &S->inner;
+    state *const outer = &S->outer;
+
+    blake256_final(inner, inner_digest);
+    blake256_update(outer, inner_digest, 256);   /* 32 bytes = 256 bits */
+    blake256_final(outer, digest);
+
+    memset(inner_digest, 0, sizeof inner_digest);
+}
+
+/*
+ * Finish an HMAC-BLAKE-224: close the inner hash, feed the first 224 bits
+ * of its digest to the outer hash, and finish the outer hash into digest.
+ * blake224_final stores 32 bytes, so digest must have room for 32 bytes.
+ */
+void hmac_blake224_final(hmac_state *S, uint8_t *digest) {
+    uint8_t inner_digest[32];
+    state *const inner = &S->inner;
+    state *const outer = &S->outer;
+
+    blake224_final(inner, inner_digest);
+    blake224_update(outer, inner_digest, 224);
+    blake224_final(outer, digest);
+
+    memset(inner_digest, 0, sizeof inner_digest);
+}
+
+// keylen = number of bytes; inlen = number of bytes
+/* One-shot HMAC-BLAKE-256 of in under key; writes 32 bytes to out. */
+void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint64_t inlen) {
+    hmac_state mac;
+    const uint64_t bit_count = inlen * 8;   /* the update API counts bits */
+
+    hmac_blake256_init(&mac, key, keylen);
+    hmac_blake256_update(&mac, in, bit_count);
+    hmac_blake256_final(&mac, out);
+}
+
+// keylen = number of bytes; inlen = number of bytes
+/*
+ * One-shot HMAC-BLAKE-224 of in under key. The finaliser stores 32 bytes,
+ * so out must have room for 32 bytes.
+ */
+void hmac_blake224_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint64_t inlen) {
+    hmac_state mac;
+    const uint64_t bit_count = inlen * 8;   /* the update API counts bits */
+
+    hmac_blake224_init(&mac, key, keylen);
+    hmac_blake224_update(&mac, in, bit_count);
+    hmac_blake224_final(&mac, out);
+}
--- a/src/crypto/cn/c_blake256.h
+++ b/src/crypto/cn/c_blake256.h
@@ -0,0 +1,43 @@
+#ifndef _BLAKE256_H_
+#define _BLAKE256_H_
+
+#include <stdint.h>
+
+/*
+ * BLAKE-256 / BLAKE-224 hashing context.
+ *   h      - chaining value (eight 32-bit words)
+ *   s      - salt words; the init functions set them to zero
+ *   t      - count of hashed bits, t[0] low word and t[1] high word
+ *   buflen - number of BITS currently held in buf
+ *   nullt  - when non-zero the compression function skips the bit counter
+ *   buf    - holds a partially filled 64-byte block
+ */
+typedef struct {
+  uint32_t h[8], s[4], t[2];
+  int buflen, nullt;
+  uint8_t buf[64];
+} state;
+
+/*
+ * HMAC context: two independent hash states. The message is absorbed by
+ * `inner`; `outer` absorbs the digest of `inner` at finalisation.
+ */
+typedef struct {
+  state inner;
+  state outer;
+} hmac_state;
+
+/* Reset a state to the BLAKE-256 / BLAKE-224 initial value. */
+void blake256_init(state *);
+void blake224_init(state *);
+
+/* Absorb data; the length argument is a count of BITS. */
+void blake256_update(state *, const uint8_t *, uint64_t);
+void blake224_update(state *, const uint8_t *, uint64_t);
+
+/* Finish the hash. Both variants store 32 bytes at the output pointer. */
+void blake256_final(state *, uint8_t *);
+void blake224_final(state *, uint8_t *);
+
+/* One-shot hash (out, in, inlen); inlen is a count of BYTES. */
+void blake256_hash(uint8_t *, const uint8_t *, uint64_t);
+void blake224_hash(uint8_t *, const uint8_t *, uint64_t);
+
+/* HMAC functions: */
+
+/* Start an HMAC (state, key, keylen); keylen is a count of BYTES. */
+void hmac_blake256_init(hmac_state *, const uint8_t *, uint64_t);
+void hmac_blake224_init(hmac_state *, const uint8_t *, uint64_t);
+
+/* Absorb message data; the length argument is a count of BITS. */
+void hmac_blake256_update(hmac_state *, const uint8_t *, uint64_t);
+void hmac_blake224_update(hmac_state *, const uint8_t *, uint64_t);
+
+/* Finish the HMAC and store the tag at the output pointer. */
+void hmac_blake256_final(hmac_state *, uint8_t *);
+void hmac_blake224_final(hmac_state *, uint8_t *);
+
+/* One-shot HMAC (out, key, keylen, in, inlen); both lengths in BYTES. */
+void hmac_blake256_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint64_t);
+void hmac_blake224_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint64_t);
+
+#endif /* _BLAKE256_H_ */
--- a/src/crypto/cn/c_groestl.c
+++ b/src/crypto/cn/c_groestl.c
@@ -0,0 +1,360 @@
+/* hash.c     April 2012
+ * Groestl ANSI C code optimised for 32-bit machines
+ * Author: Thomas Krinninger
+ *
+ *  This work is based on the implementation of
+ *          Soeren S. Thomsen and Krystian Matusiewicz
+ *          
+ *
+ */
+
+#include "c_groestl.h"
+#include "groestl_tables.h"
+
+#define P_TYPE 0
+#define Q_TYPE 1
+
+/*
+ * Per-row shift amounts, one row per permutation type (index P_TYPE /
+ * Q_TYPE). NOTE(review): neither this table nor indices_cyclic is read by
+ * the code visible in this file; the round functions hard-code their column
+ * indices -- confirm whether anything else still uses them.
+ */
+const uint8_t shift_Values[2][8] = {{0,1,2,3,4,5,6,7},{1,3,5,7,0,2,4,6}};
+
+/* Column indices 0..7 followed by 0..6 again (15 entries). */
+const uint8_t indices_cyclic[15] = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6};
+
+
+/*
+ * Rotates the 64-bit quantity held in the word pair (v1 = high word,
+ * v2 = low word) left by amount_bytes * 8 bits, using temp_var as scratch.
+ * amount_bytes must be 1, 2 or 3: 0 or 4 would produce a 32-bit shift of a
+ * 32-bit operand. Arguments are pasted without parentheses, so pass plain
+ * variables and literals only.
+ */
+#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) {temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \
+															v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \
+															v1 = temp_var;}
+  
+
+/*
+ * Computes one output column: writes y[i] and y[i+1] from eight bytes of
+ * the byte array x, using the lookup table T (not defined in this file;
+ * presumably supplied by groestl_tables.h).
+ *
+ *   x, y      - input byte array and output word array
+ *   i         - index of the first of the two output words
+ *   c0..c7    - 32-bit word indices into x; byte k%4 of word c_k is used
+ *   tv1, tv2,
+ *   tu, tl, t - caller-provided uint32_t scratch variables
+ *
+ * For c0..c3 the table pair (T[2b], T[2b+1]) is rotated by 0..3 bytes and
+ * XORed into (tu, tl). For c4..c7 the same is done with the halves swapped,
+ * i.e. XORed into (tl, tu). Arguments are pasted without parentheses, so
+ * pass plain identifiers and literals only.
+ */
+#define COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t)				\
+   tu = T[2*(uint32_t)x[4*c0+0]];			    \
+   tl = T[2*(uint32_t)x[4*c0+0]+1];		    \
+   tv1 = T[2*(uint32_t)x[4*c1+1]];			\
+   tv2 = T[2*(uint32_t)x[4*c1+1]+1];			\
+   ROTATE_COLUMN_DOWN(tv1,tv2,1,t)	\
+   tu ^= tv1;						\
+   tl ^= tv2;						\
+   tv1 = T[2*(uint32_t)x[4*c2+2]];			\
+   tv2 = T[2*(uint32_t)x[4*c2+2]+1];			\
+   ROTATE_COLUMN_DOWN(tv1,tv2,2,t)	\
+   tu ^= tv1;						\
+   tl ^= tv2;   					\
+   tv1 = T[2*(uint32_t)x[4*c3+3]];			\
+   tv2 = T[2*(uint32_t)x[4*c3+3]+1];			\
+   ROTATE_COLUMN_DOWN(tv1,tv2,3,t)	\
+   tu ^= tv1;						\
+   tl ^= tv2;						\
+   tl ^= T[2*(uint32_t)x[4*c4+0]];			\
+   tu ^= T[2*(uint32_t)x[4*c4+0]+1];			\
+   tv1 = T[2*(uint32_t)x[4*c5+1]];			\
+   tv2 = T[2*(uint32_t)x[4*c5+1]+1];			\
+   ROTATE_COLUMN_DOWN(tv1,tv2,1,t)	\
+   tl ^= tv1;						\
+   tu ^= tv2;						\
+   tv1 = T[2*(uint32_t)x[4*c6+2]];			\
+   tv2 = T[2*(uint32_t)x[4*c6+2]+1];			\
+   ROTATE_COLUMN_DOWN(tv1,tv2,2,t)	\
+   tl ^= tv1;						\
+   tu ^= tv2;   					\
+   tv1 = T[2*(uint32_t)x[4*c7+3]];			\
+   tv2 = T[2*(uint32_t)x[4*c7+3]+1];			\
+   ROTATE_COLUMN_DOWN(tv1,tv2,3,t)	\
+   tl ^= tv1;						\
+   tu ^= tv2;						\
+   y[i] = tu;						\
+   y[i+1] = tl;
+
+
+/*
+ * One round of the P permutation (512-bit state).
+ *   x - 64-byte input state; modified in place by the round-constant step
+ *   y - receives the 16 output words
+ *   r - round number, XORed into the first word of every column
+ */
+static void RND512P(uint8_t *x, uint32_t *y, uint32_t r) {
+  uint32_t va, vb, hi, lo, tmp;
+  uint32_t *words = (uint32_t*)x;
+  uint32_t col;
+
+  /* round constant: column c gets (c << 4) ^ r in its first word */
+  for (col = 0; col < 8; col++) {
+    words[2*col] ^= (col << 4) ^ r;
+  }
+
+  COLUMN(x,y, 0,  0,  2,  4,  6,  9, 11, 13, 15, va, vb, hi, lo, tmp);
+  COLUMN(x,y, 2,  2,  4,  6,  8, 11, 13, 15,  1, va, vb, hi, lo, tmp);
+  COLUMN(x,y, 4,  4,  6,  8, 10, 13, 15,  1,  3, va, vb, hi, lo, tmp);
+  COLUMN(x,y, 6,  6,  8, 10, 12, 15,  1,  3,  5, va, vb, hi, lo, tmp);
+  COLUMN(x,y, 8,  8, 10, 12, 14,  1,  3,  5,  7, va, vb, hi, lo, tmp);
+  COLUMN(x,y,10, 10, 12, 14,  0,  3,  5,  7,  9, va, vb, hi, lo, tmp);
+  COLUMN(x,y,12, 12, 14,  0,  2,  5,  7,  9, 11, va, vb, hi, lo, tmp);
+  COLUMN(x,y,14, 14,  0,  2,  4,  7,  9, 11, 13, va, vb, hi, lo, tmp);
+}
+
+/*
+ * One round of the Q permutation (512-bit state).
+ *   x - 64-byte input state; modified in place by the round-constant step
+ *   y - receives the 16 output words
+ *   r - round constant, XORed into the second word of every column
+ */
+static void RND512Q(uint8_t *x, uint32_t *y, uint32_t r) {
+  uint32_t va, vb, hi, lo, tmp;
+  uint32_t *words = (uint32_t*)x;
+  uint32_t col;
+
+  /* round constant: first word of each column is complemented, the second
+     is XORed with 0xffffffff ^ (c << 28) ^ r */
+  for (col = 0; col < 8; col++) {
+    words[2*col]    = ~words[2*col];
+    words[2*col+1] ^= (0xffffffff ^ (col << 28)) ^ r;
+  }
+
+  COLUMN(x,y, 0,  2,  6, 10, 14,  1,  5,  9, 13, va, vb, hi, lo, tmp);
+  COLUMN(x,y, 2,  4,  8, 12,  0,  3,  7, 11, 15, va, vb, hi, lo, tmp);
+  COLUMN(x,y, 4,  6, 10, 14,  2,  5,  9, 13,  1, va, vb, hi, lo, tmp);
+  COLUMN(x,y, 6,  8, 12,  0,  4,  7, 11, 15,  3, va, vb, hi, lo, tmp);
+  COLUMN(x,y, 8, 10, 14,  2,  6,  9, 13,  1,  5, va, vb, hi, lo, tmp);
+  COLUMN(x,y,10, 12,  0,  4,  8, 11, 15,  3,  7, va, vb, hi, lo, tmp);
+  COLUMN(x,y,12, 14,  2,  6, 10, 13,  1,  5,  9, va, vb, hi, lo, tmp);
+  COLUMN(x,y,14,  0,  4,  8, 12, 15,  3,  7, 11, va, vb, hi, lo, tmp);
+}
+
+/*
+ * Compression function (short variants): h <- P(h ^ m) ^ Q(m) ^ h.
+ *   h - 16-word chaining value, updated in place
+ *   m - 16-word message block
+ * Both permutations run ten rounds, ping-ponging between two scratch
+ * buffers; the last round writes straight into the result buffer.
+ */
+static void F512(uint32_t *h, const uint32_t *m) {
+  uint32_t p_state[2*COLS512];
+  uint32_t q_out[2*COLS512];
+  uint32_t buf_a[2*COLS512];
+  uint32_t buf_b[2*COLS512];
+  uint32_t *src, *dst, *swap;
+  uint32_t round;
+  int k;
+
+  for (k = 0; k < 2*COLS512; k++) {
+    buf_b[k] = m[k];
+    p_state[k] = h[k]^m[k];
+  }
+
+  /* compute Q(m): the round number sits in the top byte of the constant */
+  src = buf_b;
+  dst = buf_a;
+  for (round = 0; round < 9; round++) {
+    RND512Q((uint8_t*)src, dst, round << 24);
+    swap = src; src = dst; dst = swap;
+  }
+  RND512Q((uint8_t*)src, q_out, (uint32_t)9 << 24);
+
+  /* compute P(h+m): first round reads p_state, last round writes it */
+  RND512P((uint8_t*)p_state, buf_a, 0);
+  src = buf_a;
+  dst = buf_b;
+  for (round = 1; round < 9; round++) {
+    RND512P((uint8_t*)src, dst, round);
+    swap = src; src = dst; dst = swap;
+  }
+  RND512P((uint8_t*)src, p_state, 9);
+
+  /* compute P(h+m) + Q(m) + h */
+  for (k = 0; k < 2*COLS512; k++) {
+    h[k] ^= p_state[k]^q_out[k];
+  }
+}
+
+
+/*
+ * Digest as many whole SIZE512-byte blocks as fit in msglen bytes of input.
+ * A trailing partial block is ignored. The 64-bit block count is kept in
+ * block_counter1 (low) and block_counter2 (high).
+ */
+static void Transform(groestlHashState *ctx,
+                      const uint8_t *input,
+                      int msglen) {
+  while (msglen >= SIZE512) {
+    F512(ctx->chaining,(uint32_t*)input);
+
+    /* bump the block counter, carrying into the high word on wrap */
+    if (++ctx->block_counter1 == 0) {
+      ctx->block_counter2++;
+    }
+
+    input += SIZE512;
+    msglen -= SIZE512;
+  }
+}
+
+/*
+ * Output transformation: h <- P(h) ^ h on ctx->chaining.
+ * P runs ten rounds, ping-ponging between two scratch buffers; the first
+ * round reads the copy of the state and the last round writes back into it.
+ */
+static void OutputTransformation(groestlHashState *ctx) {
+  uint32_t perm[2*COLS512];
+  uint32_t buf_a[2*COLS512];
+  uint32_t buf_b[2*COLS512];
+  uint32_t *src = buf_a, *dst = buf_b, *swap;
+  uint32_t round;
+  int k;
+
+  for (k = 0; k < 2*COLS512; k++) {
+    perm[k] = ctx->chaining[k];
+  }
+
+  RND512P((uint8_t*)perm, buf_a, 0);
+  for (round = 1; round < 9; round++) {
+    RND512P((uint8_t*)src, dst, round);
+    swap = src; src = dst; dst = swap;
+  }
+  RND512P((uint8_t*)src, perm, 9);
+
+  for (k = 0; k < 2*COLS512; k++) {
+    ctx->chaining[k] ^= perm[k];
+  }
+}
+
+/*
+ * Initialise a context: zero the chaining value, store the byte-swapped
+ * digest length (HASH_BIT_LEN) in its last word, and reset the buffer
+ * position, both block counters and the partial-byte bit count.
+ * The data buffer itself is not cleared.
+ */
+static void Init(groestlHashState* ctx) {
+  unsigned int w;
+
+  for (w = 0; w < SIZE512/sizeof(uint32_t); w++) {
+    ctx->chaining[w] = 0;
+  }
+
+  /* initial value */
+  ctx->chaining[2*COLS512-1] = u32BIG((uint32_t)HASH_BIT_LEN);
+
+  /* bookkeeping */
+  ctx->bits_in_last_byte = 0;
+  ctx->block_counter2 = 0;
+  ctx->block_counter1 = 0;
+  ctx->buf_ptr = 0;
+}
+
+/*
+ * Absorb databitlen bits of input into the context.
+ *
+ *   ctx        - hashing context
+ *   input      - message bytes
+ *   databitlen - message length in bits
+ *
+ * Whole bytes are processed in SIZE512-byte blocks; leftover bytes stay in
+ * ctx->buffer. If databitlen is not a multiple of 8, the final partial byte
+ * is stored as well and its bit count is recorded in bits_in_last_byte.
+ *
+ * NOTE: the byte count databitlen/8 is narrowed to int.
+ */
+static void Update(groestlHashState* ctx,
+		  const BitSequence* input,
+		  DataLength databitlen) {
+  int index = 0;
+  int msglen = (int)(databitlen/8);
+  int rem = (int)(databitlen%8);
+
+  /* if the buffer contains data that has not yet been digested, first
+     add data to buffer until full */
+  if (ctx->buf_ptr) {
+    while (ctx->buf_ptr < SIZE512 && index < msglen) {
+      ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
+    }
+    if (ctx->buf_ptr < SIZE512) {
+      /* buffer still not full, return */
+      if (rem) {
+	ctx->bits_in_last_byte = rem;
+	ctx->buffer[(int)ctx->buf_ptr++] = input[index];
+      }
+      return;
+    }
+
+    /* digest buffer */
+    ctx->buf_ptr = 0;
+    Transform(ctx, ctx->buffer, SIZE512);
+  }
+
+  /* digest bulk of message */
+  Transform(ctx, input+index, msglen-index);
+  /* skip past the whole blocks Transform just consumed */
+  index += ((msglen-index)/SIZE512)*SIZE512;
+
+  /* store remaining data in buffer */
+  while (index < msglen) {
+    ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
+  }
+
+  /* if non-integral number of bytes have been supplied, store
+     remaining bits in last byte, together with information about
+     number of bits */
+  if (rem) {
+    ctx->bits_in_last_byte = rem;
+    ctx->buffer[(int)ctx->buf_ptr++] = input[index];
+  }
+}
+
+#define BILB ctx->bits_in_last_byte
+
+/*
+ * Finalise the hash: pad the buffered data, append the block count, run the
+ * last compression and the output transformation, and copy the digest out.
+ *
+ *   ctx    - hashing context; wiped before returning
+ *   output - receives HASH_BIT_LEN/8 bytes, taken from the end of the
+ *            chaining value
+ */
+static void Final(groestlHashState* ctx,
+		 BitSequence* output) {
+  int i, j = 0, hashbytelen = HASH_BIT_LEN/8;
+  uint8_t *s = (BitSequence*)ctx->chaining;
+
+  /* pad with '1'-bit and first few '0'-bits */
+  if (BILB) {
+    /* keep the BILB valid top bits of the last byte, then set the bit
+       directly below them */
+    ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
+    ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB);
+    BILB = 0;
+  }
+  else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
+
+  /* pad with '0'-bits */
+  if (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) {
+    /* padding requires two blocks */
+    while (ctx->buf_ptr < SIZE512) {
+      ctx->buffer[(int)ctx->buf_ptr++] = 0;
+    }
+    /* digest first padding block */
+    Transform(ctx, ctx->buffer, SIZE512);
+    ctx->buf_ptr = 0;
+  }
+  while (ctx->buf_ptr < SIZE512-LENGTHFIELDLEN) {
+    ctx->buffer[(int)ctx->buf_ptr++] = 0;
+  }
+
+  /* length padding: the final block itself is counted, and the 64-bit
+     block count is written most-significant byte first at the block end */
+  ctx->block_counter1++;
+  if (ctx->block_counter1 == 0) ctx->block_counter2++;
+  ctx->buf_ptr = SIZE512;
+
+  while (ctx->buf_ptr > SIZE512-(int)sizeof(uint32_t)) {
+    ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1;
+    ctx->block_counter1 >>= 8;
+  }
+  while (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) {
+    ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2;
+    ctx->block_counter2 >>= 8;
+  }
+  /* digest final padding block */
+  Transform(ctx, ctx->buffer, SIZE512);
+  /* perform output transformation */
+  OutputTransformation(ctx);
+
+  /* store hash result in output */
+  for (i = SIZE512-hashbytelen; i < SIZE512; i++,j++) {
+    output[j] = s[i];
+  }
+
+  /* zeroise the whole chaining value (2*COLS512 32-bit words) and the
+     data buffer */
+  for (i = 0; i < 2*COLS512; i++) {
+    ctx->chaining[i] = 0;
+  }
+  for (i = 0; i < SIZE512; i++) {
+    ctx->buffer[i] = 0;
+  }
+}
+
+/*
+ * One-shot Groestl hash.
+ *   data       - message
+ *   databitlen - message length in bits
+ *   hashval    - receives HASH_BIT_LEN/8 digest bytes
+ */
+void groestl(const BitSequence* data,
+             DataLength databitlen,
+             BitSequence* hashval) {
+  groestlHashState st;
+
+  Init(&st);                        /* initialise */
+  Update(&st, data, databitlen);    /* process message */
+  Final(&st, hashval);              /* finalise */
+}
+/*
+static int crypto_hash(unsigned char *out,
+		const unsigned char *in,
+		unsigned long long len)
+{
+  groestl(in, 8*len, out);
+  return 0;
+}
+
+*/
--- a/src/crypto/cn/c_groestl.h
+++ b/src/crypto/cn/c_groestl.h
@@ -0,0 +1,60 @@
+#ifndef __hash_h
+#define __hash_h
+/*
+#include "crypto_uint8.h"
+#include "crypto_uint32.h"
+#include "crypto_uint64.h"
+#include "crypto_hash.h" 
+
+typedef crypto_uint8 uint8_t; 
+typedef crypto_uint32 uint32_t; 
+typedef crypto_uint64 uint64_t;
+*/
+#include <stdint.h>
+
+#include "hash.h"
+
+/* State geometry: ROWS x COLS512 bytes; the length field takes ROWS bytes. */
+#define ROWS 8
+#define LENGTHFIELDLEN ROWS
+#define COLS512 8
+
+/* Size of the state and of one message block, in bytes. */
+#define SIZE512 (ROWS*COLS512)
+
+#define ROUNDS512 10
+#define HASH_BIT_LEN 256
+
+/* Rotate the 32-bit value v left by n bits (n must be 1..31). Uses li_32,
+   which is defined just below; that is fine for macros. */
+#define ROTL32(v, n) ((((v)<<(n))|((v)>>(32-(n))))&li_32(ffffffff))
+
+
+/* Build an unsigned hexadecimal literal from bare hex digits. */
+#define li_32(h) 0x##h##u
+/* Byte n (0 = least significant) of a 32-bit value; n is not parenthesised. */
+#define EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n)))
+/* Reverse the byte order of a 32-bit value. */
+#define u32BIG(a)				\
+  ((ROTL32(a,8) & li_32(00FF00FF)) |		\
+   (ROTL32(a,24) & li_32(FF00FF00)))
+
+
+/* NIST API begin */
+/*
+ * Groestl hashing context. BitSequence is not defined in this header;
+ * presumably it comes from hash.h.
+ */
+typedef struct {
+  uint32_t chaining[SIZE512/sizeof(uint32_t)];            /* actual state */
+  uint32_t block_counter1,
+  block_counter2;         /* message block counter(s) */
+  BitSequence buffer[SIZE512];      /* data buffer */
+  int buf_ptr;              /* data buffer pointer */
+  int bits_in_last_byte;    /* no. of message bits in last byte of
+			       data buffer */
+} groestlHashState;
+
+/*void Init(hashState*);
+void Update(hashState*, const BitSequence*, DataLength);
+void Final(hashState*, BitSequence*); */
+void groestl(const BitSequence*, DataLength, BitSequence*);
+/* NIST API end   */
+
+/*
+int crypto_hash(unsigned char *out,
+		const unsigned char *in,
+		unsigned long long len);
+*/
+
+#endif /* __hash_h */
--- a/src/crypto/cn/c_jh.c
+++ b/src/crypto/cn/c_jh.c
@@ -0,0 +1,367 @@
+/*This program gives the 64-bit optimized bitslice implementation of JH using ANSI C
+
+   --------------------------------
+   Performance
+
+   Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz)
+   Operating System: 64-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic)
+   Speed for long message:
+   1) 45.8 cycles/byte   compiler: Intel C++ Compiler 11.1   compilation option: icc -O2
+   2) 56.8 cycles/byte   compiler: gcc 4.4.3                 compilation option: gcc -O3
+
+   --------------------------------
+   Last Modified: January 16, 2011
+*/
+
+#include "c_jh.h"
+
+#include <stdint.h>
+#include <string.h>
+
+/*typedef unsigned long long uint64;*/
+typedef uint64_t uint64;
+
+/*define data alignment for different C compilers*/
+#if defined(__GNUC__)
+      #define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
+#else
+      #define DATA_ALIGN16(x) __declspec(align(16)) x
+#endif
+
+
+typedef struct {
+	int hashbitlen;	   	              /*the message digest size in bits (Init knows 224, 256, 384 and 512)*/
+	unsigned long long databitlen;    /*total message size in bits, accumulated by every Update call*/
+	unsigned long long datasize_in_buffer;      /*number of message bits still held in buffer and not yet compressed; assumed to be a multiple of 8 bits except for the last partial block at the end of the message*/
+	DATA_ALIGN16(uint64 x[8][2]);     /*the 1024-bit state, ( x[i][0] || x[i][1] ) is the ith row of the state in the pseudocode*/
+	unsigned char buffer[64];         /*the 512-bit message block to be hashed*/
+} hashState;
+
+
+/*The initial hash value H(0)*/
+const unsigned char JH224_H0[128]={0x2d,0xfe,0xdd,0x62,0xf9,0x9a,0x98,0xac,0xae,0x7c,0xac,0xd6,0x19,0xd6,0x34,0xe7,0xa4,0x83,0x10,0x5,0xbc,0x30,0x12,0x16,0xb8,0x60,0x38,0xc6,0xc9,0x66,0x14,0x94,0x66,0xd9,0x89,0x9f,0x25,0x80,0x70,0x6f,0xce,0x9e,0xa3,0x1b,0x1d,0x9b,0x1a,0xdc,0x11,0xe8,0x32,0x5f,0x7b,0x36,0x6e,0x10,0xf9,0x94,0x85,0x7f,0x2,0xfa,0x6,0xc1,0x1b,0x4f,0x1b,0x5c,0xd8,0xc8,0x40,0xb3,0x97,0xf6,0xa1,0x7f,0x6e,0x73,0x80,0x99,0xdc,0xdf,0x93,0xa5,0xad,0xea,0xa3,0xd3,0xa4,0x31,0xe8,0xde,0xc9,0x53,0x9a,0x68,0x22,0xb4,0xa9,0x8a,0xec,0x86,0xa1,0xe4,0xd5,0x74,0xac,0x95,0x9c,0xe5,0x6c,0xf0,0x15,0x96,0xd,0xea,0xb5,0xab,0x2b,0xbf,0x96,0x11,0xdc,0xf0,0xdd,0x64,0xea,0x6e};
+const unsigned char JH256_H0[128]={0xeb,0x98,0xa3,0x41,0x2c,0x20,0xd3,0xeb,0x92,0xcd,0xbe,0x7b,0x9c,0xb2,0x45,0xc1,0x1c,0x93,0x51,0x91,0x60,0xd4,0xc7,0xfa,0x26,0x0,0x82,0xd6,0x7e,0x50,0x8a,0x3,0xa4,0x23,0x9e,0x26,0x77,0x26,0xb9,0x45,0xe0,0xfb,0x1a,0x48,0xd4,0x1a,0x94,0x77,0xcd,0xb5,0xab,0x26,0x2,0x6b,0x17,0x7a,0x56,0xf0,0x24,0x42,0xf,0xff,0x2f,0xa8,0x71,0xa3,0x96,0x89,0x7f,0x2e,0x4d,0x75,0x1d,0x14,0x49,0x8,0xf7,0x7d,0xe2,0x62,0x27,0x76,0x95,0xf7,0x76,0x24,0x8f,0x94,0x87,0xd5,0xb6,0x57,0x47,0x80,0x29,0x6c,0x5c,0x5e,0x27,0x2d,0xac,0x8e,0xd,0x6c,0x51,0x84,0x50,0xc6,0x57,0x5,0x7a,0xf,0x7b,0xe4,0xd3,0x67,0x70,0x24,0x12,0xea,0x89,0xe3,0xab,0x13,0xd3,0x1c,0xd7,0x69};
+const unsigned char JH384_H0[128]={0x48,0x1e,0x3b,0xc6,0xd8,0x13,0x39,0x8a,0x6d,0x3b,0x5e,0x89,0x4a,0xde,0x87,0x9b,0x63,0xfa,0xea,0x68,0xd4,0x80,0xad,0x2e,0x33,0x2c,0xcb,0x21,0x48,0xf,0x82,0x67,0x98,0xae,0xc8,0x4d,0x90,0x82,0xb9,0x28,0xd4,0x55,0xea,0x30,0x41,0x11,0x42,0x49,0x36,0xf5,0x55,0xb2,0x92,0x48,0x47,0xec,0xc7,0x25,0xa,0x93,0xba,0xf4,0x3c,0xe1,0x56,0x9b,0x7f,0x8a,0x27,0xdb,0x45,0x4c,0x9e,0xfc,0xbd,0x49,0x63,0x97,0xaf,0xe,0x58,0x9f,0xc2,0x7d,0x26,0xaa,0x80,0xcd,0x80,0xc0,0x8b,0x8c,0x9d,0xeb,0x2e,0xda,0x8a,0x79,0x81,0xe8,0xf8,0xd5,0x37,0x3a,0xf4,0x39,0x67,0xad,0xdd,0xd1,0x7a,0x71,0xa9,0xb4,0xd3,0xbd,0xa4,0x75,0xd3,0x94,0x97,0x6c,0x3f,0xba,0x98,0x42,0x73,0x7f};
+const unsigned char JH512_H0[128]={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b};
+
+/*42 round constants, each round constant is 32-byte (256-bit)*/
+const unsigned char E8_bitslice_roundconstant[42][32]={
+{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40},
+{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31},
+{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc},
+{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3},
+{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23},
+{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97},
+{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14},
+{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4},
+{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36},
+{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f},
+{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b},
+{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62},
+{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5},
+{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f},
+{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a},
+{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf},
+{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0},
+{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a},
+{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6},
+{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67},
+{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18},
+{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e},
+{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1},
+{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83},
+{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef},
+{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65},
+{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c},
+{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71},
+{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0},
+{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f},
+{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad},
+{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6},
+{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63},
+{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f},
+{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a},
+{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5},
+{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48},
+{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e},
+{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7},
+{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde},
+{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a},
+{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}};
+
+
+static void E8(hashState *state);  /*The bijective function E8, in bitslice form*/
+static void F8(hashState *state);  /*The compression function F8 */
+
+/*The API functions*/
+static HashReturn Init(hashState *state, int hashbitlen);
+static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen);
+static HashReturn Final(hashState *state, BitSequence *hashval);
+HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval);
+
+/*SWAPn(x) rewrites x in place; x is evaluated more than once and each expansion already ends in ';'. SWAP1: swap bit 2i with bit 2i+1 of 64-bit x*/
+#define SWAP1(x)   (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1));
+/*swap bits 4i||4i+1 with bits 4i+2||4i+3 of 64-bit x (adjacent 2-bit pairs)*/
+#define SWAP2(x)   (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2));
+/*swap bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of 64-bit x (the two nibbles of every byte)*/
+#define SWAP4(x)   (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4));
+/*swap bits 16i||16i+1||......||16i+7  with bits 16i+8||16i+9||......||16i+15 of 64-bit x (adjacent bytes)*/
+#define SWAP8(x)   (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8));
+/*swap bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 64-bit x (adjacent 16-bit halves)*/
+#define SWAP16(x)  (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16));
+/*swap bits 64i||64i+1||......||64i+31 with bits 64i+32||64i+33||......||64i+63 of 64-bit x (the two 32-bit halves)*/
+#define SWAP32(x)  (x) = (((x) << 32) | ((x) >> 32));
+
+/*The MDS transform: eight in-place XOR statements on the arguments; order matters, m0..m3 are updated from the already-updated m4..m7*/
+#define L(m0,m1,m2,m3,m4,m5,m6,m7) \
+      (m4) ^= (m1);                \
+      (m5) ^= (m2);                \
+      (m6) ^= (m0) ^ (m3);         \
+      (m7) ^= (m0);                \
+      (m0) ^= (m5);                \
+      (m1) ^= (m6);                \
+      (m2) ^= (m4) ^ (m7);         \
+      (m3) ^= (m4);
+
+/*Two Sboxes are computed in parallel (m0..m3 with constant cc0, m4..m7 with constant cc1); each Sbox implements S0 and S1, selected by a constant bit*/
+/*The reason to compute two Sboxes in parallel is to try to fully utilize the parallel processing power. NOTE: the expansion assigns temp0 and temp1, which are not parameters - the caller must declare them*/
+#define SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1)   \
+      m3  = ~(m3);                  \
+      m7  = ~(m7);                  \
+      m0 ^= ((~(m2)) & (cc0));      \
+      m4 ^= ((~(m6)) & (cc1));      \
+      temp0 = (cc0) ^ ((m0) & (m1));\
+      temp1 = (cc1) ^ ((m4) & (m5));\
+      m0 ^= ((m2) & (m3));          \
+      m4 ^= ((m6) & (m7));          \
+      m3 ^= ((~(m1)) & (m2));       \
+      m7 ^= ((~(m5)) & (m6));       \
+      m1 ^= ((m0) & (m2));          \
+      m5 ^= ((m4) & (m6));          \
+      m2 ^= ((m0) & (~(m3)));       \
+      m6 ^= ((m4) & (~(m7)));       \
+      m0 ^= ((m1) | (m3));          \
+      m4 ^= ((m5) | (m7));          \
+      m3 ^= ((m1) & (m2));          \
+      m7 ^= ((m5) & (m6));          \
+      m1 ^= (temp0 & (m0));         \
+      m5 ^= (temp1 & (m4));         \
+      m2 ^= temp0;                  \
+      m6 ^= temp1;
+
+/*The bijective function E8, in bitslice form: 42 rounds run as 6 passes of 7 rounds each, in place on state->x*/
+static void E8(hashState *state)
+{
+      uint64 i,roundnumber,temp0,temp1;  /*temp0,temp1: scratch assigned inside the SS macro; temp0 is reused for the last swap*/
+
+      for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) {
+            /*round roundnumber+0: Sbox, MDS and Swapping layers (i picks the 64-bit half of each row; words i and i+2 of the round constant feed cc0 and cc1)*/
+            for (i = 0; i < 2; i++) {
+                  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i+2] );
+                  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+                  SWAP1(state->x[1][i]); SWAP1(state->x[3][i]); SWAP1(state->x[5][i]); SWAP1(state->x[7][i]);
+            }
+
+            /*round roundnumber+1: Sbox, MDS and Swapping layers*/
+            for (i = 0; i < 2; i++) {
+                  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i+2] );
+                  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+                  SWAP2(state->x[1][i]); SWAP2(state->x[3][i]); SWAP2(state->x[5][i]); SWAP2(state->x[7][i]);
+            }
+
+            /*round roundnumber+2: Sbox, MDS and Swapping layers*/
+            for (i = 0; i < 2; i++) {
+                  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i+2] );
+                  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+                  SWAP4(state->x[1][i]); SWAP4(state->x[3][i]); SWAP4(state->x[5][i]); SWAP4(state->x[7][i]);
+            }
+
+            /*round roundnumber+3: Sbox, MDS and Swapping layers*/
+            for (i = 0; i < 2; i++) {
+                  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i+2] );
+                  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+                  SWAP8(state->x[1][i]); SWAP8(state->x[3][i]); SWAP8(state->x[5][i]); SWAP8(state->x[7][i]);
+            }
+
+            /*round roundnumber+4: Sbox, MDS and Swapping layers*/
+            for (i = 0; i < 2; i++) {
+                  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i+2] );
+                  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+                  SWAP16(state->x[1][i]); SWAP16(state->x[3][i]); SWAP16(state->x[5][i]); SWAP16(state->x[7][i]);
+            }
+
+            /*round roundnumber+5: Sbox, MDS and Swapping layers*/
+            for (i = 0; i < 2; i++) {
+                  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i+2] );
+                  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+                  SWAP32(state->x[1][i]); SWAP32(state->x[3][i]); SWAP32(state->x[5][i]); SWAP32(state->x[7][i]);
+            }
+
+            /*round roundnumber+6: Sbox and MDS layers*/
+            for (i = 0; i < 2; i++) {
+                  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i+2] );
+                  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+            }
+            /*round roundnumber+6: swapping layer - exchange the two 64-bit halves of every odd-numbered row*/
+            for (i = 1; i < 8; i = i+2) {
+                  temp0 = state->x[i][0]; state->x[i][0] = state->x[i][1]; state->x[i][1] = temp0;
+            }
+      }
+
+}
+
+/*The compression function F8: XOR the block in state->buffer into the first half of the state, apply E8, XOR it into the second half*/
+static void F8(hashState *state)
+{
+      uint64  w;
+
+      /*fold the 512-bit message block into the first half of the 1024-bit hash state (rows x[0]..x[3])*/
+      for (w = 0; w < 8; w++)  state->x[w / 2][w % 2] ^= ((uint64*)state->buffer)[w];
+
+      /*run the bijective function E8 over the whole state*/
+      E8(state);
+
+      /*fold the same 512-bit message block into the second half of the hash state (rows x[4]..x[7])*/
+      for (w = 0; w < 8; w++)  state->x[4 + (w / 2)][w % 2] ^= ((uint64*)state->buffer)[w];
+}
+
+/*before hashing a message, initialize the hash state as H0; returns BAD_HASHLEN (state->x left unset) unless hashbitlen is 224, 256, 384 or 512*/
+static HashReturn Init(hashState *state, int hashbitlen)
+{
+      state->databitlen = 0;
+      state->datasize_in_buffer = 0;
+
+      /*remember the digest size; Final uses it to truncate the state*/
+      state->hashbitlen = hashbitlen;
+
+      /*load the 128-byte initial hash value for this digest size into state*/
+      switch (hashbitlen) {
+            case 224: memcpy(state->x,JH224_H0,128); break;
+            case 256: memcpy(state->x,JH256_H0,128); break;
+            case 384: memcpy(state->x,JH384_H0,128); break;
+            case 512: memcpy(state->x,JH512_H0,128); break;
+            default:  return(BAD_HASHLEN);  /*used to fall out of the switch and report SUCCESS with x uninitialized*/
+      }
+
+      return(SUCCESS);
+}
+
+
+/*hash each 512-bit message block, except the last partial block (its bits stay in state->buffer for a later Update or Final)*/
+static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
+{
+      DataLength index; /*byte offset into data of the next byte to be consumed*/
+
+      state->databitlen += databitlen;
+      index = 0;
+
+      /*if there is remaining data in the buffer, fill it to a full message block first*/
+      /*we assume that the size of the data in the buffer is a multiple of 8 bits if it is not at the end of a message*/
+
+      /*There is data in the buffer, but the incoming data is insufficient for a full block*/
+      if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512)  ) {
+            /*append exactly the bytes supplied, i.e. ceil(databitlen/8); the total stays below 512 bits, so this ends inside buffer.*/
+            /*The old code always copied 64-(datasize_in_buffer>>3) bytes (+1 for a partial byte): it read past the end of data*/
+            /*and, in the +1 case, wrote one byte past the end of state->buffer.*/
+            memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, (databitlen + 7) >> 3);
+            state->datasize_in_buffer += databitlen;
+            databitlen = 0;
+      }
+
+      /*There is data in the buffer, and the incoming data is sufficient for a full block*/
+      if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512)  ) {
+            memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
+            index = 64-(state->datasize_in_buffer >> 3);
+            databitlen = databitlen - (512 - state->datasize_in_buffer);
+            F8(state);
+            state->datasize_in_buffer = 0;
+      }
+
+      /*hash the remaining full message blocks*/
+      for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) {
+            memcpy(state->buffer, data+index, 64);
+            F8(state);
+      }
+
+      /*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/
+      if ( databitlen > 0) {
+            if ((databitlen & 7) == 0)
+                  memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3);
+            else
+                  memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1);
+            state->datasize_in_buffer = databitlen;
+      }
+
+      return(SUCCESS);
+}
+
+/*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest
+
+   Padding as implemented below:
+   - a single '1' bit follows the last message bit, the rest of that block is zero;
+   - the last 8 bytes of the final block hold state->databitlen, most significant byte first;
+   - when databitlen is a multiple of 512 the '1' bit and the length share one extra block,
+     otherwise the partial block takes the '1' bit and the length goes into a second block.
+   Unused low bits of a trailing partial byte are assumed to be zero already; they are not cleared here.
+
+   hashval receives the last hashbitlen/8 bytes of the 128-byte state (28, 32, 48 or 64 bytes).
+*/
+static HashReturn Final(hashState *state, BitSequence *hashval)
+{
+      const unsigned int tailbits = (unsigned int)(state->databitlen & 0x1ff);  /*message bits in the last, partial block*/
+      unsigned int k;
+
+      if (tailbits != 0) {
+            /*zero every buffer byte after the last one that holds message bits*/
+            k = tailbits >> 3;
+            if ((state->datasize_in_buffer & 7) != 0)
+                  k++;
+            for ( ; k < 64; k++)
+                  state->buffer[k] = 0;
+
+            /*append the '1' padding bit right after the message and compress the partial block*/
+            state->buffer[tailbits >> 3] |= 1 << (7 - (state->databitlen & 7));
+            F8(state);
+
+            /*the length goes into a second block that is otherwise all zero*/
+            memset(state->buffer, 0, 64);
+      }
+      else {
+            /*databitlen is a multiple of 512 bits: one fresh block carries both the '1' bit and the length*/
+            memset(state->buffer, 0, 64);
+            state->buffer[0]  = 0x80;
+      }
+
+      /*store databitlen in the last 8 bytes of the block (buffer[56] = most significant byte), then compress it*/
+      for (k = 0; k < 8; k++)
+            state->buffer[63 - k] = (state->databitlen >> (8 * k)) & 0xff;
+      F8(state);
+
+      /*truncating the final hash value to generate the message digest*/
+      switch(state->hashbitlen) {
+            case 224: memcpy(hashval,(unsigned char*)state->x+64+36,28);  break;
+            case 256: memcpy(hashval,(unsigned char*)state->x+64+32,32);  break;
+            case 384: memcpy(hashval,(unsigned char*)state->x+64+16,48);  break;
+            case 512: memcpy(hashval,(unsigned char*)state->x+64,64);     break;
+      }
+
+      return(SUCCESS);
+}
+
+/* one-shot JH digest of a complete message,
+   three inputs: message digest size in bits (hashbitlen: 224, 256, 384 or 512); message (data); message length in bits (databitlen)
+   one output:   message digest (hashval); returns BAD_HASHLEN without touching hashval for any other digest size, else SUCCESS
+*/
+HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval)
+{
+      hashState ctx;
+
+      /*reject unsupported digest sizes before the context is used*/
+      if ( hashbitlen != 224 && hashbitlen != 256 && hashbitlen != 384 && hashbitlen != 512 )
+            return(BAD_HASHLEN);
+
+      Init(&ctx, hashbitlen);
+      Update(&ctx, data, databitlen);
+      Final(&ctx, hashval);
+      return SUCCESS;
+}
--- a/src/crypto/cn/c_jh.h
+++ b/src/crypto/cn/c_jh.h
@@ -0,0 +1,19 @@
+/*This program gives the 64-bit optimized bitslice implementation of JH using ANSI C
+
+   --------------------------------
+   Performance
+
+   Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz)
+   Operating System: 64-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic)
+   Speed for long message:
+   1) 45.8 cycles/byte   compiler: Intel C++ Compiler 11.1   compilation option: icc -O2
+   2) 56.8 cycles/byte   compiler: gcc 4.4.3                 compilation option: gcc -O3
+
+   --------------------------------
+   Last Modified: January 16, 2011
+*/
+#pragma once
+
+#include "hash.h"
+
+HashReturn jh_hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
--- a/src/crypto/cn/c_skein.c
+++ b/src/crypto/cn/c_skein.c
@@ -0,0 +1,701 @@
+/***********************************************************************
+**
+** Implementation of the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+** 
+************************************************************************/
+
+#define  SKEIN_PORT_CODE /* instantiate any code in skein_port.h */
+
+#include <stddef.h>                          /* get size_t definition */
+#include <string.h>      /* get the memcpy/memset functions */
+#include "c_skein.h"       /* get the Skein API definitions   */
+
+#ifndef SKEIN_512_NIST_MAX_HASHBITS
+#define SKEIN_512_NIST_MAX_HASHBITS (512)
+#endif
+
+#define  SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */
+
+#define  SKEIN_512_STATE_WORDS ( 8)
+#define  SKEIN_MAX_STATE_WORDS (16)
+
+#define  SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define  SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
+#define  SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
+
+/* Pseudo "round numbers" handed to the Skein_Show_* debug hooks to tag the
+** initial key injection, later key injections and the feed-forward step.
+** The hooks are defined as empty macros in this file. */
+#define SKEIN_RND_SPECIAL       (1000u)
+#define SKEIN_RND_KEY_INITIAL   (SKEIN_RND_SPECIAL+0u)
+#define SKEIN_RND_KEY_INJECT    (SKEIN_RND_SPECIAL+1u)
+#define SKEIN_RND_FEED_FWD      (SKEIN_RND_SPECIAL+2u)
+
+/* Header placed at the start of a Skein context: output size, number of
+** bytes currently buffered, and the two tweak words. */
+typedef struct
+{
+  size_t  hashBitLen;                      /* size of hash result, in bits */
+  size_t  bCnt;                            /* current byte count in buffer b[] */
+  u64b_t  T[SKEIN_MODIFIER_WORDS];         /* tweak words: T[0]=byte cnt, T[1]=flags */
+} Skein_Ctxt_Hdr_t;
+
+/* Complete state for one Skein-512 computation: the header above, the
+** chaining variables, and a one-block buffer for input that has not yet
+** filled a whole block. */
+typedef struct                               /*  512-bit Skein hash context structure */
+{
+  Skein_Ctxt_Hdr_t h;                      /* common header context variables */
+  u64b_t  X[SKEIN_512_STATE_WORDS];        /* chaining variables */
+  u08b_t  b[SKEIN_512_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
+} Skein_512_Ctxt_t;
+
+/*   Skein APIs for (incremental) "straight hashing" */
+static int  Skein_512_Init  (Skein_512_Ctxt_t *ctx, size_t hashBitLen);
+static int  Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+static int  Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+
+#ifndef SKEIN_TREE_HASH
+#define SKEIN_TREE_HASH (1)
+#endif
+
+/*****************************************************************
+** "Internal" Skein definitions
+**    -- not needed for sequential hashing API, but will be 
+**           helpful for other uses of Skein (e.g., tree hash mode).
+**    -- included here so that they can be shared between
+**           reference and optimized code.
+******************************************************************/
+
+/* tweak word T[1]: bit field starting positions */
+#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */
+
+#define SKEIN_T1_POS_TREE_LVL   SKEIN_T1_BIT(112)       /* bits 112..118: level in hash tree       */
+#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
+#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
+#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bits 126     : first block flag         */
+#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
+
+/* tweak word T[1]: flag bit definition(s) */
+#define SKEIN_T1_FLAG_FIRST     (((u64b_t)  1 ) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_FINAL     (((u64b_t)  1 ) << SKEIN_T1_POS_FINAL)
+#define SKEIN_T1_FLAG_BIT_PAD   (((u64b_t)  1 ) << SKEIN_T1_POS_BIT_PAD)
+
+/* tweak word T[1]: tree level bit field mask */
+#define SKEIN_T1_TREE_LVL_MASK  (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
+#define SKEIN_T1_TREE_LEVEL(n)  (((u64b_t) (n)) << SKEIN_T1_POS_TREE_LVL)
+
+/* tweak word T[1]: block type field */
+#define SKEIN_BLK_TYPE_KEY      ( 0)                    /* key, for MAC and KDF */
+#define SKEIN_BLK_TYPE_CFG      ( 4)                    /* configuration block */
+#define SKEIN_BLK_TYPE_PERS     ( 8)                    /* personalization string */
+#define SKEIN_BLK_TYPE_PK       (12)                    /* public key (for digital signature hashing) */
+#define SKEIN_BLK_TYPE_KDF      (16)                    /* key identifier for KDF */
+#define SKEIN_BLK_TYPE_NONCE    (20)                    /* nonce for PRNG */
+#define SKEIN_BLK_TYPE_MSG      (48)                    /* message processing */
+#define SKEIN_BLK_TYPE_OUT      (63)                    /* output stage */
+#define SKEIN_BLK_TYPE_MASK     (63)                    /* bit field mask */
+
+#define SKEIN_T1_BLK_TYPE(T)   (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+#define SKEIN_T1_BLK_TYPE_KEY   SKEIN_T1_BLK_TYPE(KEY)  /* key, for MAC and KDF */
+#define SKEIN_T1_BLK_TYPE_CFG   SKEIN_T1_BLK_TYPE(CFG)  /* configuration block */
+#define SKEIN_T1_BLK_TYPE_PERS  SKEIN_T1_BLK_TYPE(PERS) /* personalization string */
+#define SKEIN_T1_BLK_TYPE_PK    SKEIN_T1_BLK_TYPE(PK)   /* public key (for digital signature hashing) */
+#define SKEIN_T1_BLK_TYPE_KDF   SKEIN_T1_BLK_TYPE(KDF)  /* key identifier for KDF */
+#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */
+#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing */
+#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
+#define SKEIN_T1_BLK_TYPE_MASK  SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */
+
+#define SKEIN_T1_BLK_TYPE_CFG_FINAL       (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL       (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+
+#define SKEIN_VERSION           (1)
+
+#ifndef SKEIN_ID_STRING_LE      /* allow compile-time personalization */
+#define SKEIN_ID_STRING_LE      (0x33414853)            /* "SHA3" (little-endian)*/
+#endif
+
+#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((u64b_t) (hi32)) << 32))
+#define SKEIN_SCHEMA_VER        SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)
+#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
+
+#define SKEIN_CFG_STR_LEN       (4*8)
+
+/* bit field definitions in config block treeInfo word */
+#define SKEIN_CFG_TREE_LEAF_SIZE_POS  ( 0)
+#define SKEIN_CFG_TREE_NODE_SIZE_POS  ( 8)
+#define SKEIN_CFG_TREE_MAX_LEVEL_POS  (16)
+
+#define SKEIN_CFG_TREE_LEAF_SIZE_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
+#define SKEIN_CFG_TREE_NODE_SIZE_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
+#define SKEIN_CFG_TREE_MAX_LEVEL_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
+
+#define SKEIN_CFG_TREE_INFO(leaf,node,maxLvl)                   \
+  ( (((u64b_t)(leaf  )) << SKEIN_CFG_TREE_LEAF_SIZE_POS) |    \
+  (((u64b_t)(node  )) << SKEIN_CFG_TREE_NODE_SIZE_POS) |    \
+  (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) )
+
+#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0,0,0) /* use as treeInfo in InitExt() call for sequential processing */
+
+/*
+**   Skein macros for getting/setting tweak words, etc.
+**   These are useful for partial input bytes, hash tree init/update, etc.
+**/
+/* read / write one tweak word; ctxPtr must point at a context with an "h" header.
+** NOTE: the Set macros expand to a brace-enclosed statement, not an expression. */
+#define Skein_Get_Tweak(ctxPtr,TWK_NUM)         ((ctxPtr)->h.T[TWK_NUM])
+#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal)    {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
+
+/* shorthands for tweak word 0 (byte count) and tweak word 1 (flags) */
+#define Skein_Get_T0(ctxPtr)    Skein_Get_Tweak(ctxPtr,0)
+#define Skein_Get_T1(ctxPtr)    Skein_Get_Tweak(ctxPtr,1)
+#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
+#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
+
+/* set both tweak words at once */
+#define Skein_Set_T0_T1(ctxPtr,T0,T1)           \
+{                                           \
+  Skein_Set_T0(ctxPtr,(T0));                  \
+  Skein_Set_T1(ctxPtr,(T1));                  \
+}
+
+/* overwrite T[1] with the block type constant SKEIN_T1_BLK_TYPE_<BLK_TYPE> */
+#define Skein_Set_Type(ctxPtr,BLK_TYPE)         \
+  Skein_Set_T1(ctxPtr,SKEIN_T1_BLK_TYPE_##BLK_TYPE)
+
+/* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */
+#define Skein_Start_New_Type(ctxPtr,BLK_TYPE)   \
+{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
+
+/* these take the header struct itself (e.g. ctx->h), not a context pointer */
+#define Skein_Clear_First_Flag(hdr)      { (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST;       }
+#define Skein_Set_Bit_Pad_Flag(hdr)      { (hdr).T[1] |=  SKEIN_T1_FLAG_BIT_PAD;     }
+
+#define Skein_Set_Tree_Level(hdr,height) { (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);}
+
+/*****************************************************************
+** "Internal" Skein definitions for debugging and error checking
+******************************************************************/
+#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr)
+#define Skein_Show_Round(bits,ctx,r,X)
+#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr)
+#define Skein_Show_Final(bits,ctx,cnt,outPtr)
+#define Skein_Show_Key(bits,ctx,key,keyBytes)
+
+
+/* Three build configurations:
+**   SKEIN_ERR_CHECK undefined             : both macros expand to nothing
+**   SKEIN_ERR_CHECK and SKEIN_ASSERT set  : both macros call assert()
+**   SKEIN_ERR_CHECK set, SKEIN_ASSERT not : Skein_Assert returns retCode from
+**                                           the enclosing function,
+**                                           Skein_assert calls assert()
+*/
+#ifndef SKEIN_ERR_CHECK        /* run-time checks (e.g., bad params, uninitialized context)? */
+#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */
+#define Skein_assert(x)
+#elif   defined(SKEIN_ASSERT)
+#include <assert.h>     
+#define Skein_Assert(x,retCode) assert(x) 
+#define Skein_assert(x)         assert(x) 
+#else
+#include <assert.h>     
+#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /*  caller  error */
+#define Skein_assert(x)         assert(x)                     /* internal error */
+#endif
+
+/*****************************************************************
+** Skein block function constants (shared across Ref and Opt code)
+******************************************************************/
+/* Naming is R_512_<d>_<j>: the block function passes the prefix R_512_<d>
+** (d = 0..7, one per round of an 8-round group) and the round macro appends
+** _0.._3 to pick the rotation for each of its four word pairs. */
+enum    
+{   
+  /* Skein_512 round rotation constants */
+  R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
+  R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
+  R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
+  R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
+  R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
+  R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
+  R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
+  R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
+};
+
+#ifndef SKEIN_ROUNDS
+#define SKEIN_512_ROUNDS_TOTAL (72)
+#else                                        /* allow command-line define in range 8*(5..14)   */
+#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5))
+#endif
+
+
+/*
+***************** Pre-computed Skein IVs *******************
+**
+** NOTE: these values are not "magic" constants, but
+** are generated using the Threefish block function.
+** They are pre-computed here only for speed; i.e., to
+** avoid the need for a Threefish call during Init().
+**
+** The IV for any fixed hash length may be pre-computed.
+** Only the most common values are included here.
+**
+************************************************************
+**/
+
+#define MK_64 SKEIN_MK_64
+
+/* blkSize =  512 bits. hashSize =  256 bits */
+/* Initial chaining values for Skein-512 with a 256-bit output. They are
+** copied straight into ctx->X when a 256-bit hash is set up, so no
+** configuration block has to be processed for that case. Each entry is
+** written as MK_64(high 32 bits, low 32 bits). */
+const u64b_t SKEIN_512_IV_256[] =
+    {
+    MK_64(0xCCD044A1,0x2FDB3E13),
+    MK_64(0xE8359030,0x1A79A9EB),
+    MK_64(0x55AEA061,0x4F816E6F),
+    MK_64(0x2A2767A4,0xAE9B94DB),
+    MK_64(0xEC06025E,0x74DD7683),
+    MK_64(0xE7A436CD,0xC4746251),
+    MK_64(0xC36FBAF9,0x393AD185),
+    MK_64(0x3EEDBA18,0x33EDFC13)
+    };
+
+#ifndef SKEIN_USE_ASM
+#define SKEIN_USE_ASM   (0)                     /* default is all C code (no ASM) */
+#endif
+
+#ifndef SKEIN_LOOP
+#define SKEIN_LOOP 001                          /* default: unroll 256 and 512, but not 1024 */
+#endif
+
+/* Helpers that are only meaningful inside a block function which declares
+** WCNT and a local array kw[]: the tweak words start at kw[KW_TWK_BASE]
+** (index 0) and the key schedule words start at kw[KW_KEY_BASE] (index 3). */
+#define BLK_BITS        (WCNT*64)               /* some useful definitions for code here */
+#define KW_TWK_BASE     (0)
+#define KW_KEY_BASE     (3)
+#define ks              (kw + KW_KEY_BASE)                
+#define ts              (kw + KW_TWK_BASE)
+
+/* with SKEIN_DEBUG, copy the working tweak back into the context header */
+#ifdef SKEIN_DEBUG
+#define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; }
+#else
+#define DebugSaveTweak(ctx)
+#endif
+
+/*****************************  Skein_512 ******************************/
+#if !(SKEIN_USE_ASM & 512)
+/*
+** Skein_512_Process_Block: run the Skein-512 block function over blkCnt
+** consecutive blocks of SKEIN_512_BLOCK_BYTES bytes starting at blkPtr.
+**
+**   ctx        -- context; ctx->X (chaining variables) and ctx->h.T (tweak
+**                 words) are read on entry and updated in place
+**   blkPtr     -- first input block; advanced by one block per iteration
+**   blkCnt     -- number of blocks; must be nonzero (the loop is do/while)
+**   byteCntAdd -- value added to tweak word 0 before each block is processed
+**
+** After every block the FIRST flag is cleared in the working copy of T[1].
+*/
+static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+    { /* do it in C */
+    enum
+        {
+        WCNT = SKEIN_512_STATE_WORDS
+        };
+#undef  RCNT
+#define RCNT  (SKEIN_512_ROUNDS_TOTAL/8)
+
+#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10)
+#else
+#define SKEIN_UNROLL_512 (0)
+#endif
+
+    /* SKEIN_UNROLL_512 == 0 selects the fully unrolled variant; a nonzero value
+       selects the looping variant, which needs a loop counter and a longer kw[] */
+#if SKEIN_UNROLL_512
+#if (RCNT % SKEIN_UNROLL_512)
+#error "Invalid SKEIN_UNROLL_512"               /* sanity check on unroll count */
+#endif
+    size_t  r;
+    u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+#else
+    u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+#endif
+    u64b_t  X0,X1,X2,X3,X4,X5,X6,X7;            /* local copy of vars, for speed */
+    u64b_t  w [WCNT];                           /* local copy of input block */
+#ifdef SKEIN_DEBUG
+    const u64b_t *Xptr[8];                      /* use for debugging (help compiler put Xn in registers) */
+    Xptr[0] = &X0;  Xptr[1] = &X1;  Xptr[2] = &X2;  Xptr[3] = &X3;
+    Xptr[4] = &X4;  Xptr[5] = &X5;  Xptr[6] = &X6;  Xptr[7] = &X7;
+#endif
+
+    Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+    /* work on a local copy of the tweak; it is written back after the loop */
+    ts[0] = ctx->h.T[0];
+    ts[1] = ctx->h.T[1];
+    do  {
+        /* this implementation only supports 2**64 input bytes (no carry out here) */
+        ts[0] += byteCntAdd;                    /* update processed length */
+
+        /* precompute the key schedule for this block */
+        ks[0] = ctx->X[0];
+        ks[1] = ctx->X[1];
+        ks[2] = ctx->X[2];
+        ks[3] = ctx->X[3];
+        ks[4] = ctx->X[4];
+        ks[5] = ctx->X[5];
+        ks[6] = ctx->X[6];
+        ks[7] = ctx->X[7];
+        /* extra key word: XOR of all eight key words and the parity constant */
+        ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ 
+                ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
+
+        /* extra tweak word: XOR of the two tweak words */
+        ts[2] = ts[0] ^ ts[1];
+
+        Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+        DebugSaveTweak(ctx);
+        Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+
+        X0   = w[0] + ks[0];                    /* do the first full key injection */
+        X1   = w[1] + ks[1];
+        X2   = w[2] + ks[2];
+        X3   = w[3] + ks[3];
+        X4   = w[4] + ks[4];
+        X5   = w[5] + ks[5] + ts[0];
+        X6   = w[6] + ks[6] + ts[1];
+        X7   = w[7] + ks[7];
+
+        blkPtr += SKEIN_512_BLOCK_BYTES;
+
+        Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
+        /* run the rounds */
+        /* Round512: for each of four word pairs do add, rotate-left, xor */
+#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                  \
+    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
+    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
+    X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
+    X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
+
+        /* R512 = one round plus debug hook; I512 = one key injection.
+           The unrolled variant indexes ks/ts modulo 9 and 3; the looping
+           variant indexes by r and extends ks/ts as it goes. */
+#if SKEIN_UNROLL_512 == 0                       
+#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)      /* unrolled */  \
+    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
+
+#define I512(R)                                                     \
+    X0   += ks[((R)+1) % 9];   /* inject the key schedule value */  \
+    X1   += ks[((R)+2) % 9];                                        \
+    X2   += ks[((R)+3) % 9];                                        \
+    X3   += ks[((R)+4) % 9];                                        \
+    X4   += ks[((R)+5) % 9];                                        \
+    X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3];                      \
+    X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3];                      \
+    X7   += ks[((R)+8) % 9] +     (R)+1;                            \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+#else                                       /* looping version */
+#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
+    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);
+
+#define I512(R)                                                     \
+    X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
+    X1   += ks[r+(R)+1];                                            \
+    X2   += ks[r+(R)+2];                                            \
+    X3   += ks[r+(R)+3];                                            \
+    X4   += ks[r+(R)+4];                                            \
+    X5   += ks[r+(R)+5] + ts[r+(R)+0];                              \
+    X6   += ks[r+(R)+6] + ts[r+(R)+1];                              \
+    X7   += ks[r+(R)+7] +    r+(R)   ;                              \
+    ks[r +       (R)+8] = ks[r+(R)-1];  /* rotate key schedule */   \
+    ts[r +       (R)+2] = ts[r+(R)-1];                              \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+
+    for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512)   /* loop thru it */
+#endif                         /* end of looped code definitions */
+        {
+        /* in the unrolled variant there is no for-header above, so this is a plain block */
+#define R512_8_rounds(R)  /* do 8 full rounds */  \
+        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
+        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
+        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
+        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
+        I512(2*(R));                              \
+        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
+        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
+        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
+        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
+        I512(2*(R)+1);        /* and key injection */
+
+        R512_8_rounds( 0);
+
+        /* true when 8-round group NN must be emitted: either fully unrolled and
+           NN is below the total group count, or NN is below the unroll factor */
+#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN)))
+
+  #if   R512_Unroll_R( 1)
+        R512_8_rounds( 1);
+  #endif
+  #if   R512_Unroll_R( 2)
+        R512_8_rounds( 2);
+  #endif
+  #if   R512_Unroll_R( 3)
+        R512_8_rounds( 3);
+  #endif
+  #if   R512_Unroll_R( 4)
+        R512_8_rounds( 4);
+  #endif
+  #if   R512_Unroll_R( 5)
+        R512_8_rounds( 5);
+  #endif
+  #if   R512_Unroll_R( 6)
+        R512_8_rounds( 6);
+  #endif
+  #if   R512_Unroll_R( 7)
+        R512_8_rounds( 7);
+  #endif
+  #if   R512_Unroll_R( 8)
+        R512_8_rounds( 8);
+  #endif
+  #if   R512_Unroll_R( 9)
+        R512_8_rounds( 9);
+  #endif
+  #if   R512_Unroll_R(10)
+        R512_8_rounds(10);
+  #endif
+  #if   R512_Unroll_R(11)
+        R512_8_rounds(11);
+  #endif
+  #if   R512_Unroll_R(12)
+        R512_8_rounds(12);
+  #endif
+  #if   R512_Unroll_R(13)
+        R512_8_rounds(13);
+  #endif
+  #if   R512_Unroll_R(14)
+        R512_8_rounds(14);
+  #endif
+  #if  (SKEIN_UNROLL_512 > 14)
+#error  "need more unrolling in Skein_512_Process_Block"
+  #endif
+        }
+
+        /* do the final "feedforward" xor, update context chaining vars */
+        ctx->X[0] = X0 ^ w[0];
+        ctx->X[1] = X1 ^ w[1];
+        ctx->X[2] = X2 ^ w[2];
+        ctx->X[3] = X3 ^ w[3];
+        ctx->X[4] = X4 ^ w[4];
+        ctx->X[5] = X5 ^ w[5];
+        ctx->X[6] = X6 ^ w[6];
+        ctx->X[7] = X7 ^ w[7];
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+
+        /* only the first block of a sequence carries the FIRST flag */
+        ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+        }
+    while (--blkCnt);
+    /* publish the updated tweak words back to the context */
+    ctx->h.T[0] = ts[0];
+    ctx->h.T[1] = ts[1];
+    }
+#endif
+
+/*****************************************************************/
+/*     512-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation  */
+/*
+** Skein_512_Init: prepare ctx to hash a message with a hashBitLen-bit result.
+** Loads the chaining variables (from the precomputed table for 256 bits unless
+** SKEIN_NO_PRECOMP is defined, otherwise by processing a configuration block)
+** and then arms the context for message blocks. Returns SKEIN_SUCCESS.
+*/
+static int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
+{
+    union
+    {
+        u08b_t  b[SKEIN_512_STATE_BYTES];
+        u64b_t  w[SKEIN_512_STATE_WORDS];
+    } cfg;                                  /* configuration block, byte and word views */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    ctx->h.hashBitLen = hashBitLen;         /* remember the requested output size */
+
+#ifndef SKEIN_NO_PRECOMP
+    if (hashBitLen == 256)
+    {
+        /* precomputed initial value is available: just copy it */
+        memcpy(ctx->X,SKEIN_512_IV_256,sizeof(ctx->X));
+    }
+    else
+#endif
+    {
+        /* no precomputed value: derive it from a configuration block */
+        Skein_Start_New_Type(ctx,CFG_FINAL);        /* T0=0; T1=CFG | FINAL */
+
+        /* chaining variables start out as all zero */
+        memset(ctx->X,0,sizeof(ctx->X));
+
+        /* words 0..2 carry schema/version, output length and tree info */
+        cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+        cfg.w[1] = Skein_Swap64(hashBitLen);
+        cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+        /* the rest of the block is zero padding */
+        memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0]));
+
+        Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+    }
+
+    /* chaining variables are ready; switch to message processing (T0=0, T1=MSG) */
+    Skein_Start_New_Type(ctx,MSG);
+
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* process the input bytes */
+/*
+** Skein_512_Update: absorb msgByteCnt bytes from msg into ctx.
+** Whole blocks are processed as soon as at least one more byte is known to
+** follow them; the trailing 1..SKEIN_512_BLOCK_BYTES bytes stay buffered in
+** ctx->b. Returns SKEIN_SUCCESS.
+*/
+static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+{
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* does buffered + new data exceed one block? */
+    if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
+    {
+        if (ctx->h.bCnt != 0)
+        {
+            /* top up the partially filled buffer and process it */
+            const size_t room = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
+            if (room != 0)
+            {
+                Skein_assert(room < msgByteCnt);
+                memcpy(&ctx->b[ctx->h.bCnt],msg,room);
+                msg         += room;
+                msgByteCnt  -= room;
+                ctx->h.bCnt += room;
+            }
+            Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
+            Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+            ctx->h.bCnt = 0;
+        }
+
+        if (msgByteCnt > SKEIN_512_BLOCK_BYTES)
+        {
+            /* process full blocks straight from the caller's buffer; the
+               "-1" keeps at least one byte back for the buffer */
+            const size_t fullBlocks = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;
+            Skein_512_Process_Block(ctx,msg,fullBlocks,SKEIN_512_BLOCK_BYTES);
+            msg        += fullBlocks * SKEIN_512_BLOCK_BYTES;
+            msgByteCnt -= fullBlocks * SKEIN_512_BLOCK_BYTES;
+        }
+        Skein_assert(ctx->h.bCnt == 0);
+    }
+
+    /* stash whatever is left for a later call */
+    if (msgByteCnt != 0)
+    {
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
+        memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+    }
+
+    return SKEIN_SUCCESS;
+}
+   
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result */
+/*
+** Skein_512_Final: process the last (zero padded) buffered block with the
+** FINAL flag set, then produce (hashBitLen+7)/8 output bytes into hashVal by
+** processing counter blocks 0,1,2,... against the saved chaining variables.
+** Returns SKEIN_SUCCESS.
+*/
+static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    size_t counter,offset,chunk,outBytes;
+    u64b_t savedX[SKEIN_512_STATE_WORDS];
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* mark and pad the final message block, then process it */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+    if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
+    {
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+    }
+    Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);
+
+    /* number of bytes the caller gets */
+    outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+    /* b[] now serves as the counter block; savedX is the fixed "key" */
+    memset(ctx->b,0,sizeof(ctx->b));
+    memcpy(savedX,ctx->X,sizeof(savedX));
+
+    for (counter = 0, offset = 0; offset < outBytes; counter++, offset += SKEIN_512_BLOCK_BYTES)
+    {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) counter);
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t));
+
+        /* emit at most one block's worth of bytes per iteration */
+        chunk = outBytes - offset;
+        if (chunk > SKEIN_512_BLOCK_BYTES)
+        {
+            chunk = SKEIN_512_BLOCK_BYTES;
+        }
+        Skein_Put64_LSB_First(hashVal+offset,ctx->X,chunk);
+        Skein_Show_Final(512,&ctx->h,chunk,hashVal+offset);
+
+        /* put the key back for the next counter value */
+        memcpy(ctx->X,savedX,sizeof(savedX));
+    }
+    return SKEIN_SUCCESS;
+}
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+/* Distance in bytes between the addresses of this function and Skein_512_Init. */
+static size_t Skein_512_API_CodeSize(void)
+{
+    u08b_t *codeEnd   = (u08b_t *) Skein_512_API_CodeSize;
+    u08b_t *codeStart = (u08b_t *) Skein_512_Init;
+
+    return codeEnd - codeStart;
+}
+#endif
+
+/* State for the incremental Init/Update/Final wrappers below. The union lets
+** the tweak words be reached through u.h without naming the concrete
+** context type; only a 512-bit context is present in this file. */
+typedef struct
+{
+  uint_t  statebits;                      /* 256, 512, or 1024 */
+  union
+  {
+    Skein_Ctxt_Hdr_t h;                 /* common header "overlay" */
+    Skein_512_Ctxt_t ctx_512;
+  } u;
+}
+hashState;
+
+/* "incremental" hashing API */
+static SkeinHashReturn Init  (hashState *state, int hashbitlen);
+static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen);
+static SkeinHashReturn Final (hashState *state,       SkeinBitSequence *hashval);
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* select the context size and init the context */
+/*
+** Init: set up state for a Skein-512 hash producing hashbitlen output bits.
+**
+** Returns SKEIN_BAD_HASHLEN when hashbitlen is zero or negative. The length
+** is converted to size_t below, so a negative value would otherwise turn
+** into a huge output length and make the output stage write far past the
+** caller's buffer. Otherwise returns the result of Skein_512_Init.
+*/
+static SkeinHashReturn Init(hashState *state, int hashbitlen)
+{
+    if (hashbitlen <= 0)
+    {
+        return SKEIN_BAD_HASHLEN;
+    }
+
+    state->statebits = 64*SKEIN_512_STATE_WORDS;
+    return Skein_512_Init(&state->u.ctx_512,(size_t) hashbitlen);
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* process data to be hashed */
+/*
+** Update: feed databitlen bits of data into the hash.
+** A length that is not a multiple of 8 is handled by padding the last byte
+** (a 1 bit followed by 0 bits after the valid bits) and setting the BIT_PAD
+** tweak flag.
+*/
+static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen)
+{
+  /* once a partial byte has been absorbed, only empty updates are allowed */
+  Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, SKEIN_FAIL);
+
+  Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,SKEIN_FAIL);
+
+  if ((databitlen & 7) != 0)
+  {
+    /* trailing partial byte: data[wholeBytes] holds the last few valid bits */
+    const size_t wholeBytes = databitlen >> 3;
+    const u08b_t padBit     = (u08b_t) (1u << (7 - (databitlen & 7)));
+    /* keep the bits above padBit, set padBit itself, clear everything below */
+    const u08b_t lastByte   = (u08b_t) ((data[wholeBytes] & (0-padBit)) | padBit);
+
+    Skein_512_Update(&state->u.ctx_512,data,wholeBytes);  /* all complete bytes      */
+    Skein_512_Update(&state->u.ctx_512,&lastByte,1);      /* the padded partial byte */
+    Skein_Set_Bit_Pad_Flag(state->u.h);                   /* flag it for the final block */
+
+    return SKEIN_SUCCESS;
+  }
+
+  /* whole number of bytes: hand them straight to the byte-oriented update */
+  return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3);
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize hash computation and output the result (hashbitlen bits) */
+/*
+** Final: finish the hash held in state and write the digest to hashval.
+** Returns the result of Skein_512_Final; with run-time checks enabled, a
+** corrupted statebits value yields SKEIN_FAIL instead.
+*/
+static SkeinHashReturn Final(hashState *state, SkeinBitSequence *hashval)
+{
+  /* the return code must be SKEIN_FAIL: a bare FAIL is not declared anywhere
+     in this API and would not compile once Skein_Assert expands to a return */
+  Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,SKEIN_FAIL);
+  return Skein_512_Final(&state->u.ctx_512,hashval);
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* all-in-one hash function */
+/*
+** skein_hash: one-shot hash of databitlen bits of data into hashval, with a
+** digest of hashbitlen bits. Returns the Init status if initialisation
+** fails, otherwise the Update status (Final's status is not propagated).
+*/
+SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence *data, /* all-in-one call */
+                SkeinDataLength databitlen,SkeinBitSequence *hashval)
+{
+  hashState  state;
+  SkeinHashReturn status = Init(&state,hashbitlen);
+
+  if (status != SKEIN_SUCCESS)
+  {
+    return status;
+  }
+
+  /* these calls do not fail when called properly */
+  status = Update(&state,data,databitlen);
+  Final(&state,hashval);
+
+  return status;
+}
+
+void xmr_skein(const SkeinBitSequence *data, SkeinBitSequence *hashval){
+  #define XMR_HASHBITLEN 256
+  #define XMR_DATABITLEN 1600
+
+  // Init
+  hashState  state;
+  state.statebits = 64*SKEIN_512_STATE_WORDS;
+
+  // Skein_512_Init(&state.u.ctx_512, (size_t)XMR_HASHBITLEN);
+  state.u.ctx_512.h.hashBitLen = XMR_HASHBITLEN;
+  memcpy(state.u.ctx_512.X,SKEIN_512_IV_256,sizeof(state.u.ctx_512.X));
+  Skein_512_Ctxt_t* ctx = &(state.u.ctx_512);
+  Skein_Start_New_Type(ctx,MSG);
+
+  // Update
+  if ((XMR_DATABITLEN & 7) == 0){  /* partial bytes? */
+    Skein_512_Update(&state.u.ctx_512,data,XMR_DATABITLEN >> 3);
+  }else{   /* handle partial final byte */
+    size_t bCnt = (XMR_DATABITLEN >> 3) + 1;                  /* number of bytes to handle (nonzero here!) */
+    u08b_t b,mask;
+
+    mask = (u08b_t) (1u << (7 - (XMR_DATABITLEN & 7)));       /* partial byte bit mask */
+    b    = (u08b_t) ((data[bCnt-1] & (0-mask)) | mask);   /* apply bit padding on final byte */
+
+    Skein_512_Update(&state.u.ctx_512,data,bCnt-1); /* process all but the final byte    */
+    Skein_512_Update(&state.u.ctx_512,&b  ,  1   ); /* process the (masked) partial byte */
+    Skein_Set_Bit_Pad_Flag(state.u.h);                    /* set tweak flag for the final call */
+  }
+
+  // Finalize
+  Skein_512_Final(&state.u.ctx_512, hashval);
+}
--- a/src/crypto/cn/c_skein.h
+++ b/src/crypto/cn/c_skein.h
@@ -0,0 +1,49 @@
+#ifndef _SKEIN_H_
+#define _SKEIN_H_     1
+/**************************************************************************
+**
+** Interface declarations and internal definitions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+***************************************************************************
+** 
+** The following compile-time switches may be defined to control some
+** tradeoffs between speed, code size, error checking, and security.
+**
+** The "default" note explains what happens when the switch is not defined.
+**
+**  SKEIN_DEBUG            -- make callouts from inside Skein code
+**                            to examine/display intermediate values.
+**                            [default: no callouts (no overhead)]
+**
+**  SKEIN_ERR_CHECK        -- how error checking is handled inside Skein
+**                            code. If not defined, most error checking
+**                            is disabled (for performance). If defined,
+**                            the behavior depends on SKEIN_ASSERT:
+**                                SKEIN_ASSERT defined:     use assert() to flag errors
+**                                SKEIN_ASSERT not defined: return an error code
+**                                    (e.g. SKEIN_FAIL) for caller errors and
+**                                    use assert() for internal errors
+**
+***************************************************************************/
+#include "skein_port.h"                      /* get platform-specific definitions */
+
+/* Status codes returned by the Skein entry points; zero means success. */
+typedef enum
+{
+  SKEIN_SUCCESS         =      0,          /* return codes from Skein calls */
+  SKEIN_FAIL            =      1,
+  SKEIN_BAD_HASHLEN     =      2
+}
+SkeinHashReturn;
+
+typedef size_t   SkeinDataLength;                /* bit count  type */
+typedef u08b_t   SkeinBitSequence;               /* bit stream type */
+
+/* "all-in-one" call */
+/* hashbitlen: digest size in bits; data/databitlen: message and its length in
+   bits; hashval: receives (hashbitlen+7)/8 bytes of digest */
+SkeinHashReturn skein_hash(int hashbitlen,   const SkeinBitSequence *data,
+        SkeinDataLength databitlen, SkeinBitSequence *hashval);
+
+/* fixed-size variant: hashes exactly 1600 bits (200 bytes) of data into a
+   256-bit (32-byte) digest written to hashval */
+void xmr_skein(const SkeinBitSequence *data, SkeinBitSequence *hashval);
+
+#endif  /* ifndef _SKEIN_H_ */
--- a/src/crypto/cn/gpu/cn_gpu_arm.cpp
+++ b/src/crypto/cn/gpu/cn_gpu_arm.cpp
@@ -0,0 +1,240 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2019 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include <arm_neon.h>
+
+
+#include "crypto/CryptoNight_constants.h"
+
+
+inline void vandq_f32(float32x4_t &v, uint32_t v2)
+{   // In place: AND the raw 32-bit pattern of every float lane of v with the scalar mask v2.
+    uint32x4_t vc = vdupq_n_u32(v2);                 // broadcast the mask to all four lanes
+    v = (float32x4_t)vandq_u32((uint32x4_t)v, vc);   // the vector casts reinterpret the bits, they do not convert values
+}
+
+
+inline void vorq_f32(float32x4_t &v, uint32_t v2)
+{   // In place: OR the raw 32-bit pattern of every float lane of v with the scalar mask v2.
+    uint32x4_t vc = vdupq_n_u32(v2);                 // broadcast the mask to all four lanes
+    v = (float32x4_t)vorrq_u32((uint32x4_t)v, vc);   // the vector casts reinterpret the bits, they do not convert values
+}
+
+
+template <size_t v>
+inline void vrot_si32(int32x4_t &r)
+{   // In place: rotate the 16 bytes of r by v byte positions (byte v of the input becomes byte 0).
+    r = (int32x4_t)vextq_s8((int8x16_t)r, (int8x16_t)r, v);   // extracting from r concatenated with itself wraps the bytes around
+}
+
+template <>
+inline void vrot_si32<0>(int32x4_t &r)
+{   // Rotation by zero bytes: r is left untouched.
+}
+
+
+inline uint32_t vheor_s32(const int32x4_t &v)
+{   // Horizontal XOR: returns lane0 ^ lane1 ^ lane2 ^ lane3 of v as an unsigned 32-bit value.
+    int32x4_t v0 = veorq_s32(v, vrev64q_s32(v));                    // XOR each lane with its neighbour inside the same 64-bit half
+    int32x2_t vf = veor_s32(vget_high_s32(v0), vget_low_s32(v0));   // fold the upper half onto the lower half
+    return (uint32_t)vget_lane_s32(vf, 0);
+}
+
+
+inline void prep_dv(int32_t *idx, int32x4_t &v, float32x4_t &n)
+{   // Load four int32 values from idx into v and store their float conversion in n.
+    v = vld1q_s32(idx);
+    n = vcvtq_f32_s32(v);
+}
+
+
+inline void sub_round(const float32x4_t &n0, const float32x4_t &n1, const float32x4_t &n2, const float32x4_t &n3, const float32x4_t &rnd_c, float32x4_t &n, float32x4_t &d, float32x4_t &c)
+{   // One mixing step: adds a term to the accumulators n and d, then feeds both terms back into c.
+    float32x4_t ln1 = vaddq_f32(n1, c);
+    float32x4_t nn = vmulq_f32(n0, c);
+    nn = vmulq_f32(ln1, vmulq_f32(nn, nn));   // nn = (n1 + c) * (n0 * c)^2
+    vandq_f32(nn, 0xFEFFFFFF);                // clear bit 24 ...
+    vorq_f32(nn, 0x00800000);                 // ... and set bit 23: the two lowest exponent bits become 01
+    n = vaddq_f32(n, nn);
+
+    float32x4_t ln3 = vsubq_f32(n3, c);
+    float32x4_t dd = vmulq_f32(n2, c);
+    dd = vmulq_f32(ln3, vmulq_f32(dd, dd));   // dd = (n3 - c) * (n2 * c)^2
+    vandq_f32(dd, 0xFEFFFFFF);                // same exponent fix-up as applied to nn
+    vorq_f32(dd, 0x00800000);
+    d = vaddq_f32(d, dd);
+
+    //Constant feedback
+    c = vaddq_f32(c, rnd_c);
+    c = vaddq_f32(c, vdupq_n_f32(0.734375f));
+    float32x4_t r = vaddq_f32(nn, dd);
+    vandq_f32(r, 0x807FFFFF);                 // keep sign and mantissa, drop the exponent ...
+    vorq_f32(r, 0x40000000);                  // ... then force exponent field 128, so 2.0 <= abs(r) < 4.0
+    c = vaddq_f32(c, r);
+}
+
+
+inline void round_compute(const float32x4_t &n0, const float32x4_t &n1, const float32x4_t &n2, const float32x4_t &n3, const float32x4_t &rnd_c, float32x4_t &c, float32x4_t &r)
+{   // Runs eight sub_round() steps over different orderings of n0..n3, then adds n / d to r; c is updated in place.
+    float32x4_t n = vdupq_n_f32(0.0f), d = vdupq_n_f32(0.0f);
+
+    sub_round(n0, n1, n2, n3, rnd_c, n, d, c);   // four cyclic rotations of (n0, n1, n2, n3) ...
+    sub_round(n1, n2, n3, n0, rnd_c, n, d, c);
+    sub_round(n2, n3, n0, n1, rnd_c, n, d, c);
+    sub_round(n3, n0, n1, n2, rnd_c, n, d, c);
+    sub_round(n3, n2, n1, n0, rnd_c, n, d, c);   // ... followed by four rotations of the reversed order
+    sub_round(n2, n1, n0, n3, rnd_c, n, d, c);
+    sub_round(n1, n0, n3, n2, rnd_c, n, d, c);
+    sub_round(n0, n3, n2, n1, rnd_c, n, d, c);
+
+    // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
+    vandq_f32(d, 0xFF7FFFFF);   // clear the lowest exponent bit (bit 23)
+    vorq_f32(d, 0x40000000);    // set the highest exponent bit (bit 30)
+    r = vaddq_f32(r, vdivq_f32(n, d));
+}
+
+
+// 112×4 = 448  -- NOTE(review): looks like a running count of float operations; confirm
+template <bool add>
+inline int32x4_t single_compute(const float32x4_t &n0, const float32x4_t &n1, const float32x4_t &n2, const float32x4_t &n3, float cnt, const float32x4_t &rnd_c, float32x4_t &sum)
+{   // Four round_compute() passes seeded with cnt; folds the result into sum and returns it scaled and converted to int32.
+    float32x4_t c = vdupq_n_f32(cnt);
+    float32x4_t r = vdupq_n_f32(0.0f);
+
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+
+    // do a quick fmod by setting exp to 2
+    vandq_f32(r, 0x807FFFFF);   // keep sign and mantissa only
+    vorq_f32(r, 0x40000000);    // exponent field becomes 128, so 2.0 <= abs(r) < 4.0
+
+    if (add) {   // template flag: accumulate into sum ...
+        sum = vaddq_f32(sum, r);
+    } else {     // ... or overwrite it
+        sum = r;
+    }
+
+    const float32x4_t cc2 = vdupq_n_f32(536870880.0f);   // 4 * 536870880 is still below 2^31, so the int32 conversion below cannot overflow
+    r = vmulq_f32(r, cc2); // 35
+    return vcvtq_s32_f32(r);   // float -> int32, rounding toward zero
+}
+
+
+template<size_t rot>
+inline void single_compute_wrap(const float32x4_t &n0, const float32x4_t &n1, const float32x4_t &n2, const float32x4_t &n3, float cnt, const float32x4_t &rnd_c, float32x4_t &sum, int32x4_t &out)
+{   // XORs into out the single_compute() result rotated by rot bytes.
+    int32x4_t r = single_compute<rot % 2 != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);   // odd rot accumulates into sum, even rot overwrites it
+    vrot_si32<rot>(r);
+    out = veorq_s32(out, r);
+}
+
+
+template<uint32_t MASK>   // Returns the address lpad + (idx & MASK) + n * 16, i.e. 16-byte slot n after the masked byte offset.
+inline int32_t *scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<int32_t *>(lpad + (idx & MASK) + n * 16); }
+
+
+template<size_t ITER, uint32_t MASK>
+void cn_gpu_inner_arm(const uint8_t *spad, uint8_t *lpad)
+{   // ITER rounds of float mixing over lpad: each round rewrites four 16-byte slots and derives the next offset from the results.
+    uint32_t s = reinterpret_cast<const uint32_t*>(spad)[0] >> 8;   // starting offset: first 32-bit word of spad, shifted right by 8
+    int32_t *idx0 = scratchpad_ptr<MASK>(lpad, s, 0);
+    int32_t *idx1 = scratchpad_ptr<MASK>(lpad, s, 1);
+    int32_t *idx2 = scratchpad_ptr<MASK>(lpad, s, 2);
+    int32_t *idx3 = scratchpad_ptr<MASK>(lpad, s, 3);
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+
+    for (size_t i = 0; i < ITER; i++) {
+        float32x4_t n0, n1, n2, n3;
+        int32x4_t v0, v1, v2, v3;
+        float32x4_t suma, sumb, sum1, sum2, sum3;
+
+        prep_dv(idx0, v0, n0);
+        prep_dv(idx1, v1, n1);
+        prep_dv(idx2, v2, n2);
+        prep_dv(idx3, v3, n3);
+        float32x4_t rc = sum0;   // the previous round's sum is this round's rnd_c (zero in the first round)
+
+        int32x4_t out, out2;
+        out = vdupq_n_s32(0);
+        single_compute_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out);   // <0> and <2> overwrite suma / sumb ...
+        single_compute_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out);   // ... <1> and <3> add to them
+        single_compute_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out);
+        single_compute_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out);
+        sum0 = vaddq_f32(suma, sumb);
+        vst1q_s32(idx0, veorq_s32(v0, out));   // write slot 0 back, XORed with the mixed value
+        out2 = out;                            // out2 collects the XOR of all four slot results
+
+        out = vdupq_n_s32(0);
+        single_compute_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out);
+        single_compute_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out);
+        single_compute_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out);
+        single_compute_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out);
+        sum1 = vaddq_f32(suma, sumb);
+        vst1q_s32(idx1, veorq_s32(v1, out));
+        out2 = veorq_s32(out2, out);
+
+        out = vdupq_n_s32(0);
+        single_compute_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out);
+        single_compute_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out);
+        single_compute_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out);
+        single_compute_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out);
+        sum2 = vaddq_f32(suma, sumb);
+        vst1q_s32(idx2, veorq_s32(v2, out));
+        out2 = veorq_s32(out2, out);
+
+        out = vdupq_n_s32(0);
+        single_compute_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out);
+        single_compute_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out);
+        single_compute_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out);
+        single_compute_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out);
+        sum3 = vaddq_f32(suma, sumb);
+        vst1q_s32(idx3, veorq_s32(v3, out));
+        out2 = veorq_s32(out2, out);
+
+        sum0 = vaddq_f32(sum0, sum1);   // total of the sixteen per-call values, each with magnitude below 4
+        sum2 = vaddq_f32(sum2, sum3);
+        sum0 = vaddq_f32(sum0, sum2);
+
+        const float32x4_t cc1 = vdupq_n_f32(16777216.0f);
+        const float32x4_t cc2 = vdupq_n_f32(64.0f);
+        vandq_f32(sum0, 0x7fffffff); // take abs(va) by masking the float sign bit
+        // vs range 0 - 64
+        n0 = vmulq_f32(sum0, cc1);
+        v0 = vcvtq_s32_f32(n0);
+        v0 = veorq_s32(v0, out2);
+        uint32_t n = vheor_s32(v0);   // XOR of all four lanes gives the next offset (masked by scratchpad_ptr)
+
+        // vs is now between 0 and 1
+        sum0 = vdivq_f32(sum0, cc2);
+        idx0 = scratchpad_ptr<MASK>(lpad, n, 0);
+        idx1 = scratchpad_ptr<MASK>(lpad, n, 1);
+        idx2 = scratchpad_ptr<MASK>(lpad, n, 2);
+        idx3 = scratchpad_ptr<MASK>(lpad, n, 3);
+    }
+}
+
+template void cn_gpu_inner_arm<xmrig::CRYPTONIGHT_GPU_ITER, xmrig::CRYPTONIGHT_GPU_MASK>(const uint8_t* spad, uint8_t* lpad);
--- a/src/crypto/cn/gpu/cn_gpu_avx.cpp
+++ b/src/crypto/cn/gpu/cn_gpu_avx.cpp
@@ -0,0 +1,209 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2019 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "crypto/cn/CryptoNight_constants.h"
+
+#ifdef __GNUC__
+#   include <x86intrin.h>
+#else
+#   include <intrin.h>
+#   define __restrict__ __restrict
+#endif
+#ifndef _mm256_bslli_epi128
+	#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
+#endif
+#ifndef _mm256_bsrli_epi128
+	#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
+#endif
+
+inline void prep_dv_avx(__m256i* idx, __m256i& v, __m256& n01)
+{   // Load eight int32 values from idx into v and store their float conversion in n01.
+    v = _mm256_load_si256(idx);   // aligned load: idx has to be 32-byte aligned
+    n01 = _mm256_cvtepi32_ps(v);
+}
+
+inline __m256 fma_break(const __m256& x) 
+{   // Returns x with bit 24 cleared and bit 23 set in every lane (the two lowest exponent bits).
+    // Break the dependency chain by setting the exp to ?????01 
+    __m256 xx = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFEFFFFFF)), x); 
+    return _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x00800000)), xx); 
+}
+
+// 14  -- NOTE(review): equals the number of add/sub/mul calls in this function; presumably an operation count, confirm
+inline void sub_round(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& n, __m256& d, __m256& c)
+{   // One mixing step: adds a term to the accumulators n and d, then feeds both terms back into c.
+    __m256 nn = _mm256_mul_ps(n0, c);
+    nn = _mm256_mul_ps(_mm256_add_ps(n1, c), _mm256_mul_ps(nn, nn));   // nn = (n1 + c) * (n0 * c)^2
+    nn = fma_break(nn);
+    n = _mm256_add_ps(n, nn);
+
+    __m256 dd = _mm256_mul_ps(n2, c);
+    dd = _mm256_mul_ps(_mm256_sub_ps(n3, c), _mm256_mul_ps(dd, dd));   // dd = (n3 - c) * (n2 * c)^2
+    dd = fma_break(dd);
+    d = _mm256_add_ps(d, dd);
+
+    //Constant feedback
+    c = _mm256_add_ps(c, rnd_c);
+    c = _mm256_add_ps(c, _mm256_set1_ps(0.734375f));
+    __m256 r = _mm256_add_ps(nn, dd);
+    r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r);   // keep sign and mantissa, drop the exponent ...
+    r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r);    // ... then force exponent field 128, so 2.0 <= abs(r) < 4.0
+    c = _mm256_add_ps(c, r);
+}
+
+// 14*8 + 2 = 112  -- NOTE(review): 14*8 + 2 evaluates to 114; confirm which figure is intended
+inline void round_compute(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& c, __m256& r)
+{   // Runs eight sub_round() steps over different orderings of n0..n3, then adds n / d to r; c is updated in place.
+    __m256 n = _mm256_setzero_ps(), d = _mm256_setzero_ps();
+
+    sub_round(n0, n1, n2, n3, rnd_c, n, d, c);   // four cyclic rotations of (n0, n1, n2, n3) ...
+    sub_round(n1, n2, n3, n0, rnd_c, n, d, c);
+    sub_round(n2, n3, n0, n1, rnd_c, n, d, c);
+    sub_round(n3, n0, n1, n2, rnd_c, n, d, c);
+    sub_round(n3, n2, n1, n0, rnd_c, n, d, c);   // ... followed by four rotations of the reversed order
+    sub_round(n2, n1, n0, n3, rnd_c, n, d, c);
+    sub_round(n1, n0, n3, n2, rnd_c, n, d, c);
+    sub_round(n0, n3, n2, n1, rnd_c, n, d, c);
+
+    // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
+    d = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFF7FFFFF)), d);   // clear the lowest exponent bit (bit 23)
+    d = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), d);    // set the highest exponent bit (bit 30)
+    r = _mm256_add_ps(r, _mm256_div_ps(n, d));
+}
+
+// 112×4 = 448  -- NOTE(review): looks like a running count of float operations; confirm
+template <bool add>
+inline __m256i double_compute(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3,
+                              float lcnt, float hcnt, const __m256& rnd_c, __m256& sum)
+{   // Two independent computations at once: the low 128-bit lane is seeded with lcnt, the high lane with hcnt.
+    __m256 c = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_set1_ps(lcnt)), _mm_set1_ps(hcnt), 1);
+    __m256 r = _mm256_setzero_ps();
+
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+
+    // do a quick fmod by setting exp to 2
+    r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r);   // keep sign and mantissa only
+    r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r);    // exponent field becomes 128, so 2.0 <= abs(r) < 4.0
+
+    if(add)   // template flag: accumulate into sum, otherwise overwrite it
+        sum = _mm256_add_ps(sum, r);
+    else
+        sum = r;
+
+    r = _mm256_mul_ps(r, _mm256_set1_ps(536870880.0f)); // 35
+    return _mm256_cvttps_epi32(r);   // float -> int32 with truncation
+}
+
+template <size_t rot>
+inline void double_compute_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3,
+                                float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out)
+{   // XORs into out the double_compute() result, byte-rotated by rot within each 128-bit lane.
+    __m256i r = double_compute<rot % 2 != 0>(n0, n1, n2, n3, lcnt, hcnt, rnd_c, sum);   // odd rot accumulates into sum, even rot overwrites it
+    if(rot != 0)   // rotate = (shift left by 16 - rot bytes) OR (shift right by rot bytes), per 128-bit lane
+        r = _mm256_or_si256(_mm256_bslli_epi128(r, 16 - rot), _mm256_bsrli_epi128(r, rot));
+
+    out = _mm256_xor_si256(out, r);
+}
+
+template<uint32_t MASK>   // Returns the address lpad + (idx & MASK) + n * 16 as a __m256i pointer (n counts 16-byte slots).
+inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<__m256i*>(lpad + (idx & MASK) + n*16); }
+
+template<size_t ITER, uint32_t MASK>
+void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad)
+{   // ITER rounds of float mixing over lpad, handling two 16-byte slots per 256-bit register.
+    uint32_t s = reinterpret_cast<const uint32_t*>(spad)[0] >> 8;   // starting offset: first 32-bit word of spad, shifted right by 8
+    __m256i* idx0 = scratchpad_ptr<MASK>(lpad, s, 0);               // covers slots 0 and 1
+    __m256i* idx2 = scratchpad_ptr<MASK>(lpad, s, 2);               // covers slots 2 and 3
+    __m256 sum0 = _mm256_setzero_ps();
+
+    for(size_t i = 0; i < ITER; i++)
+    {
+        __m256i v01, v23;
+        __m256 suma, sumb, sum1;
+        __m256 rc = sum0;   // the previous round's sum is this round's rnd_c (zero in the first round)
+
+        __m256 n01, n23;
+        prep_dv_avx(idx0, v01, n01);
+        prep_dv_avx(idx2, v23, n23);
+        
+        __m256i out, out2;
+        __m256 n10, n22, n33;
+        n10 = _mm256_permute2f128_ps(n01, n01, 0x01);   // halves of n01 swapped
+        n22 = _mm256_permute2f128_ps(n23, n23, 0x00);   // low half of n23 in both lanes
+        n33 = _mm256_permute2f128_ps(n23, n23, 0x11);   // high half of n23 in both lanes
+        
+        out = _mm256_setzero_si256();
+        double_compute_wrap<0>(n01, n10, n22, n33, 1.3437500f, 1.4296875f, rc, suma, out);   // <0> and <2> overwrite suma / sumb ...
+        double_compute_wrap<1>(n01, n22, n33, n10, 1.2812500f, 1.3984375f, rc, suma, out);   // ... <1> and <3> add to them
+        double_compute_wrap<2>(n01, n33, n10, n22, 1.3593750f, 1.3828125f, rc, sumb, out);
+        double_compute_wrap<3>(n01, n33, n22, n10, 1.3671875f, 1.3046875f, rc, sumb, out);
+        _mm256_store_si256(idx0, _mm256_xor_si256(v01, out));   // write slots 0 and 1 back, XORed with the mixed value
+        sum0 = _mm256_add_ps(suma, sumb);
+        out2 = out;
+        
+        __m256 n11, n02, n30;
+        n11 = _mm256_permute2f128_ps(n01, n01, 0x11);   // high half of n01 in both lanes
+        n02 = _mm256_permute2f128_ps(n01, n23, 0x20);   // low half of n01, then low half of n23
+        n30 = _mm256_permute2f128_ps(n01, n23, 0x03);   // high half of n23, then low half of n01
+
+        out = _mm256_setzero_si256();
+        double_compute_wrap<0>(n23, n11, n02, n30, 1.4140625f, 1.3203125f, rc, suma, out);
+        double_compute_wrap<1>(n23, n02, n30, n11, 1.2734375f, 1.3515625f, rc, suma, out);
+        double_compute_wrap<2>(n23, n30, n11, n02, 1.2578125f, 1.3359375f, rc, sumb, out);
+        double_compute_wrap<3>(n23, n30, n02, n11, 1.2890625f, 1.4609375f, rc, sumb, out);
+        _mm256_store_si256(idx2, _mm256_xor_si256(v23, out));   // write slots 2 and 3 back
+        sum1 = _mm256_add_ps(suma, sumb);
+
+        out2 = _mm256_xor_si256(out2, out);
+        out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2,out2,0x41), out2);   // XOR the two 128-bit halves together
+        suma = _mm256_permute2f128_ps(sum0, sum1, 0x30);   // low half of sum0, high half of sum1
+        sumb = _mm256_permute2f128_ps(sum0, sum1, 0x21);   // high half of sum0, low half of sum1
+        sum0 = _mm256_add_ps(suma, sumb);
+        sum0 = _mm256_add_ps(sum0, _mm256_permute2f128_ps(sum0, sum0, 0x41));   // add the swapped halves: both lanes now hold the full total
+
+        // Clear the high 128 bits
+        __m128 sum = _mm256_castps256_ps128(sum0);
+
+        sum = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum); // take abs(va) by masking the float sign bit
+        // vs range 0 - 64 
+        __m128i v0 = _mm_cvttps_epi32(_mm_mul_ps(sum, _mm_set1_ps(16777216.0f)));
+        v0 = _mm_xor_si128(v0, _mm256_castsi256_si128(out2));
+        __m128i v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3));   // lanes reversed
+        v0 = _mm_xor_si128(v0, v1);
+        v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1));
+        v0 = _mm_xor_si128(v0, v1);                                    // lane 0 now holds the XOR of all four lanes
+
+        // vs is now between 0 and 1
+        sum = _mm_div_ps(sum, _mm_set1_ps(64.0f));
+        sum0 = _mm256_insertf128_ps(_mm256_castps128_ps256(sum), sum, 1);   // duplicate the 128-bit sum into both lanes
+        uint32_t n = _mm_cvtsi128_si32(v0);                                 // next offset (masked by scratchpad_ptr)
+        idx0 = scratchpad_ptr<MASK>(lpad, n, 0);
+        idx2 = scratchpad_ptr<MASK>(lpad, n, 2);
+    }
+}
+
+template void cn_gpu_inner_avx<xmrig::CRYPTONIGHT_GPU_ITER, xmrig::CRYPTONIGHT_GPU_MASK>(const uint8_t* spad, uint8_t* lpad);
--- a/src/crypto/cn/gpu/cn_gpu_ssse3.cpp
+++ b/src/crypto/cn/gpu/cn_gpu_ssse3.cpp
@@ -0,0 +1,210 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2019 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "crypto/cn/CryptoNight_constants.h"
+
+#ifdef __GNUC__
+#   include <x86intrin.h>
+#else
+#   include <intrin.h>
+#   define __restrict__ __restrict
+#endif
+
+inline void prep_dv(__m128i* idx, __m128i& v, __m128& n)
+{   // Load four int32 values from idx into v and store their float conversion in n.
+    v = _mm_load_si128(idx);   // aligned load: idx has to be 16-byte aligned
+    n = _mm_cvtepi32_ps(v);
+}
+
+inline __m128 fma_break(__m128 x) 
+{   // Returns x with bit 24 cleared and bit 23 set in every lane (the two lowest exponent bits).
+    // Break the dependency chain by setting the exp to ?????01 
+    x = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFEFFFFFF)), x); 
+    return _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x00800000)), x); 
+}
+
+// 14  -- NOTE(review): equals the number of add/sub/mul calls in this function; presumably an operation count, confirm
+inline void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c)
+{   // One mixing step: adds a term to the accumulators n and d, then feeds both terms back into c.
+    n1 = _mm_add_ps(n1, c);                    // n1 is a by-value copy, the caller's vector is not modified
+    __m128 nn = _mm_mul_ps(n0, c);
+    nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn));    // nn = (n1 + c) * (n0 * c)^2
+    nn = fma_break(nn);
+    n = _mm_add_ps(n, nn);
+
+    n3 = _mm_sub_ps(n3, c);                    // n3 is a by-value copy as well
+    __m128 dd = _mm_mul_ps(n2, c);
+    dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd));    // dd = (n3 - c) * (n2 * c)^2
+    dd = fma_break(dd);
+    d = _mm_add_ps(d, dd);
+
+    //Constant feedback
+    c = _mm_add_ps(c, rnd_c);
+    c = _mm_add_ps(c, _mm_set1_ps(0.734375f));
+    __m128 r = _mm_add_ps(nn, dd);
+    r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r);   // keep sign and mantissa, drop the exponent ...
+    r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r);    // ... then force exponent field 128, so 2.0 <= abs(r) < 4.0
+    c = _mm_add_ps(c, r);
+}
+
+// 14*8 + 2 = 112  -- NOTE(review): 14*8 + 2 evaluates to 114; confirm which figure is intended
+inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r)
+{   // Runs eight sub_round() steps over different orderings of n0..n3, then adds n / d to r; c is updated in place.
+    __m128 n = _mm_setzero_ps(), d = _mm_setzero_ps();
+
+    sub_round(n0, n1, n2, n3, rnd_c, n, d, c);   // four cyclic rotations of (n0, n1, n2, n3) ...
+    sub_round(n1, n2, n3, n0, rnd_c, n, d, c);
+    sub_round(n2, n3, n0, n1, rnd_c, n, d, c);
+    sub_round(n3, n0, n1, n2, rnd_c, n, d, c);
+    sub_round(n3, n2, n1, n0, rnd_c, n, d, c);   // ... followed by four rotations of the reversed order
+    sub_round(n2, n1, n0, n3, rnd_c, n, d, c);
+    sub_round(n1, n0, n3, n2, rnd_c, n, d, c);
+    sub_round(n0, n3, n2, n1, rnd_c, n, d, c);
+
+    // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
+    d = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFF7FFFFF)), d);   // clear the lowest exponent bit (bit 23)
+    d = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), d);    // set the highest exponent bit (bit 30)
+    r =_mm_add_ps(r, _mm_div_ps(n,d));
+}
+
+// 112×4 = 448  -- NOTE(review): looks like a running count of float operations; confirm
+template<bool add>
+inline __m128i single_compute(__m128 n0, __m128 n1,  __m128 n2,  __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
+{   // Four round_compute() passes seeded with cnt; folds the result into sum and returns it scaled and converted to int32.
+    __m128 c = _mm_set1_ps(cnt);
+    __m128 r = _mm_setzero_ps();
+
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+    round_compute(n0, n1, n2, n3, rnd_c, c, r);
+
+    // do a quick fmod by setting exp to 2
+    r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r);   // keep sign and mantissa only
+    r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r);    // exponent field becomes 128, so 2.0 <= abs(r) < 4.0
+
+    if(add)   // template flag: accumulate into sum, otherwise overwrite it
+        sum = _mm_add_ps(sum, r);
+    else
+        sum = r;
+
+    r = _mm_mul_ps(r, _mm_set1_ps(536870880.0f)); // 35
+    return _mm_cvttps_epi32(r);   // float -> int32 with truncation
+}
+
+template<size_t rot>
+inline void single_compute_wrap(__m128 n0, __m128 n1, __m128 n2,  __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
+{   // XORs into out the single_compute() result rotated by rot bytes.
+    __m128i r = single_compute<rot % 2 != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);   // odd rot accumulates into sum, even rot overwrites it
+    if(rot != 0)   // rotate = (shift left by 16 - rot bytes) OR (shift right by rot bytes)
+        r = _mm_or_si128(_mm_slli_si128(r, 16 - rot), _mm_srli_si128(r, rot));
+    out = _mm_xor_si128(out, r);
+}
+
+template<uint32_t MASK>   // Returns the address lpad + (idx & MASK) + n * 16, i.e. 16-byte slot n after the masked byte offset.
+inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<__m128i*>(lpad + (idx & MASK) + n*16); }
+
+template<size_t ITER, uint32_t MASK>
+void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad)
+{   // ITER rounds of float mixing over lpad: each round rewrites four 16-byte slots and derives the next offset from the results.
+    uint32_t s = reinterpret_cast<const uint32_t*>(spad)[0] >> 8;   // starting offset: first 32-bit word of spad, shifted right by 8
+    __m128i* idx0 = scratchpad_ptr<MASK>(lpad, s, 0);
+    __m128i* idx1 = scratchpad_ptr<MASK>(lpad, s, 1);
+    __m128i* idx2 = scratchpad_ptr<MASK>(lpad, s, 2);
+    __m128i* idx3 = scratchpad_ptr<MASK>(lpad, s, 3);
+    __m128 sum0 = _mm_setzero_ps();
+    
+    for(size_t i = 0; i < ITER; i++)
+    {
+        __m128 n0, n1, n2, n3;
+        __m128i v0, v1, v2, v3;
+        __m128 suma, sumb, sum1, sum2, sum3;
+        
+        prep_dv(idx0, v0, n0);
+        prep_dv(idx1, v1, n1);
+        prep_dv(idx2, v2, n2);
+        prep_dv(idx3, v3, n3);
+        __m128 rc = sum0;   // the previous round's sum is this round's rnd_c (zero in the first round)
+
+        __m128i out, out2;
+        out = _mm_setzero_si128();
+        single_compute_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out);   // <0> and <2> overwrite suma / sumb ...
+        single_compute_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out);   // ... <1> and <3> add to them
+        single_compute_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out);
+        single_compute_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out);
+        sum0 = _mm_add_ps(suma, sumb);
+        _mm_store_si128(idx0, _mm_xor_si128(v0, out));   // write slot 0 back, XORed with the mixed value
+        out2 = out;                                      // out2 collects the XOR of all four slot results
+    
+        out = _mm_setzero_si128();
+        single_compute_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out);
+        single_compute_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out);
+        single_compute_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out);
+        single_compute_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out);
+        sum1 = _mm_add_ps(suma, sumb);
+        _mm_store_si128(idx1, _mm_xor_si128(v1, out));
+        out2 = _mm_xor_si128(out2, out);
+
+        out = _mm_setzero_si128();
+        single_compute_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out);
+        single_compute_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out);
+        single_compute_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out);
+        single_compute_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out);
+        sum2 = _mm_add_ps(suma, sumb);
+        _mm_store_si128(idx2, _mm_xor_si128(v2, out));
+        out2 = _mm_xor_si128(out2, out);
+
+        out = _mm_setzero_si128();
+        single_compute_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out);
+        single_compute_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out);
+        single_compute_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out);
+        single_compute_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out);
+        sum3 = _mm_add_ps(suma, sumb);
+        _mm_store_si128(idx3, _mm_xor_si128(v3, out));
+        out2 = _mm_xor_si128(out2, out);
+        sum0 = _mm_add_ps(sum0, sum1);   // total of the sixteen per-call values, each with magnitude below 4
+        sum2 = _mm_add_ps(sum2, sum3);
+        sum0 = _mm_add_ps(sum0, sum2);
+
+        sum0 = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum0); // take abs(va) by masking the float sign bit
+        // vs range 0 - 64 
+        n0 = _mm_mul_ps(sum0, _mm_set1_ps(16777216.0f));
+        v0 = _mm_cvttps_epi32(n0);
+        v0 = _mm_xor_si128(v0, out2);
+        v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3));   // lanes reversed
+        v0 = _mm_xor_si128(v0, v1);
+        v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1));
+        v0 = _mm_xor_si128(v0, v1);                            // lane 0 now holds the XOR of all four lanes
+
+        // vs is now between 0 and 1
+        sum0 = _mm_div_ps(sum0, _mm_set1_ps(64.0f));
+        uint32_t n = _mm_cvtsi128_si32(v0);   // next offset (masked by scratchpad_ptr)
+        idx0 = scratchpad_ptr<MASK>(lpad, n, 0);
+        idx1 = scratchpad_ptr<MASK>(lpad, n, 1);
+        idx2 = scratchpad_ptr<MASK>(lpad, n, 2);
+        idx3 = scratchpad_ptr<MASK>(lpad, n, 3);
+    }
+}
+
+template void cn_gpu_inner_ssse3<xmrig::CRYPTONIGHT_GPU_ITER, xmrig::CRYPTONIGHT_GPU_MASK>(const uint8_t* spad, uint8_t* lpad);
--- a/src/crypto/cn/groestl_tables.h
+++ b/src/crypto/cn/groestl_tables.h
@@ -0,0 +1,38 @@
+#ifndef __tables_h
+#define __tables_h
+
+
+const uint32_t T[512] = {0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc
+, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5
+, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d
+, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded
+, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1
+, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441
+, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4
+, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba
+, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616
+, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2
+, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c
+, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de
+, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7
+, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e
+, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c
+, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7
+, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b
+, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4
+, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e
+, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a
+, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37
+, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86
+, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b
+, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028
+, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3
+, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94
+, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836
+, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0
+, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2
+, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e
+, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3
+, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e};
+
+#endif /* __tables_h */
--- a/src/crypto/cn/hash.h
+++ b/src/crypto/cn/hash.h
@@ -0,0 +1,5 @@
+#pragma once
+
+/* Common scalar types for hash code that includes this header. */
+typedef unsigned char BitSequence;      /* one byte of data */
+typedef unsigned long long DataLength;  /* unsigned length value; the unit (bits or bytes) is not fixed here -- check each user */
+/* Status codes: SUCCESS is 0, both failure codes are non-zero. */
+typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2} HashReturn;
--- a/src/crypto/cn/r/CryptonightR_gen.cpp
+++ b/src/crypto/cn/r/CryptonightR_gen.cpp
@@ -0,0 +1,188 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <cstring>
+#include "crypto/cn/CryptoNight_monero.h"
+
+// Pointer to a parameterless function. In this file such pointers are only used as
+// addresses of machine-code fragments to copy (see add_code / add_random_math).
+typedef void(*void_func)();
+
+#include "crypto/cn/asm/CryptonightR_template.h"
+#include "crypto/common/VirtualMemory.h"
+#include "Mem.h"
+
+
+// Appends the bytes located in [p1, p2) to the output at p and advances p past them.
+// p1 and p2 are function symbols used purely as address markers. Nothing is copied,
+// and p is left untouched, when p2 does not lie after p1.
+static inline void add_code(uint8_t* &p, void (*p1)(), void (*p2)())
+{
+    const uint8_t* const first = reinterpret_cast<const uint8_t*>(p1);
+    const uint8_t* const last  = reinterpret_cast<const uint8_t*>(p2);
+    const ptrdiff_t size = last - first;
+
+    if (size <= 0) {
+        return;
+    }
+
+    memcpy(p, first, static_cast<size_t>(size));
+    p += size;
+}
+
+// Emits machine code for one generated random-math program at p and advances p.
+//
+//   p                - write cursor into the output buffer (updated)
+//   code             - instruction list; emission stops at the first RET entry
+//   code_size        - not consulted here, the RET terminator ends the loop
+//   instructions     - table of code-fragment addresses; fragment c spans
+//                      [instructions[c], instructions[c + 1])
+//   instructions_mov - table with the same layout; its fragment is emitted ahead of a
+//                      ROR/ROL whose source register differs from the one last prepared
+//   is_64_bit        - selects the 64-bit variants of the ADD constant patch and disables
+//                      the Bulldozer MUL rewrite
+//   ASM              - assembly flavour; only ASM_BULLDOZER changes the output
+static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, xmrig::Assembly ASM)
+{
+    // Source register of the most recent rotation setup, or (uint32_t)(-1) when none is valid
+    uint32_t prev_rot_src = (uint32_t)(-1);
+
+    for (int i = 0;; ++i) {
+        const V4_Instruction inst = code[i];
+        if (inst.opcode == RET) {
+            break;
+        }
+
+        // Rebuild the fragment-table index from the instruction fields. MUL keeps its opcode,
+        // every other opcode is shifted up by 2; src_index 8 is encoded as the dst_index.
+        uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
+        uint8_t dst_index = inst.dst_index;
+        uint8_t src_index = inst.src_index;
+
+        const uint32_t a = inst.dst_index;
+        const uint32_t b = inst.src_index;
+        const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
+
+        switch (inst.opcode) {
+        case ROR:
+        case ROL:
+            // Emit the setup fragment only when the rotation source changed
+            if (b != prev_rot_src) {
+                prev_rot_src = b;
+                add_code(p, instructions_mov[c], instructions_mov[c + 1]);
+            }
+            break;
+        }
+
+        // This instruction overwrites register a, so a cached setup for it is no longer valid
+        if (a == prev_rot_src) {
+            prev_rot_src = (uint32_t)(-1);
+        }
+
+        void_func begin = instructions[c];
+
+        // Must be a comparison: an assignment here would overwrite ASM and apply the
+        // Bulldozer-specific rewrite for every assembly flavour.
+        if ((ASM == xmrig::ASM_BULLDOZER) && (inst.opcode == MUL) && !is_64_bit) {
+            // AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
+            // Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
+            uint8_t* prefix = reinterpret_cast<uint8_t*>(begin);
+
+            if (*prefix == 0x49) {
+                *(p++) = 0x41;
+            }
+
+            begin = reinterpret_cast<void_func>(prefix + 1);
+        }
+
+        add_code(p, begin, instructions[c + 1]);
+
+        if (inst.opcode == ADD) {
+            // Overwrite 4 bytes inside the fragment just emitted with the instruction's constant:
+            // the last 4 bytes in 32-bit mode, or the 4 bytes ending 3 bytes earlier in 64-bit mode.
+            *(uint32_t*)(p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C;
+            if (is_64_bit) {
+                prev_rot_src = (uint32_t)(-1);
+            }
+        }
+    }
+}
+
+// Builds the single-hash CryptonightWOW routine in machine_code:
+// template part1, the generated random math, template part2, template part3.
+// The 4 bytes that end part2 are patched before part3 is appended, and the
+// instruction cache is flushed for the emitted range at the end.
+void wow_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
+{
+    uint8_t* const start = static_cast<uint8_t*>(machine_code);
+    uint8_t* cursor = start;
+
+    add_code(cursor, CryptonightWOW_template_part1, CryptonightWOW_template_part2);
+    add_random_math(cursor, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(cursor, CryptonightWOW_template_part2, CryptonightWOW_template_part3);
+
+    // Stored value = (mainloop - part1) - bytes emitted so far
+    // (presumably the displacement of a jump back to the main loop -- confirm in the template)
+    const ptrdiff_t mainloop_offset = reinterpret_cast<const uint8_t*>(CryptonightWOW_template_mainloop) - reinterpret_cast<const uint8_t*>(CryptonightWOW_template_part1);
+    *reinterpret_cast<int*>(cursor - 4) = static_cast<int>(mainloop_offset - (cursor - start));
+
+    add_code(cursor, CryptonightWOW_template_part3, CryptonightWOW_template_end);
+
+    xmrig::VirtualMemory::flushInstructionCache(machine_code, cursor - start);
+}
+
+// Builds the single-hash CryptonightR routine in machine_code:
+// template part1, the generated random math, template part2, template part3.
+// The 4 bytes that end part2 are patched before part3 is appended, and the
+// instruction cache is flushed for the emitted range at the end.
+void v4_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
+{
+    uint8_t* const start = static_cast<uint8_t*>(machine_code);
+    uint8_t* cursor = start;
+
+    add_code(cursor, CryptonightR_template_part1, CryptonightR_template_part2);
+    add_random_math(cursor, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(cursor, CryptonightR_template_part2, CryptonightR_template_part3);
+
+    // Stored value = (mainloop - part1) - bytes emitted so far
+    // (presumably the displacement of a jump back to the main loop -- confirm in the template)
+    const ptrdiff_t mainloop_offset = reinterpret_cast<const uint8_t*>(CryptonightR_template_mainloop) - reinterpret_cast<const uint8_t*>(CryptonightR_template_part1);
+    *reinterpret_cast<int*>(cursor - 4) = static_cast<int>(mainloop_offset - (cursor - start));
+
+    add_code(cursor, CryptonightR_template_part3, CryptonightR_template_end);
+
+    xmrig::VirtualMemory::flushInstructionCache(machine_code, cursor - start);
+}
+
+// Builds the double-hash CryptonightWOW routine in machine_code:
+// part1, random math, part2, the same random math again, part3, part4.
+// The 4 bytes that end part3 are patched before part4 is appended, and the
+// instruction cache is flushed for the emitted range at the end.
+void wow_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
+{
+    uint8_t* const start = static_cast<uint8_t*>(machine_code);
+    uint8_t* cursor = start;
+
+    add_code(cursor, CryptonightWOW_template_double_part1, CryptonightWOW_template_double_part2);
+    add_random_math(cursor, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(cursor, CryptonightWOW_template_double_part2, CryptonightWOW_template_double_part3);
+    add_random_math(cursor, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(cursor, CryptonightWOW_template_double_part3, CryptonightWOW_template_double_part4);
+
+    // Stored value = (mainloop - part1) - bytes emitted so far
+    // (presumably the displacement of a jump back to the main loop -- confirm in the template)
+    const ptrdiff_t mainloop_offset = reinterpret_cast<const uint8_t*>(CryptonightWOW_template_double_mainloop) - reinterpret_cast<const uint8_t*>(CryptonightWOW_template_double_part1);
+    *reinterpret_cast<int*>(cursor - 4) = static_cast<int>(mainloop_offset - (cursor - start));
+
+    add_code(cursor, CryptonightWOW_template_double_part4, CryptonightWOW_template_double_end);
+
+    xmrig::VirtualMemory::flushInstructionCache(machine_code, cursor - start);
+}
+
+// Builds the double-hash CryptonightR routine in machine_code:
+// part1, random math, part2, the same random math again, part3, part4.
+// The 4 bytes that end part3 are patched before part4 is appended, and the
+// instruction cache is flushed for the emitted range at the end.
+void v4_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
+{
+    uint8_t* const start = static_cast<uint8_t*>(machine_code);
+    uint8_t* cursor = start;
+
+    add_code(cursor, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
+    add_random_math(cursor, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(cursor, CryptonightR_template_double_part2, CryptonightR_template_double_part3);
+    add_random_math(cursor, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(cursor, CryptonightR_template_double_part3, CryptonightR_template_double_part4);
+
+    // Stored value = (mainloop - part1) - bytes emitted so far
+    // (presumably the displacement of a jump back to the main loop -- confirm in the template)
+    const ptrdiff_t mainloop_offset = reinterpret_cast<const uint8_t*>(CryptonightR_template_double_mainloop) - reinterpret_cast<const uint8_t*>(CryptonightR_template_double_part1);
+    *reinterpret_cast<int*>(cursor - 4) = static_cast<int>(mainloop_offset - (cursor - start));
+
+    add_code(cursor, CryptonightR_template_double_part4, CryptonightR_template_double_end);
+
+    xmrig::VirtualMemory::flushInstructionCache(machine_code, cursor - start);
+}
+
+// Builds the soft-AES CryptonightWOW routine in machine_code:
+// template part1, the generated random math, template part2, template part3.
+// The 4 bytes that end part2 are patched before part3 is appended, and the
+// instruction cache is flushed for the emitted range at the end.
+void wow_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
+{
+    uint8_t* const start = static_cast<uint8_t*>(machine_code);
+    uint8_t* cursor = start;
+
+    add_code(cursor, CryptonightWOW_soft_aes_template_part1, CryptonightWOW_soft_aes_template_part2);
+    add_random_math(cursor, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(cursor, CryptonightWOW_soft_aes_template_part2, CryptonightWOW_soft_aes_template_part3);
+
+    // Stored value = (mainloop - part1) - bytes emitted so far
+    // (presumably the displacement of a jump back to the main loop -- confirm in the template)
+    const ptrdiff_t mainloop_offset = reinterpret_cast<const uint8_t*>(CryptonightWOW_soft_aes_template_mainloop) - reinterpret_cast<const uint8_t*>(CryptonightWOW_soft_aes_template_part1);
+    *reinterpret_cast<int*>(cursor - 4) = static_cast<int>(mainloop_offset - (cursor - start));
+
+    add_code(cursor, CryptonightWOW_soft_aes_template_part3, CryptonightWOW_soft_aes_template_end);
+
+    xmrig::VirtualMemory::flushInstructionCache(machine_code, cursor - start);
+}
+
+// Builds the soft-AES CryptonightR routine in machine_code:
+// template part1, the generated random math, template part2, template part3.
+// The 4 bytes that end part2 are patched before part3 is appended, and the
+// instruction cache is flushed for the emitted range at the end.
+void v4_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM)
+{
+    uint8_t* const start = static_cast<uint8_t*>(machine_code);
+    uint8_t* cursor = start;
+
+    add_code(cursor, CryptonightR_soft_aes_template_part1, CryptonightR_soft_aes_template_part2);
+    add_random_math(cursor, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(cursor, CryptonightR_soft_aes_template_part2, CryptonightR_soft_aes_template_part3);
+
+    // Stored value = (mainloop - part1) - bytes emitted so far
+    // (presumably the displacement of a jump back to the main loop -- confirm in the template)
+    const ptrdiff_t mainloop_offset = reinterpret_cast<const uint8_t*>(CryptonightR_soft_aes_template_mainloop) - reinterpret_cast<const uint8_t*>(CryptonightR_soft_aes_template_part1);
+    *reinterpret_cast<int*>(cursor - 4) = static_cast<int>(mainloop_offset - (cursor - start));
+
+    add_code(cursor, CryptonightR_soft_aes_template_part3, CryptonightR_soft_aes_template_end);
+
+    xmrig::VirtualMemory::flushInstructionCache(machine_code, cursor - start);
+}
--- a/src/crypto/cn/r/variant4_random_math.h
+++ b/src/crypto/cn/r/variant4_random_math.h
@@ -0,0 +1,448 @@
+#ifndef VARIANT4_RANDOM_MATH_H
+#define VARIANT4_RANDOM_MATH_H
+
+extern "C"
+{
+    #include "crypto/cn/c_blake256.h"
+}
+
+// Tuning constants of the random program generator
+enum V4_Settings
+{
+	// Minimal theoretical latency of a generated program, in cycles:
+	// 3 cycles per multiplication times 15 multiplications = 45
+	TOTAL_LATENCY = 3 * 15,
+
+	// Lower bound on the number of generated instructions
+	NUM_INSTRUCTIONS_MIN = 60,
+
+	// Upper bound on the number of generated instructions (the final RET is not counted)
+	NUM_INSTRUCTIONS_MAX = 70,
+
+	// ALUs able to execute MUL
+	// Modern CPUs typically have a single ALU that can multiply
+	ALU_COUNT_MUL = 1,
+
+	// ALUs available in total
+	// Modern CPUs have 4, but only 3 are assumed because random math shares the core with the rest of the main loop
+	ALU_COUNT = 3,
+};
+
+// Opcodes of the random math programs ("a" is the destination register, "b" the source)
+enum V4_InstructionList
+{
+	MUL = 0,	// a = a * b
+	ADD = 1,	// a = a + b + C, where C is an unsigned 32-bit constant
+	SUB = 2,	// a = a - b
+	ROR = 3,	// a rotated right by b bits (count taken modulo the register width, i.e. "b & 31" for 32-bit registers)
+	ROL = 4,	// a rotated left by b bits (count taken modulo the register width, i.e. "b & 31" for 32-bit registers)
+	XOR = 5,	// a = a ^ b
+	RET = 6,	// end of program
+	V4_INSTRUCTION_COUNT = RET,	// number of opcodes that perform an operation
+};
+
+// Field widths used to decode one random byte into an instruction:
+// opcode in the lowest bits, dst_index above it, src_index in the remaining high bits.
+// Any random sequence of bytes therefore decodes to valid code.
+//
+// The register file has 9 entries in total:
+// - 4 variable registers (the only possible destinations, hence a 2-bit dst_index)
+// - 5 constant registers initialized from loop variables
+enum V4_InstructionDefinition
+{
+	V4_OPCODE_BITS    = 3,
+	V4_DST_INDEX_BITS = 2,
+	V4_SRC_INDEX_BITS = 3,
+};
+
+// One decoded instruction of a random math program
+struct V4_Instruction
+{
+	uint8_t opcode;		// a V4_InstructionList value
+	uint8_t dst_index;	// destination register; the generator produces 0-3
+	uint8_t src_index;	// source register; the generator produces 0-8
+	uint32_t C;		// constant added by ADD; the generator stores 0 for every other opcode
+};
+
+// FORCEINLINE: the compiler's "always inline" spelling where one is known, plain inline otherwise.
+// An existing definition is left alone.
+#if !defined(FORCEINLINE)
+#  if defined(__GNUC__)
+#    define FORCEINLINE __attribute__((always_inline)) inline
+#  elif defined(_MSC_VER)
+#    define FORCEINLINE __forceinline
+#  else
+#    define FORCEINLINE inline
+#  endif
+#endif
+
+// UNREACHABLE_CODE: tells the optimizer that control never gets here; expands to nothing
+// on compilers without a known intrinsic. An existing definition is left alone.
+#if !defined(UNREACHABLE_CODE)
+#  if defined(__GNUC__)
+#    define UNREACHABLE_CODE __builtin_unreachable()
+#  elif defined(_MSC_VER)
+#    define UNREACHABLE_CODE __assume(false)
+#  else
+#    define UNREACHABLE_CODE
+#  endif
+#endif
+
+// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU:
+// every switch-case will point to the same destination on every iteration of Cryptonight main loop
+//
+// This is about as fast as it can get without using low-level machine code generation
+//
+// code   - program to execute; execution ends at the first RET. At most 70 entries are
+//          looked at, so the function also returns after entry 69 if no RET was met
+// r      - register file indexed by dst_index/src_index of each instruction; modified in place
+// v4_reg - register type; rotation counts are reduced modulo its width in bits.
+//          Presumably an unsigned integer type (the rotations rely on a logical right shift) -
+//          confirm at the instantiation sites
+template<typename v4_reg>
+static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
+{
+	enum
+	{
+		REG_BITS = sizeof(v4_reg) * 8,
+	};
+
+	// V4_EXEC(i) executes instruction number i of the program on the register file.
+	// RET returns from the whole function; any opcode outside V4_InstructionList is declared unreachable.
+	// The inner "% REG_BITS" in the rotations keeps the complementary shift below the register width when shift is 0.
+#define V4_EXEC(i) \
+	{ \
+		const struct V4_Instruction* op = code + i; \
+		const v4_reg src = r[op->src_index]; \
+		v4_reg* dst = r + op->dst_index; \
+		switch (op->opcode) \
+		{ \
+		case MUL: \
+			*dst *= src; \
+			break; \
+		case ADD: \
+			*dst += src + op->C; \
+			break; \
+		case SUB: \
+			*dst -= src; \
+			break; \
+		case ROR: \
+			{ \
+				const uint32_t shift = src % REG_BITS; \
+				*dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \
+			} \
+			break; \
+		case ROL: \
+			{ \
+				const uint32_t shift = src % REG_BITS; \
+				*dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \
+			} \
+			break; \
+		case XOR: \
+			*dst ^= src; \
+			break; \
+		case RET: \
+			return; \
+		default: \
+			UNREACHABLE_CODE; \
+			break; \
+		} \
+	}
+
+	// V4_EXEC_10(j) executes the ten consecutive instructions j .. j+9
+#define V4_EXEC_10(j) \
+	V4_EXEC(j + 0) \
+	V4_EXEC(j + 1) \
+	V4_EXEC(j + 2) \
+	V4_EXEC(j + 3) \
+	V4_EXEC(j + 4) \
+	V4_EXEC(j + 5) \
+	V4_EXEC(j + 6) \
+	V4_EXEC(j + 7) \
+	V4_EXEC(j + 8) \
+	V4_EXEC(j + 9)
+
+	// Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency
+	// I've checked all block heights < 10,000,000 and here is the distribution of program sizes:
+	//
+	// 60      27960
+	// 61      105054
+	// 62      2452759
+	// 63      5115997
+	// 64      1022269
+	// 65      1109635
+	// 66      153145
+	// 67      8550
+	// 68      4529
+	// 69      102
+
+	// Unroll 70 instructions here
+	V4_EXEC_10(0);		// instructions 0-9
+	V4_EXEC_10(10);		// instructions 10-19
+	V4_EXEC_10(20);		// instructions 20-29
+	V4_EXEC_10(30);		// instructions 30-39
+	V4_EXEC_10(40);		// instructions 40-49
+	V4_EXEC_10(50);		// instructions 50-59
+	V4_EXEC_10(60);		// instructions 60-69
+
+	// The helper macros are local to this function
+#undef V4_EXEC_10
+#undef V4_EXEC
+}
+
+// Makes sure at least bytes_needed unread bytes remain in the random data pool.
+// When they do not, the pool is regenerated in place with hash_extra_blake
+// and reading restarts from offset 0.
+static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size)
+{
+	const bool enough_left = (*data_index + bytes_needed <= data_size);
+	if (enough_left)
+	{
+		return;
+	}
+
+	hash_extra_blake(data, data_size, (char*) data);
+	*data_index = 0;
+}
+
+// Generates as many random math operations as possible with given latency and ALU restrictions
+// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions
+//
+// VARIANT - algorithm variant: VARIANT_4 adds a marker byte to the seed (data[20] = -38);
+//           VARIANT_WOW uses register a+4 instead of R8 as the substitute source for
+//           same-register ADD/SUB/XOR and does not require R8 to be used
+// code    - output program; code[return value] holds the terminating RET
+// height  - block height, used as the seed of the random data
+// Returns the number of generated instructions, not counting the final RET
+template<xmrig::Variant VARIANT>
+static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height)
+{
+	// MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle
+	// These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake
+	//
+	// AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors
+	// Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors
+	// AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same
+	// Source: https://www.agner.org/optimize/instruction_tables.pdf
+	const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 };
+
+	// Instruction latencies for theoretical ASIC implementation
+	const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 };
+
+	// Available ALUs for each instruction
+	const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT };
+
+	// Random data pool: zero-filled, then the little-endian block height is copied into its first 8 bytes
+	int8_t data[32];
+	memset(data, 0, sizeof(data));
+	uint64_t tmp = SWAP64LE(height);
+	memcpy(data, &tmp, sizeof(uint64_t));
+	if (VARIANT == xmrig::VARIANT_4)
+	{
+		data[20] = -38;
+	}
+
+	// Set data_index past the last byte in data
+	// to trigger full data update with blake hash
+	// before we start using it
+	size_t data_index = sizeof(data);
+
+	int code_size;
+
+	// There is a small chance (1.8%) that register R8 won't be used in the generated program
+	// So we keep track of it and try again if it's not used
+	bool r8_used;
+	do {
+		int latency[9];
+		int asic_latency[9];
+
+		// Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution
+		// byte 0: current value of the destination register
+		// byte 1: instruction opcode
+		// byte 2: current value of the source register
+		//
+		// Registers R4-R8 are constant and are treated as having the same value because when we do
+		// the same operation twice with two constant source registers, it can be optimized into a single operation
+		uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
+
+		bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT];
+		bool is_rotation[V4_INSTRUCTION_COUNT];
+		bool rotated[4];
+		int rotate_count = 0;
+
+		memset(latency, 0, sizeof(latency));
+		memset(asic_latency, 0, sizeof(asic_latency));
+		memset(alu_busy, 0, sizeof(alu_busy));
+		memset(is_rotation, 0, sizeof(is_rotation));
+		memset(rotated, 0, sizeof(rotated));
+		is_rotation[ROR] = true;
+		is_rotation[ROL] = true;
+
+		int num_retries = 0;
+		code_size = 0;
+
+		int total_iterations = 0;
+		// Starting as "used" for VARIANT_WOW disables the R8 retry condition of the outer loop
+		r8_used = (VARIANT == xmrig::VARIANT_WOW);
+
+		// Generate random code to achieve minimal required latency for our abstract CPU
+		// Try to get this latency for all 4 registers
+		while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64))
+		{
+			// Fail-safe to guarantee loop termination
+			++total_iterations;
+			if (total_iterations > 256)
+				break;
+
+			check_data(&data_index, 1, data, sizeof(data));
+
+			const uint8_t c = ((uint8_t*)data)[data_index++];
+
+			// MUL = opcodes 0-2
+			// ADD = opcode 3
+			// SUB = opcode 4
+			// ROR/ROL = opcode 5, shift direction is selected randomly
+			// XOR = opcodes 6-7
+			uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1);
+			if (opcode == 5)
+			{
+				check_data(&data_index, 1, data, sizeof(data));
+				// data[] is signed, so the sign of the next random byte selects the direction
+				opcode = (data[data_index++] >= 0) ? ROR : ROL;
+			}
+			else if (opcode >= 6)
+			{
+				opcode = XOR;
+			}
+			else
+			{
+				opcode = (opcode <= 2) ? MUL : (opcode - 2);
+			}
+
+			uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1);
+			uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1);
+
+			const int a = dst_index;
+			int b = src_index;
+
+			// Don't do ADD/SUB/XOR with the same register
+			if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
+			{
+				// a is always < 4, so we don't need to check bounds here
+				b = (VARIANT == xmrig::VARIANT_WOW) ? (a + 4) : 8;
+				src_index = b;
+			}
+
+			// Don't do rotation with the same destination twice because it's equal to a single rotation
+			if (is_rotation[opcode] && rotated[a])
+			{
+				continue;
+			}
+
+			// Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized:
+			// 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations
+			// 2xXOR(a, b) = NOP
+			if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16)))
+			{
+				continue;
+			}
+
+			// Find which ALU is available (and when) for this instruction
+			int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b];
+			int alu_index = -1;
+			while (next_latency < TOTAL_LATENCY)
+			{
+				for (int i = op_ALUs[opcode] - 1; i >= 0; --i)
+				{
+					if (!alu_busy[next_latency][i])
+					{
+						// ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check
+						if ((opcode == ADD) && alu_busy[next_latency + 1][i])
+						{
+							continue;
+						}
+
+						// Rotation can only start when previous rotation is finished, so do an additional availability check
+						if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode]))
+						{
+							continue;
+						}
+
+						alu_index = i;
+						break;
+					}
+				}
+				if (alu_index >= 0)
+				{
+					break;
+				}
+				++next_latency;
+			}
+
+			// Don't generate instructions that leave some register unchanged for more than 7 cycles
+			if (next_latency > latency[a] + 7)
+			{
+				continue;
+			}
+
+			next_latency += op_latency[opcode];
+
+			if (next_latency <= TOTAL_LATENCY)
+			{
+				if (is_rotation[opcode])
+				{
+					++rotate_count;
+				}
+
+				// Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined
+				alu_busy[next_latency - op_latency[opcode]][alu_index] = true;
+				latency[a] = next_latency;
+
+				// ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple
+				asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode];
+
+				rotated[a] = is_rotation[opcode];
+
+				inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16);
+
+				code[code_size].opcode = opcode;
+				code[code_size].dst_index = dst_index;
+				code[code_size].src_index = src_index;
+				code[code_size].C = 0;
+
+				if (src_index == 8)
+				{
+					r8_used = true;
+				}
+
+				if (opcode == ADD)
+				{
+					// ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
+					alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true;
+
+					// ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C"
+					check_data(&data_index, sizeof(uint32_t), data, sizeof(data));
+					uint32_t t;
+					memcpy(&t, data + data_index, sizeof(uint32_t));
+					code[code_size].C = SWAP32LE(t);
+					data_index += sizeof(uint32_t);
+				}
+
+				++code_size;
+				if (code_size >= NUM_INSTRUCTIONS_MIN)
+				{
+					break;
+				}
+			}
+			else
+			{
+				++num_retries;
+			}
+		}
+
+		// ASIC has more execution resources and can extract as much parallelism from the code as possible
+		// We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
+		// Get this latency for at least 1 of the 4 registers
+		const int prev_code_size = code_size;
+		while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
+		{
+			// Destination is the register with the lowest ASIC latency, source the one with the highest
+			int min_idx = 0;
+			int max_idx = 0;
+			for (int i = 1; i < 4; ++i)
+			{
+				if (asic_latency[i] < asic_latency[min_idx]) min_idx = i;
+				if (asic_latency[i] > asic_latency[max_idx]) max_idx = i;
+			}
+
+			// Padding instructions cycle through ROR, MUL, MUL
+			const uint8_t pattern[3] = { ROR, MUL, MUL };
+			const uint8_t opcode = pattern[(code_size - prev_code_size) % 3];
+			latency[min_idx] = latency[max_idx] + op_latency[opcode];
+			asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode];
+
+			code[code_size].opcode = opcode;
+			code[code_size].dst_index = min_idx;
+			code[code_size].src_index = max_idx;
+			code[code_size].C = 0;
+			++code_size;
+		}
+
+	// There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
+	// It never does more than 4 iterations for all block heights < 10,000,000
+	}  while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
+
+	// It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here
+	// Add final instruction to stop the interpreter
+	code[code_size].opcode = RET;
+	code[code_size].dst_index = 0;
+	code[code_size].src_index = 0;
+	code[code_size].C = 0;
+
+	return code_size;
+}
+
+#endif
--- a/src/crypto/cn/skein_port.h
+++ b/src/crypto/cn/skein_port.h
@@ -0,0 +1,187 @@
+#ifndef _SKEIN_PORT_H_
+#define _SKEIN_PORT_H_
+
+#include <limits.h>
+#include <stdint.h>
+
+/*  VOID_RETURN / INT_RETURN are return-type prefixes for function declarations.
+    They expand to plain 'void' / 'int' unless DLL_EXPORT or DLL_IMPORT is defined
+    (export/import decorations are added) or the compiler is Watcom (__cdecl is added).
+    The whole group is skipped when RETURN_VALUES is already defined.
+*/
+#ifndef RETURN_VALUES
+#  define RETURN_VALUES
+#  if defined( DLL_EXPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN    __declspec( dllexport ) void __stdcall
+#      define INT_RETURN     __declspec( dllexport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN    __declspec( __dllexport__ ) void
+#      define INT_RETURN     __declspec( __dllexport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( DLL_IMPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN    __declspec( dllimport ) void __stdcall
+#      define INT_RETURN     __declspec( dllimport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN    __declspec( __dllimport__ ) void
+#      define INT_RETURN     __declspec( __dllimport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( __WATCOMC__ )
+#    define VOID_RETURN  void __cdecl
+#    define INT_RETURN   int  __cdecl
+#  else
+#    define VOID_RETURN  void
+#    define INT_RETURN   int
+#  endif
+#endif
+
+/*  These defines are used to declare buffers in a way that allows
+    faster operations on longer variables to be used.  In all these
+    defines 'size' must be a power of 2 and >= 8
+
+    dec_unit_type(size,x)       declares a variable 'x' of length
+                                'size' bits
+
+    dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize'
+                                bytes defined as an array of variables
+                                each of 'size' bits (bsize must be a
+                                multiple of size / 8)
+
+    ptr_cast(x,size)            casts a pointer to a pointer to a
+                                variable of length 'size' bits
+*/
+
+/*  Helper macros for declaring fixed-width units and buffers.
+    'size' is token-pasted into the type name uint<size>_t.
+    In dec_bufr_type both arguments are parenthesized inside the array bound so
+    that an expression argument (e.g. a + b) is divided as a whole.
+*/
+#define ui_type(size)               uint##size##_t
+#define dec_unit_type(size,x)       typedef ui_type(size) x
+#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[(bsize) / ((size) >> 3)]
+#define ptr_cast(x,size)            ((ui_type(size)*)(x))
+
+typedef unsigned int    uint_t;             /* native unsigned integer */
+typedef uint8_t         u08b_t;             /*  8-bit unsigned integer */
+typedef uint64_t        u64b_t;             /* 64-bit unsigned integer */
+
+/* 64-bit left rotation of x by N bits (x is expected to be a 64-bit unsigned value).
+   N must be in 1..63: N == 0 or N == 64 would shift by the full width, which is undefined in C.
+   Both arguments are evaluated twice. A definition provided earlier takes precedence. */
+#ifndef RotL_64
+#define RotL_64(x,N)    (((x) << (N)) | ((x) >> (64-(N))))
+#endif
+
+/*
+ * Skein is "natively" little-endian (unlike SHA-xxx), for optimal
+ * performance on x86 CPUs.  The Skein code requires the following
+ * definitions for dealing with endianness:
+ *
+ *    SKEIN_NEED_SWAP:  0 for little-endian, 1 for big-endian
+ *    Skein_Put64_LSB_First
+ *    Skein_Get64_LSB_First
+ *    Skein_Swap64
+ *
+ * If SKEIN_NEED_SWAP is defined at compile time, it is used here
+ * along with the portable versions of Put64/Get64/Swap64, which
+ * are slow in general.
+ *
+ * Otherwise, an "auto-detect" of endianness is attempted below.
+ * If the default handling doesn't work well, the user may insert
+ * platform-specific code instead (e.g., for big-endian CPUs).
+ *
+ */
+#ifndef SKEIN_NEED_SWAP /* compile-time "override" for endianness? */
+
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+
+/* NOTE(review): BYTE_ORDER, LITTLE_ENDIAN and BIG_ENDIAN are not defined in this header.
+   If the headers included above do not provide them, each evaluates to 0 inside #if,
+   so the first test below is true and little-endian is selected -- confirm this is
+   acceptable for any big-endian target. A PLATFORM_BYTE_ORDER defined beforehand wins. */
+#if BYTE_ORDER == LITTLE_ENDIAN && !defined(PLATFORM_BYTE_ORDER)
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if BYTE_ORDER == BIG_ENDIAN && !defined(PLATFORM_BYTE_ORDER)
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#endif
+
+/* special handler for IA64, which may be either endianness (?)  */
+/* here we assume little-endian, but this may need to be changed */
+#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
+#  define PLATFORM_MUST_ALIGN (1)
+#ifndef PLATFORM_BYTE_ORDER
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+#endif
+
+/* default: unaligned 64-bit accesses are allowed */
+#ifndef   PLATFORM_MUST_ALIGN
+#  define PLATFORM_MUST_ALIGN (0)
+#endif
+
+
+/* Translate the detected byte order into SKEIN_NEED_SWAP. On little-endian platforms
+   without alignment restrictions the Put64/Get64 helpers become plain memcpy calls. */
+#if   PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+    /* here for big-endian CPUs */
+#define SKEIN_NEED_SWAP   (1)
+#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    /* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
+#define SKEIN_NEED_SWAP   (0)
+#if   PLATFORM_MUST_ALIGN == 0              /* ok to use "fast" versions? */
+#define Skein_Put64_LSB_First(dst08,src64,bCnt) memcpy(dst08,src64,bCnt)
+#define Skein_Get64_LSB_First(dst64,src08,wCnt) memcpy(dst64,src08,8*(wCnt))
+#endif
+#else
+#error "Skein needs endianness setting!"
+#endif
+
+#endif /* ifndef SKEIN_NEED_SWAP */
+
+/*
+ ******************************************************************
+ *      Provide any definitions still needed.
+ ******************************************************************
+ */
+#ifndef Skein_Swap64  /* byte reversal when SKEIN_NEED_SWAP is set, identity otherwise */
+#if     SKEIN_NEED_SWAP
+/* Reverses the eight bytes of w64: the byte at bit offset s moves to bit offset 56-s.
+   The argument is evaluated once per byte, so it must be free of side effects. */
+#define Skein_Swap64(w64)                       \
+  ( (((((u64b_t)(w64)) >>56) & 0xFF)      ) |   \
+    (((((u64b_t)(w64)) >>48) & 0xFF) <<  8) |   \
+    (((((u64b_t)(w64)) >>40) & 0xFF) << 16) |   \
+    (((((u64b_t)(w64)) >>32) & 0xFF) << 24) |   \
+    (((((u64b_t)(w64)) >>24) & 0xFF) << 32) |   \
+    (((((u64b_t)(w64)) >>16) & 0xFF) << 40) |   \
+    (((((u64b_t)(w64)) >> 8) & 0xFF) << 48) |   \
+    (( ((u64b_t)(w64))       & 0xFF) << 56) )
+#else
+#define Skein_Swap64(w64)  (w64)
+#endif
+#endif  /* ifndef Skein_Swap64 */
+
+
+#ifndef Skein_Put64_LSB_First
+/* Stores the first bCnt bytes of the 64-bit words in src into dst,
+   least significant byte of each word first. */
+void    Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt)
+#ifdef  SKEIN_PORT_CODE /* instantiate the function code here? */
+    { /* byte-at-a-time version: independent of host endianness, but slow */
+    size_t i;
+
+    for (i = 0; i < bCnt; ++i)
+        {
+        const u64b_t word  = src[i / 8];        /* word that holds output byte i */
+        const size_t shift = 8 * (i % 8);       /* bit offset of that byte inside the word */
+        dst[i] = (u08b_t) (word >> shift);
+        }
+    }
+#else
+    ;    /* output only the function prototype */
+#endif
+#endif   /* ifndef Skein_Put64_LSB_First */
+
+
+#ifndef Skein_Get64_LSB_First
+/* Assembles wCnt 64-bit words in dst from 8*wCnt bytes in src,
+   treating the first byte of each group of eight as the least significant. */
+void    Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt)
+#ifdef  SKEIN_PORT_CODE /* instantiate the function code here? */
+    { /* byte-at-a-time version: independent of host endianness, but slow */
+    size_t pos;
+
+    for (pos = 0; pos < 8*wCnt; pos += 8)
+        {
+        const u08b_t *in = src + pos;
+        u64b_t word = 0;
+        int k;
+
+        /* fold in the bytes from most to least significant */
+        for (k = 7; k >= 0; --k)
+            word = (word << 8) | in[k];
+
+        dst[pos/8] = word;
+        }
+    }
+#else
+    ;    /* output only the function prototype */
+#endif
+#endif   /* ifndef Skein_Get64_LSB_First */
+
+#endif   /* ifndef _SKEIN_PORT_H_ */
--- a/src/crypto/cn/soft_aes.h
+++ b/src/crypto/cn/soft_aes.h
@@ -0,0 +1,146 @@
+/*
+  * This program is free software: you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation, either version 3 of the License, or
+  * any later version.
+  *
+  * This program is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  * GNU General Public License for more details.
+  *
+  * You should have received a copy of the GNU General Public License
+  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  *
+  * Additional permission under GNU GPL version 3 section 7
+  *
+  * If you modify this Program, or any covered work, by linking or combining
+  * it with OpenSSL (or a modified version of that library), containing parts
+  * covered by the terms of OpenSSL License and SSLeay License, the licensors
+  * of this Program grant you additional permission to convey the resulting work.
+  *
+  */
+
+/*
+ * Parts of this file are originally copyright (c) 2014-2017, The Monero Project
+ */
+#pragma once
+
+
+#if defined(XMRIG_ARM)
+#   include "crypto/SSE2NEON.h"
+#elif defined(__GNUC__)
+#   include <x86intrin.h>
+#else
+#   include <intrin.h>
+#endif
+
+#include <inttypes.h>
+
+
+#define saes_data(w) { /* brace-enclosed list of 256 byte constants, each passed through the caller-supplied macro w; saes_sbox and saes_table are both built from it */ \
+    w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
+    w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
+    w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
+    w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
+    w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
+    w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
+    w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
+    w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
+    w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
+    w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
+    w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
+    w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
+    w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
+    w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
+    w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
+    w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
+    w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
+    w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
+    w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
+    w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
+    w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
+    w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
+    w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
+    w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
+    w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
+    w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
+    w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
+    w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
+    w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
+    w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
+    w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
+    w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
+
+#define SAES_WPOLY           0x011b   /* constant XORed in by saes_f2 when bit 7 of its argument is set */
+/* saes_b2w packs four byte values into one 32-bit word: b0 is the low byte, b3 the high byte. */
+#define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
+    ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
+/* saes_f2: x shifted left one bit, XOR SAES_WPOLY if bit 7 of x was set. saes_f3: saes_f2(x) ^ x. saes_h0: identity. Every use of x is parenthesized so expression arguments (e.g. a | b) expand correctly; x is evaluated more than once. */
+#define saes_f2(x)   (((x)<<1) ^ ((((x)>>7) & 1) * SAES_WPOLY))
+#define saes_f3(x)   (saes_f2(x) ^ (x))
+#define saes_h0(x)   (x)
+/* saes_u0..saes_u3 pack { saes_f2(p), p, p, saes_f3(p) } into a word, with the byte order rotated by one position per variant. */
+#define saes_u0(p)   saes_b2w(saes_f2(p),          p,          p, saes_f3(p))
+#define saes_u1(p)   saes_b2w(saes_f3(p), saes_f2(p),          p,          p)
+#define saes_u2(p)   saes_b2w(         p, saes_f3(p), saes_f2(p),          p)
+#define saes_u3(p)   saes_b2w(         p,          p, saes_f3(p), saes_f2(p))
+
+alignas(16) const uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) }; /* four 256-entry word tables: saes_table[k] is saes_data expanded through saes_u<k> */
+alignas(16) const uint8_t  saes_sbox[256] = saes_data(saes_h0); /* the raw saes_data bytes (saes_h0 is the identity expansion) */
+
+static inline __m128i soft_aesenc(const uint32_t* in, __m128i key)
+{
+    /* Table-driven round over the four 32-bit words in[0..3]; the combined lookups are XORed with key. (By its name, a software stand-in for AESENC.) */
+    const uint32_t c0 = in[0], c1 = in[1];
+    const uint32_t c2 = in[2], c3 = in[3];
+
+    /* Output word i mixes byte 0 of word i, byte 1 of word i+1, byte 2 of word i+2 and byte 3 of word i+3 (indices mod 4). */
+    const uint32_t w0 = saes_table[0][c0 & 0xff] ^ saes_table[1][(c1 >> 8) & 0xff] ^ saes_table[2][(c2 >> 16) & 0xff] ^ saes_table[3][c3 >> 24];
+    const uint32_t w1 = saes_table[0][c1 & 0xff] ^ saes_table[1][(c2 >> 8) & 0xff] ^ saes_table[2][(c3 >> 16) & 0xff] ^ saes_table[3][c0 >> 24];
+    const uint32_t w2 = saes_table[0][c2 & 0xff] ^ saes_table[1][(c3 >> 8) & 0xff] ^ saes_table[2][(c0 >> 16) & 0xff] ^ saes_table[3][c1 >> 24];
+    const uint32_t w3 = saes_table[0][c3 & 0xff] ^ saes_table[1][(c0 >> 8) & 0xff] ^ saes_table[2][(c1 >> 16) & 0xff] ^ saes_table[3][c2 >> 24];
+
+    /* _mm_set_epi32 lists the highest 32-bit lane first, so w0 ends up in the lowest lane. */
+    return _mm_xor_si128(_mm_set_epi32(w3, w2, w1, w0), key);
+}
+
+static inline __m128i soft_aesenc(__m128i in, __m128i key)
+{
+    /* Register-input overload: the same table-driven round, with the four 32-bit words taken from the lanes of in. */
+    /* Lane 0 is read directly; lanes 1..3 are first broadcast by a shuffle so they can be read from the low position. */
+    const uint32_t c0 = _mm_cvtsi128_si32(in);
+    const uint32_t c1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
+    const uint32_t c2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA));
+    const uint32_t c3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF));
+
+    const uint32_t w0 = saes_table[0][c0 & 0xff] ^ saes_table[1][(c1 >> 8) & 0xff] ^ saes_table[2][(c2 >> 16) & 0xff] ^ saes_table[3][c3 >> 24];
+    const uint32_t w1 = saes_table[0][c1 & 0xff] ^ saes_table[1][(c2 >> 8) & 0xff] ^ saes_table[2][(c3 >> 16) & 0xff] ^ saes_table[3][c0 >> 24];
+    const uint32_t w2 = saes_table[0][c2 & 0xff] ^ saes_table[1][(c3 >> 8) & 0xff] ^ saes_table[2][(c0 >> 16) & 0xff] ^ saes_table[3][c1 >> 24];
+    const uint32_t w3 = saes_table[0][c3 & 0xff] ^ saes_table[1][(c0 >> 8) & 0xff] ^ saes_table[2][(c1 >> 16) & 0xff] ^ saes_table[3][c2 >> 24];
+
+    return _mm_xor_si128(_mm_set_epi32(w3, w2, w1, w0), key); /* highest lane is listed first, so w0 is lane 0 */
+}
+
+static inline uint32_t sub_word(uint32_t key) /* returns key with each of its four bytes replaced by its saes_sbox entry, byte positions unchanged */
+{   /* each table byte is widened to uint32_t before shifting: as a promoted int, a value >= 0x80 shifted by 24 would land in the sign bit */
+    return (static_cast<uint32_t>(saes_sbox[key >> 24]) << 24) |
+        (static_cast<uint32_t>(saes_sbox[(key >> 16) & 0xff]) << 16) |
+        (static_cast<uint32_t>(saes_sbox[(key >> 8)  & 0xff]) << 8) |
+         static_cast<uint32_t>(saes_sbox[key & 0xff]);
+}
+
+#ifndef HAVE_ROTR /* fallback definition, compiled only when the build does not define HAVE_ROTR */
+static inline uint32_t _rotr(uint32_t value, uint32_t amount) /* returns value rotated right by amount bits, with amount taken modulo 32 */
+{   /* both shift counts are masked to 0..31, so amount == 0 and amount >= 32 never produce an out-of-range shift */
+    return (value >> (amount & 31)) | (value << ((32 - amount) & 31));
+}
+#endif
+
+template<uint8_t rcon>
+static inline __m128i soft_aeskeygenassist(__m128i key)
+{   /* lanes, low to high: sub_word(lane 1), that value rotated right 8 bits ^ rcon, sub_word(lane 3), that value rotated right 8 bits ^ rcon */
+    const uint32_t sub1 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55))), sub3 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)));
+    const uint32_t rot1 = _rotr(sub1, 8) ^ rcon, rot3 = _rotr(sub3, 8) ^ rcon;
+    return _mm_set_epi32(rot3, sub3, rot1, sub1); /* highest lane is listed first */
+}