Fixed macOS build.

Fix compile warnings.
Add renaming ASM codes & update from upstream.
2026-01-23 14:52:52 -05:00 · 2019-03-05 01:07:01 +07:00 · 2019-03-05 00:49:04 +07:00 · 2019-03-05 00:41:01 +07:00 · 2019-03-04 19:25:59 +07:00 · 2019-03-04 13:31:25 +07:00
105 changed files with 15920 additions and 4026 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -0,0 +1,34 @@
+# v0.9.0
+- **[#753](https://github.com/xmrig/xmrig/issues/753) Added new algorithm [CryptoNight variant 2](https://github.com/xmrig/xmrig/issues/753) for Monero fork, thanks [@SChernykh](https://github.com/SChernykh).**
+  - Added option `--asm`, possible values `--asm auto`, `--asm none`, `--asm intel` and `--asm ryzen`.
+- Added support for new-style long and short algorithm names, possible values: `cryptonight`, `cryptonight/0`, `cryptonight/1`, `cryptonight/2`, `cryptonight-lite`, `cryptonight-lite/0`, `cryptonight-lite/1` and short equivalents `cn/2` etc. 
+- Added `--variant`, example `--algo cn --variant 2`; by default the miner automatically detects the proper variant for Monero by block version.  
+- Added CryptoNight-Lite variant 1.
+- Added xmrig-proxy autodetection, nicehash will be enabled automatically. 
+- Added workaround for xmrig-proxy [bug](https://github.com/xmrig/xmrig-proxy/commit/dfa1960fe3eeb13f80717b7dbfcc7c6e9f222d89).
+
+# v0.8.2
+- Fixed L2 cache size detection for AMD CPUs (Bulldozer/Piledriver/Steamroller/Excavator architecture).
+- Fixed gcc 7.1 support.
+
+# v0.8.1
+- Added nicehash support, detected automatically by pool URL, for example `cryptonight.eu.nicehash.com:3355`, or enabled manually via option `--nicehash`.
+
+# v0.8.0
+- Added double hash mode, also known as lower power mode. `--av=2` and `--av=4`.
+- Added smart automatic CPU configuration. Default threads count now depends on size of the L3 cache of CPU.
+- Added CryptoNight-Lite support for AEON `-a cryptonight-lite`.
+- Added `--max-cpu-usage` option for auto CPU configuration mode.
+- Added `--safe` option to adjust threads and algorithm variations to the current CPU.
+- No more manual steps to enable huge pages on Windows. XMRig will do it automatically.
+- Removed BMI2 algorithm variation.
+- Removed default pool URL.
+
+# v0.6.0
+- Added automatic cryptonight self test.
+- New software AES algorithm variation. It will be selected automatically if the CPU does not support AES-NI.
+- Added 32 bit builds.
+- Documented [algorithm variations](https://github.com/xmrig/xmrig#algorithm-variations).
+
+# v0.5.0
+- Initial public release.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,19 +1,28 @@
 cmake_minimum_required(VERSION 3.0)
 project(xmrig C)

+option(WITH_LIBCPUID "Use Libcpuid" ON)
+option(WITH_AEON     "CryptoNight-Lite support" ON)
+option(WITH_ASM      "Enable ASM PoW implementations" ON)
+
 set(HEADERS
-    compat.h
    algo/cryptonight/cryptonight.h
-    elist.h
-    xmrig.h
-    version.h
-    options.h
+    algo/cryptonight/cryptonight_aesni.h
+    algo/cryptonight/cryptonight_monero.h
+    algo/cryptonight/cryptonight_softaes.h
+    algo/cryptonight/cryptonight_test.h
+    algo/cryptonight/variant4_random_math.h
+    compat.h
    cpu.h
-    persistent_memory.h
-    stratum.h
-    stats.h
-    util.h
    donate.h
+    elist.h
+    options.h
+    persistent_memory.h
+    stats.h
+    stratum.h
+    util.h
+    version.h
+    xmrig.h
   )

 set(HEADERS_CRYPTO
@@ -21,9 +30,7 @@ set(HEADERS_CRYPTO
    crypto/c_blake256.h
    crypto/c_jh.h
    crypto/c_skein.h
-    crypto/oaes_lib.h
-    crypto/oaes_config.h
-    crypto/aesb.h
+    crypto/soft_aes.h
   )

 set(HEADERS_COMPAT
@@ -38,10 +45,17 @@ set(HEADERS_UTILS

 set(SOURCES
    xmrig.c
-    algo/cryptonight/cryptonight_common.c
+    algo/cryptonight/cryptonight.c
+    algo/cryptonight/cryptonight_av1.c
+    algo/cryptonight/cryptonight_av2.c
+    algo/cryptonight/cryptonight_av3.c
+    algo/cryptonight/cryptonight_av4.c
+    algo/cryptonight/cryptonight_r_av1.c
+    algo/cryptonight/cryptonight_r_av2.c
+    algo/cryptonight/cryptonight_r_av3.c
+    algo/cryptonight/cryptonight_r_av4.c
    util.c
    options.c
-    cpu.c
    stratum.c
    stats.c
    memory.c
@@ -53,8 +67,6 @@ set(SOURCES_CRYPTO
    crypto/c_blake256.c
    crypto/c_jh.c
    crypto/c_skein.c
-    crypto/oaes_lib.c
-    crypto/aesb.c
   )

 set(SOURCES_UTILS
@@ -63,27 +75,34 @@ set(SOURCES_UTILS
   )

 if (WIN32)
-    set(SOURCES_OS win/cpu_win.c win/memory_win.c win/xmrig_win.c compat/winansi.c)
-    set(EXTRA_LIBS ws2_32)
+    set(SOURCES_OS win/cpu_win.c win/memory_win.c win/xmrig_win.c win/app.rc compat/winansi.c)
+    set(EXTRA_LIBS ws2_32 crypt32)
    add_definitions(/D_WIN32_WINNT=0x600)
+elseif (APPLE)
+    set(SOURCES_OS mac/cpu_mac.c mac/memory_mac.c mac/xmrig_mac.c)
 else()
    set(SOURCES_OS unix/cpu_unix.c unix/memory_unix.c unix/xmrig_unix.c)
-    set(EXTRA_LIBS pthread)
+    set(EXTRA_LIBS pthread rt m)
 endif()

 include_directories(.)
 add_definitions(/DUSE_NATIVE_THREADS)
 add_definitions(/D_GNU_SOURCE)
-add_definitions(/DDEBUG_THREADS)
+add_definitions(/DUNICODE)

 if ("${CMAKE_BUILD_TYPE}" STREQUAL "")
    set(CMAKE_BUILD_TYPE Release)
 endif()

-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -mbmi2")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -Wno-pointer-to-int-cast")
-set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -funroll-loops -fvariable-expansion-in-unroller -ftree-loop-if-convert-stores -fmerge-all-constants -fbranch-target-load-optimize2")
-set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -gdwarf-2")
+
+if (CMAKE_C_COMPILER_ID MATCHES "Clang") # MATCHES also covers AppleClang; Clang only warns about the GCC-only -fvariable-expansion-in-unroller, so it is omitted here
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -s -funroll-loops -fmerge-all-constants")
+else() # every other compiler keeps the full GCC-oriented release flag set
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -s -funroll-loops -fvariable-expansion-in-unroller -ftree-loop-if-convert-stores -fmerge-all-constants -fbranch-target-load-optimize2")
+endif()
+
+#set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -gdwarf-2")
 #set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fprofile-generate")
 #set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fprofile-use -fprofile-correction")

@@ -94,34 +113,44 @@ endif()
 include_directories(compat/jansson)
 add_subdirectory(compat/jansson)

-find_package(CURL REQUIRED)
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")

-if (CURL_FOUND)
-    include_directories(${CURL_INCLUDE_DIRS})
-    add_definitions(/DCURL_STATICLIB)
-    link_directories(${CURL_LIBRARIES})
+find_package(CURL REQUIRED)
+include_directories(${CURL_INCLUDE_DIRS})
+add_definitions(/DCURL_STATICLIB)
+link_directories(${CURL_LIBRARIES})
+
+if (WITH_LIBCPUID)
+    add_subdirectory(compat/libcpuid)
+
+    include_directories(compat/libcpuid)
+    set(CPUID_LIB cpuid)
+    set(SOURCES_CPUID cpu.c)
+else()
+    add_definitions(/DXMRIG_NO_LIBCPUID)
+    set(SOURCES_CPUID cpu_stub.c)
+endif()
+
+include(cmake/asm.cmake)
+
+if (WITH_AEON)
+    set(SOURCES_AEON
+    algo/cryptonight-lite/cryptonight_lite_av1.c
+    algo/cryptonight-lite/cryptonight_lite_av2.c
+    algo/cryptonight-lite/cryptonight_lite_av3.c
+    algo/cryptonight-lite/cryptonight_lite_av4.c
+    algo/cryptonight-lite/cryptonight_lite_aesni.h
+    algo/cryptonight-lite/cryptonight_lite_softaes.h
+    )
+else()
+    add_definitions(/DXMRIG_NO_AEON)
 endif()

 if (CMAKE_SIZEOF_VOID_P EQUAL 8)
-    add_subdirectory(algo/cryptonight/bmi2)
-
-    set(CRYPTONIGHT64
-        algo/cryptonight/cryptonight_av1_aesni.c
-        algo/cryptonight/cryptonight_av2_aesni_wolf.c
-        algo/cryptonight/cryptonight_av4_legacy.c
-        algo/cryptonight/cryptonight_av5_aesni_experimental.c
-    )
-
-    add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${CRYPTONIGHT64})
-    target_link_libraries(xmrig jansson curl cryptonight_av3_aesni_bmi2 ${EXTRA_LIBS})
+    add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON} ${XMRIG_ASM_SOURCES})
+    target_link_libraries(xmrig ${XMRIG_ASM_LIBRARY} jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS})
 else()
-    set(CRYPTONIGHT32
-        algo/cryptonight/cryptonight_av1_aesni32.c
-        algo/cryptonight/cryptonight_av4_legacy.c
-    )
-
-    add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${CRYPTONIGHT32})
-    target_link_libraries(xmrig32 jansson -L${CURL_LIBRARIES} ${EXTRA_LIBS})
+    add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON} ${XMRIG_ASM_SOURCES})
+    target_link_libraries(xmrig32 ${XMRIG_ASM_LIBRARY} jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS})
 endif()

-source_group("HEADERS" FILES ${HEADERS})
--- a/README.md
+++ b/README.md
@@ -2,24 +2,29 @@
 XMRig is high performance Monero (XMR) CPU miner, with the official full Windows support.
 Based on cpuminer-multi with heavy optimizations/rewrites and removing a lot of legacy code.

-<img src="https://i.imgur.com/GhmdK2f.png" width="480">
+<img src="http://i.imgur.com/GdRDnAu.png" width="596" >

 #### Table of contents
 * [Features](#features)
 * [Download](#download)
 * [Usage](#usage)
+* [Algorithm variations](#algorithm-variations)
 * [Build](#build)
 * [Common Issues](#common-issues)
 * [Other information](#other-information)
-* [Donations](#Donations)
+* [Donations](#donations)
+* [Contacts](#contacts)

 ## Features
-* High performance, faster than others (290+ H/s on i7 6700).
+* High performance (290+ H/s on i7 6700).
 * Official Windows support.
-* Small Windows executable, only 350 KB without dependencies.
+* Small Windows executable, only 535 KB without dependencies.
 * Support for backup (failover) mining server.
 * keepalived support.
 * Command line options compatible with cpuminer.
+* CryptoNight-Lite support for AEON.
+* Smart automatic [CPU configuration](https://github.com/xmrig/xmrig/wiki/Threads).
+* Nicehash support.
 * It's open source software.

 ## Download
@@ -30,11 +35,12 @@ Based on cpuminer-multi with heavy optimizations/rewrites and removing a lot of
 ## Usage
 ### Basic example
 ```
-xmrig.exe -o xmr-eu.dwarfpool.com:8005 -b xmr-usa.dwarfpool.com:8005 -u YOUR_WALLET -p x -k
+xmrig.exe -o xmr-eu.dwarfpool.com:8005 -u YOUR_WALLET -p x -k
 ```

 ### Options
 ```
+  -a, --algo=ALGO       cryptonight (default) or cryptonight-lite
  -o, --url=URL         URL of mining server
  -b, --backup-url=URL  URL of backup mining server
  -O, --userpass=U:P    username:password pair for mining server
@@ -50,10 +56,20 @@ xmrig.exe -o xmr-eu.dwarfpool.com:8005 -b xmr-usa.dwarfpool.com:8005 -u YOUR_WAL
      --donate-level=N  donate level, default 5% (5 minutes in 100 minutes)
  -B, --background      run the miner in the background
  -c, --config=FILE     load a JSON-format configuration file
+      --max-cpu-usage=N maximum cpu usage for automatic threads mode (default 75)
+      --safe            safe adjust threads and av settings for current cpu
+      --nicehash        enable nicehash support
  -h, --help            display this help and exit
  -V, --version         output version information and exit
 ```

+## Algorithm variations
+Since version 0.8.0.
+* `--av=1` For CPUs with hardware AES.
+* `--av=2` Lower power mode (double hash) of `1`.
+* `--av=3` Software AES implementation.
+* `--av=4` Lower power mode (double hash) of `3`.
+
 ## Build
 ### Ubuntu (Debian-based distros)
 ```
@@ -62,28 +78,40 @@ git clone https://github.com/xmrig/xmrig.git
 cd xmrig
 mkdir build
 cd build
-cmake ..
+cmake .. -DCMAKE_BUILD_TYPE=Release
 make
 ```

 ### Windows
 It's complicated, you need [MSYS2](http://www.msys2.org/), custom libcurl build, and of course CMake too.
+
+Necessary MSYS2 packages:
+```
+pacman -Sy
+pacman -S mingw-w64-x86_64-gcc
+pacman -S make
+pacman -S mingw-w64-x86_64-cmake
+pacman -S mingw-w64-x86_64-pkg-config
+```
 Configure options for libcurl:
 ```
-./configure --disable-shared --enable-optimize --enable-threaded-resolver --disable-libcurl-option --disable-ares --disable-rt --disable-ftp --disable-file --disable-ldap --disable-ldaps --disable-rtsp --disable-dict --disable-telnet --disable-tftp --disable-pop3 --disable-imap --disable-smb --disable-smtp --disable-gopher --disable-manual --disable-ipv6 --disable-sspi --disable-crypto-auth --disable-ntlm-wb --disable-tls-srp --disable-unix-sockets --without-zlib --without-winssl --without-ssl --without-libssh2 --without-nghttp2 --disable-cookies --without-ca-bundle
+./configure --disable-shared --enable-optimize --enable-threaded-resolver --disable-libcurl-option --disable-ares --disable-rt --disable-ftp --disable-file --disable-ldap --disable-ldaps --disable-rtsp --disable-dict --disable-telnet --disable-tftp --disable-pop3 --disable-imap --disable-smb --disable-smtp --disable-gopher --disable-manual --disable-ipv6 --disable-sspi --disable-crypto-auth --disable-ntlm-wb --disable-tls-srp --disable-unix-sockets --without-zlib --without-winssl --without-ssl --without-libssh2 --without-nghttp2 --disable-cookies --without-ca-bundle --without-librtmp
 ```
 CMake options:
 ```
-cmake .. -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCURL_INCLUDE_DIR="c:\<path>\curl-7.53.1\include" -DCURL_LIBRARY="c:\<path>\curl-7.53.1\lib\.libs"
+cmake .. -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCURL_INCLUDE_DIR="c:\xmrig-deps\gcc\x64\include" -DCURL_LIBRARY="c:\xmrig-deps\gcc\x64\lib\libcurl.a"
 ```

+### Optional features
+* `-DWITH_LIBCPUID=OFF` Disable libcpuid. Auto configuration of CPU after this will be very limited.
+* `-DWITH_AEON=OFF` Disable CryptoNight-Lite support.
+
 ## Common Issues
 ### HUGE PAGES unavailable
 * Run XMRig as Administrator.
-* Enable SeLockMemoryPrivilege. For Windows 7 pro, or Windows 8 and above see [this article](https://msdn.microsoft.com/en-gb/library/ms190730.aspx).
+* Since version 0.8.0 XMRig automatically enables SeLockMemoryPrivilege for the current user, but a reboot or sign-out is still required. [Manual instruction](https://msdn.microsoft.com/en-gb/library/ms190730.aspx).

 ## Other information
-* Now only support 64 bit operating systems (Windows/Linux).
 * No HTTP support, only stratum protocol support.
 * No TLS support.
 * Default donation 5% (5 minutes in 100 minutes) can be reduced to 1% via command line option `--donate-level`.
@@ -105,3 +133,7 @@ Please note performance is highly dependent on system load. The numbers above ar
 ## Donations
 * XMR: `48edfHu7V9Z84YzzMa6fUueoELZ9ZRXq9VetWzYGzKt52XU5xvqgzYnDK9URnRoJMk1j8nLwEVsaSWJ4fhdUyZijBGUicoD`
 * BTC: `1P7ujsXeX7GxQwHNnJsRMgAdNkFZmNVqJT`
+
+## Contacts
+* support@xmrig.com
+* [reddit](https://www.reddit.com/user/XMRig/)
--- a/algo/cryptonight-lite/cryptonight_lite_aesni.h
+++ b/algo/cryptonight-lite/cryptonight_lite_aesni.h
@@ -0,0 +1,274 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2016-2017 XMRig       <support@xmrig.com>
+ *
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_LITE_AESNI_H
+#define XMRIG_CRYPTONIGHT_LITE_AESNI_H
+
+
+#include <x86intrin.h>
+#include <stdint.h>
+
+// Shared body for aes_genkey_sub1/2/4/8: needs __m128i* xout0 and xout2 in scope, declares a local xout1, updates *xout0 and *xout2 in place; imm8 feeds _mm_aeskeygenassist_si128. sl_xor must be declared before the macro is expanded.
+#define aes_genkey_sub(imm8) \
+    __m128i xout1 = _mm_aeskeygenassist_si128(*xout2, (imm8)); \
+    xout1  = _mm_shuffle_epi32(xout1, 0xFF); \
+    *xout0 = sl_xor(*xout0); \
+    *xout0 = _mm_xor_si128(*xout0, xout1); \
+    xout1  = _mm_aeskeygenassist_si128(*xout0, 0x00);\
+    xout1  = _mm_shuffle_epi32(xout1, 0xAA); \
+    *xout2 = sl_xor(*xout2); \
+    *xout2 = _mm_xor_si128(*xout2, xout1); \
+
+
+// Prefix-XOR over the four 32-bit lanes of tmp1, built from three successive 4-byte left shifts that are each XORed back in:
+// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
+static inline __m128i sl_xor(__m128i tmp1)
+{
+    __m128i tmp4;
+    tmp4 = _mm_slli_si128(tmp1, 0x04);
+    tmp1 = _mm_xor_si128(tmp1, tmp4);
+    tmp4 = _mm_slli_si128(tmp4, 0x04);
+    tmp1 = _mm_xor_si128(tmp1, tmp4);
+    tmp4 = _mm_slli_si128(tmp4, 0x04);
+    tmp1 = _mm_xor_si128(tmp1, tmp4);
+    return tmp1;
+}
+
+// Expands aes_genkey_sub with imm8 = 0x1, updating *xout0 and *xout2 in place.
+static inline void aes_genkey_sub1(__m128i* xout0, __m128i* xout2)
+{
+   aes_genkey_sub(0x1)
+}
+
+// Expands aes_genkey_sub with imm8 = 0x2, updating *xout0 and *xout2 in place.
+static inline void aes_genkey_sub2(__m128i* xout0, __m128i* xout2)
+{
+   aes_genkey_sub(0x2)
+}
+
+// Expands aes_genkey_sub with imm8 = 0x4, updating *xout0 and *xout2 in place.
+static inline void aes_genkey_sub4(__m128i* xout0, __m128i* xout2)
+{
+   aes_genkey_sub(0x4)
+}
+
+// Expands aes_genkey_sub with imm8 = 0x8, updating *xout0 and *xout2 in place.
+static inline void aes_genkey_sub8(__m128i* xout0, __m128i* xout2)
+{
+   aes_genkey_sub(0x8)
+}
+
+// Applies one _mm_aesenc_si128 round with the same key to each of the eight blocks x0..x7, in place.
+static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
+{
+    *x0 = _mm_aesenc_si128(*x0, key);
+    *x1 = _mm_aesenc_si128(*x1, key);
+    *x2 = _mm_aesenc_si128(*x2, key);
+    *x3 = _mm_aesenc_si128(*x3, key);
+    *x4 = _mm_aesenc_si128(*x4, key);
+    *x5 = _mm_aesenc_si128(*x5, key);
+    *x6 = _mm_aesenc_si128(*x6, key);
+    *x7 = _mm_aesenc_si128(*x7, key);
+}
+
+// Produces ten round keys: k0/k1 are memory[0] and memory[1] as loaded; k2..k9 are derived pairwise by aes_genkey_sub1/2/4/8, each step updating the running xout0/xout2 pair.
+static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
+{
+    __m128i xout0 = _mm_load_si128(memory);
+    __m128i xout2 = _mm_load_si128(memory + 1);
+    *k0 = xout0;
+    *k1 = xout2;
+
+     aes_genkey_sub1(&xout0, &xout2);
+    *k2 = xout0;
+    *k3 = xout2;
+
+     aes_genkey_sub2(&xout0, &xout2);
+    *k4 = xout0;
+    *k5 = xout2;
+
+     aes_genkey_sub4(&xout0, &xout2);
+    *k6 = xout0;
+    *k7 = xout2;
+
+     aes_genkey_sub8(&xout0, &xout2);
+    *k8 = xout0;
+    *k9 = xout2;
+}
+
+// Expands input into output: ten round keys come from input[0..1], eight blocks are loaded from input[4..11], and each loop pass runs them through all ten AES rounds and stores them to the next eight output slots, for MEMORY_LITE / sizeof(__m128i) slots in total.
+static inline void cn_explode_scratchpad(const __m128i* input, __m128i* output)
+{
+    // This is more than we have registers, compiler will assign 2 keys on the stack
+    __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
+    __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    aes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+
+    xin0 = _mm_load_si128(input + 4);
+    xin1 = _mm_load_si128(input + 5);
+    xin2 = _mm_load_si128(input + 6);
+    xin3 = _mm_load_si128(input + 7);
+    xin4 = _mm_load_si128(input + 8);
+    xin5 = _mm_load_si128(input + 9);
+    xin6 = _mm_load_si128(input + 10);
+    xin7 = _mm_load_si128(input + 11);
+    // MEMORY_LITE is not defined in this header; the including file has to make it visible first.
+    for (size_t i = 0; __builtin_expect(i < MEMORY_LITE / sizeof(__m128i), 1); i += 8) {
+        aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        // The blocks are not reloaded: each pass re-encrypts the previous pass's result.
+        _mm_store_si128(output + i + 0, xin0);
+        _mm_store_si128(output + i + 1, xin1);
+        _mm_store_si128(output + i + 2, xin2);
+        _mm_store_si128(output + i + 3, xin3);
+        _mm_store_si128(output + i + 4, xin4);
+        _mm_store_si128(output + i + 5, xin5);
+        _mm_store_si128(output + i + 6, xin6);
+        _mm_store_si128(output + i + 7, xin7);
+    }
+}
+
+// Folds input back into output: round keys come from output[2..3], the eight blocks output[4..11] are XORed with each successive group of eight input blocks and run through ten AES rounds, and the result is stored back to output[4..11].
+static inline void cn_implode_scratchpad(const __m128i* input, __m128i* output)
+{
+    // This is more than we have registers, compiler will assign 2 keys on the stack
+    __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
+    __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    aes_genkey(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+
+    xout0 = _mm_load_si128(output + 4);
+    xout1 = _mm_load_si128(output + 5);
+    xout2 = _mm_load_si128(output + 6);
+    xout3 = _mm_load_si128(output + 7);
+    xout4 = _mm_load_si128(output + 8);
+    xout5 = _mm_load_si128(output + 9);
+    xout6 = _mm_load_si128(output + 10);
+    xout7 = _mm_load_si128(output + 11);
+    // MEMORY_LITE is not defined in this header; the including file has to make it visible first.
+    for (size_t i = 0; __builtin_expect(i < MEMORY_LITE / sizeof(__m128i), 1); i += 8)
+    {
+        xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
+        xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
+        xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
+        xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
+        xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
+        xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
+        xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
+        xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+
+        aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+    }
+    // Only output[4..11] are overwritten; output[0..3] are left as they were.
+    _mm_store_si128(output + 4, xout0);
+    _mm_store_si128(output + 5, xout1);
+    _mm_store_si128(output + 6, xout2);
+    _mm_store_si128(output + 7, xout3);
+    _mm_store_si128(output + 8, xout4);
+    _mm_store_si128(output + 9, xout5);
+    _mm_store_si128(output + 10, xout6);
+    _mm_store_si128(output + 11, xout7);
+}
+
+
+#if defined(__x86_64__)
+#   define EXTRACT64(X) _mm_cvtsi128_si64(X)
+// 64x64 -> 128-bit unsigned multiply via unsigned __int128: returns the low 64 bits and stores the high 64 bits in *hi.
+static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi)
+{
+    unsigned __int128 r = (unsigned __int128) a * (unsigned __int128) b;
+    *hi = r >> 64;
+    return (uint64_t) r;
+}
+#elif defined(__i386__)
+#   define HI32(X) \
+    _mm_srli_si128((X), 4)
+
+// EXTRACT64 on 32-bit targets: the low 64 bits of X assembled from two 32-bit extractions (HI32 shifts X right by 4 bytes first).
+#   define EXTRACT64(X) \
+    ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
+    ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32))
+// 64x64 -> 128-bit unsigned multiply built from 32-bit halves: returns the low 64 bits and stores the high 64 bits in *product_hi.
+static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *product_hi) {
+    // multiplier   = ab = a * 2^32 + b
+    // multiplicand = cd = c * 2^32 + d
+    // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
+    uint64_t a = multiplier >> 32;
+    uint64_t b = multiplier & 0xFFFFFFFF;
+    uint64_t c = multiplicand >> 32;
+    uint64_t d = multiplicand & 0xFFFFFFFF;
+
+    //uint64_t ac = a * c;
+    uint64_t ad = a * d;
+    //uint64_t bc = b * c;
+    uint64_t bd = b * d;
+    // The middle term a*d + b*c can wrap past 64 bits; the wrap is recorded as a carry worth 2^32 in the high word.
+    uint64_t adbc = ad + (b * c);
+    uint64_t adbc_carry = adbc < ad ? 1 : 0;
+
+    // multiplier * multiplicand = product_hi * 2^64 + product_lo
+    uint64_t product_lo = bd + (adbc << 32);
+    uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
+    *product_hi = (a * c) + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
+
+    return product_lo;
+}
+#endif
+
+// Stores tmp to mem_out as two 64-bit words; before the high word is stored, its bits 28-29 are XORed with a 2-bit value looked up in the 0x7531 table, indexed from bits 29, 28 and 24 of that word.
+static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
+{
+    mem_out[0] = EXTRACT64(tmp);
+    // Copy the upper 64 bits of tmp into the lower half so EXTRACT64 can read them.
+    tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
+    uint64_t vh = EXTRACT64(tmp);
+    // x holds bits 24..31 of vh; index is an even shift (0..14) selecting one 2-bit entry of table.
+    uint8_t x = vh >> 24;
+    static const uint16_t table = 0x7531;
+    const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
+    vh ^= ((table >> index) & 0x3) << 28;
+
+    mem_out[1] = vh;
+}
+
+
+#endif /* XMRIG_CRYPTONIGHT_LITE_AESNI_H */
--- a/algo/cryptonight-lite/cryptonight_lite_av1.c
+++ b/algo/cryptonight-lite/cryptonight_lite_av1.c
@@ -0,0 +1,134 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "algo/cryptonight/cryptonight.h"
+#include "algo/cryptonight/cryptonight_monero.h"
+#include "crypto/c_keccak.h"
+#include "cryptonight_lite_aesni.h"
+
+// Single-hash variant 0: Keccak the input into ctx[0]->state, expand it into ctx[0]->memory, run 0x40000 iterations of the AES/multiply loop, fold the memory back into the state, then keccakf and the extra_hashes entry selected by state[0] & 3 writes output.
+void cryptonight_lite_av1_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+    // NOTE(review): l0 is const-qualified, yet the loop below writes through it via pointer casts.
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+    // Every offset is masked with 0xFFFF0, i.e. 16-byte aligned and below 0x100000 (1 MiB).
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
+        cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
+
+        _mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
+        idx0 = EXTRACT64(cx);
+        bx0 = cx;
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+// Single-hash variant 1: same flow as cryptonight_lite_av1_v0, but the first store goes through cryptonight_monero_tweak and the second 64-bit store is XORed with tweak1_2_0; inputs shorter than 43 bytes yield 32 zero bytes in output.
+void cryptonight_lite_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    if (size < 43) {
+        memset(output, 0, 32);
+        return;
+    }
+
+    keccak(input, size, ctx[0]->state, 200);
+    // VARIANT1_INIT is defined outside this file; presumably it declares tweak1_2_0 used below - confirm in cryptonight_monero.h.
+    VARIANT1_INIT(0);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+    // NOTE(review): l0 is const-qualified, yet the loop below writes through it via pointer casts.
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+    // Every offset is masked with 0xFFFF0, i.e. 16-byte aligned and below 0x100000 (1 MiB).
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
+        cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
+
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
+
+        idx0 = EXTRACT64(cx);
+        bx0 = cx;
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
--- a/algo/cryptonight-lite/cryptonight_lite_av2.c
+++ b/algo/cryptonight-lite/cryptonight_lite_av2.c
@@ -0,0 +1,202 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "algo/cryptonight/cryptonight.h"
+#include "algo/cryptonight/cryptonight_monero.h"
+#include "cryptonight_lite_aesni.h"
+#include "crypto/c_keccak.h"
+
+
+/*
+ * CryptoNight-Lite, variant 0, double hash, main loop built on the
+ * _mm_aesenc_si128 intrinsic.
+ *
+ * Two blobs of `size` bytes each are read from `input` (the second one starts
+ * at input + size) and processed in an interleaved loop.
+ *
+ * input  - two blobs stored back to back.
+ * size   - length of ONE blob in bytes.
+ * output - receives the first hash at output and the second at output + 32.
+ * ctx    - ctx[0] and ctx[1] each supply a Keccak state buffer and a scratchpad.
+ */
+void cryptonight_lite_av2_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    // Both scratchpads are written inside the loop, so the pointers are not const-qualified.
+    uint8_t* l0  = ctx[0]->memory;
+    uint8_t* l1  = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        // AES round over the block selected by each index, keyed with (ah:al).
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]);
+
+        cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+
+        _mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0));
+        _mm_store_si128((__m128i *) &l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1));
+
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;
+        bx1 = cx1;
+
+        uint64_t hi, lo, cl, ch;
+
+        // Multiply step for hash 0; the block address is fixed until idx0 is reassigned.
+        uint64_t* slot0 = (uint64_t*) &l0[idx0 & 0xFFFF0];
+        cl = slot0[0];
+        ch = slot0[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        slot0[0] = al0;
+        slot0[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+
+        // Multiply step for hash 1.
+        uint64_t* slot1 = (uint64_t*) &l1[idx1 & 0xFFFF0];
+        cl = slot1[0];
+        ch = slot1[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        slot1[0] = al1;
+        slot1[1] = ah1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    // The low two bits of each state's first byte pick the final hash function.
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32);
+}
+
+
+/*
+ * CryptoNight-Lite, variant 1, double hash, main loop built on the
+ * _mm_aesenc_si128 intrinsic.
+ *
+ * Two blobs of `size` bytes each are read from `input` (the second one starts
+ * at input + size) and processed in an interleaved loop.
+ *
+ * input  - two blobs stored back to back.
+ * size   - length of ONE blob in bytes; sizes below 43 are rejected.
+ * output - receives the first hash at output and the second at output + 32;
+ *          all 64 bytes are zero-filled on rejection.
+ * ctx    - ctx[0] and ctx[1] each supply a Keccak state buffer and a scratchpad.
+ */
+void cryptonight_lite_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    // NOTE(review): VARIANT1_INIT is defined elsewhere; this guard presumably
+    // keeps its read of each blob in bounds - confirm against the macro.
+    if (size < 43) {
+        memset(output, 0, 64);
+        return;
+    }
+
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    // Expected to declare tweak1_2_0 / tweak1_2_1, which are XORed into the stores below.
+    VARIANT1_INIT(0);
+    VARIANT1_INIT(1);
+
+    // Both scratchpads are written inside the loop, so the pointers are not const-qualified.
+    uint8_t* l0  = ctx[0]->memory;
+    uint8_t* l1  = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        // AES round over the block selected by each index, keyed with (ah:al).
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]);
+
+        cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+
+        // bx ^ cx goes back to the same block through the variant 1 tweak helper.
+        cryptonight_monero_tweak((uint64_t*) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0));
+        cryptonight_monero_tweak((uint64_t*) &l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1));
+
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;
+        bx1 = cx1;
+
+        uint64_t hi, lo, cl, ch;
+
+        // Multiply step for hash 0; the block address is fixed until idx0 is reassigned.
+        uint64_t* slot0 = (uint64_t*) &l0[idx0 & 0xFFFF0];
+        cl = slot0[0];
+        ch = slot0[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        slot0[0] = al0;
+        slot0[1] = ah0 ^ tweak1_2_0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+
+        // Multiply step for hash 1.
+        uint64_t* slot1 = (uint64_t*) &l1[idx1 & 0xFFFF0];
+        cl = slot1[0];
+        ch = slot1[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        slot1[0] = al1;
+        slot1[1] = ah1 ^ tweak1_2_1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    // The low two bits of each state's first byte pick the final hash function.
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32);
+}
--- a/algo/cryptonight-lite/cryptonight_lite_av3.c
+++ b/algo/cryptonight-lite/cryptonight_lite_av3.c
@@ -0,0 +1,134 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "algo/cryptonight/cryptonight.h"
+#include "algo/cryptonight/cryptonight_monero.h"
+#include "cryptonight_lite_softaes.h"
+#include "crypto/c_keccak.h"
+
+
+/*
+ * CryptoNight-Lite, variant 0, single hash, main loop built on the software
+ * AES round soft_aesenc.
+ *
+ * input  - blob to hash.
+ * size   - length of the blob in bytes.
+ * output - receives the hash produced by the selected extra_hashes function.
+ * ctx    - ctx[0] supplies the Keccak state buffer and the scratchpad memory.
+ */
+void cryptonight_lite_av3_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    // The scratchpad is written inside the loop, so the pointer is not const-qualified.
+    uint8_t* l0  = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0  = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        // AES round over the 16-byte block selected by idx0, keyed with (ah0:al0).
+        __m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
+        cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
+
+        _mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
+        idx0 = EXTRACT64(cx);
+        bx0  = cx;
+
+        // idx0 does not change until the end of the iteration, so the block
+        // address is computed once and reused for the loads and the stores.
+        uint64_t* slot = (uint64_t*) &l0[idx0 & 0xFFFF0];
+
+        uint64_t hi, lo, cl, ch;
+        cl = slot[0];
+        ch = slot[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        slot[0] = al0;
+        slot[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+
+    // The low two bits of the first state byte pick the final hash function.
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+/*
+ * CryptoNight-Lite, variant 1, single hash, main loop built on the software
+ * AES round soft_aesenc.
+ *
+ * input  - blob to hash.
+ * size   - length of the blob in bytes; blobs shorter than 43 bytes are rejected.
+ * output - receives the hash; the first 32 bytes are zero-filled on rejection.
+ * ctx    - ctx[0] supplies the Keccak state buffer and the scratchpad memory.
+ */
+void cryptonight_lite_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    // NOTE(review): VARIANT1_INIT is defined elsewhere; this guard presumably
+    // keeps its read of the blob in bounds - confirm against the macro.
+    if (size < 43) {
+        memset(output, 0, 32);
+        return;
+    }
+
+    keccak(input, size, ctx[0]->state, 200);
+
+    // Expected to declare tweak1_2_0, which is XORed into the store in the loop below.
+    VARIANT1_INIT(0);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    // The scratchpad is written inside the loop, so the pointer is not const-qualified.
+    uint8_t* l0  = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0  = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        // AES round over the 16-byte block selected by idx0, keyed with (ah0:al0).
+        __m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
+        cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
+
+        // bx0 ^ cx goes back to the same block; the helper flips bits 28-29
+        // of the upper 64-bit half before storing it.
+        cryptonight_monero_tweak((uint64_t*) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
+
+        idx0 = EXTRACT64(cx);
+        bx0  = cx;
+
+        // idx0 does not change until the end of the iteration, so the block
+        // address is computed once and reused for the loads and the stores.
+        uint64_t* slot = (uint64_t*) &l0[idx0 & 0xFFFF0];
+
+        uint64_t hi, lo, cl, ch;
+        cl = slot[0];
+        ch = slot[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        slot[0] = al0;
+        slot[1] = ah0 ^ tweak1_2_0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+
+    // The low two bits of the first state byte pick the final hash function.
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
--- a/algo/cryptonight-lite/cryptonight_lite_av4.c
+++ b/algo/cryptonight-lite/cryptonight_lite_av4.c
@@ -0,0 +1,202 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "algo/cryptonight/cryptonight.h"
+#include "algo/cryptonight/cryptonight_monero.h"
+#include "cryptonight_lite_softaes.h"
+#include "crypto/c_keccak.h"
+
+
+/*
+ * CryptoNight-Lite, variant 0, double hash, main loop built on the software
+ * AES round soft_aesenc.
+ *
+ * Two blobs of `size` bytes each are read from `input` (the second one starts
+ * `size` bytes after the first) and processed in an interleaved loop.
+ *
+ * input  - two blobs stored back to back.
+ * size   - length of ONE blob in bytes.
+ * output - receives the first hash at byte offset 0 and the second at byte offset 32.
+ * ctx    - ctx[0] and ctx[1] each supply a Keccak state buffer and a scratchpad.
+ */
+void cryptonight_lite_av4_v0(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    // Arithmetic on void* is a GNU extension; do the byte offsets on typed pointers.
+    const uint8_t* in = (const uint8_t*) input;
+
+    keccak(in,        size, ctx[0]->state, 200);
+    keccak(in + size, size, ctx[1]->state, 200);
+
+    // Both scratchpads are written inside the loop, so the pointers are not const-qualified.
+    uint8_t* l0  = ctx[0]->memory;
+    uint8_t* l1  = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        // AES round over the block selected by each index, keyed with (ah:al).
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]);
+
+        cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
+
+        _mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0));
+        _mm_store_si128((__m128i *) &l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1));
+
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;
+        bx1 = cx1;
+
+        uint64_t hi, lo, cl, ch;
+
+        // Multiply step for hash 0; the block address is fixed until idx0 is reassigned.
+        uint64_t* slot0 = (uint64_t*) &l0[idx0 & 0xFFFF0];
+        cl = slot0[0];
+        ch = slot0[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        slot0[0] = al0;
+        slot0[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+
+        // Multiply step for hash 1.
+        uint64_t* slot1 = (uint64_t*) &l1[idx1 & 0xFFFF0];
+        cl = slot1[0];
+        ch = slot1[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        slot1[0] = al1;
+        slot1[1] = ah1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    // The low two bits of each state's first byte pick the final hash function.
+    // The second hash lands 32 bytes into output; cast first, void* has no element size.
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32);
+}
+
+
+/*
+ * CryptoNight-Lite, variant 1, double hash, main loop built on the software
+ * AES round soft_aesenc.
+ *
+ * Two blobs of `size` bytes each are read from `input` (the second one starts
+ * `size` bytes after the first) and processed in an interleaved loop.
+ *
+ * input  - two blobs stored back to back.
+ * size   - length of ONE blob in bytes; sizes below 43 are rejected.
+ * output - receives the first hash at byte offset 0 and the second at byte
+ *          offset 32; all 64 bytes are zero-filled on rejection.
+ * ctx    - ctx[0] and ctx[1] each supply a Keccak state buffer and a scratchpad.
+ */
+void cryptonight_lite_av4_v1(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    // NOTE(review): VARIANT1_INIT is defined elsewhere; this guard presumably
+    // keeps its read of each blob in bounds - confirm against the macro.
+    if (size < 43) {
+        memset(output, 0, 64);
+        return;
+    }
+
+    // Arithmetic on void* is a GNU extension; do the byte offset on a typed pointer.
+    const uint8_t* in = (const uint8_t*) input;
+
+    keccak(in,        size, ctx[0]->state, 200);
+    keccak(in + size, size, ctx[1]->state, 200);
+
+    // Expected to declare tweak1_2_0 / tweak1_2_1, which are XORed into the stores below.
+    // NOTE(review): if the macro offsets `input` directly it still relies on
+    // void* arithmetic here - confirm against its definition.
+    VARIANT1_INIT(0);
+    VARIANT1_INIT(1);
+
+    // Both scratchpads are written inside the loop, so the pointers are not const-qualified.
+    uint8_t* l0  = ctx[0]->memory;
+    uint8_t* l1  = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        // AES round over the block selected by each index, keyed with (ah:al).
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]);
+
+        cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
+
+        // bx ^ cx goes back to the same block; the helper flips bits 28-29
+        // of the upper 64-bit half before storing it.
+        cryptonight_monero_tweak((uint64_t*) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0));
+        cryptonight_monero_tweak((uint64_t*) &l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1));
+
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;
+        bx1 = cx1;
+
+        uint64_t hi, lo, cl, ch;
+
+        // Multiply step for hash 0; the block address is fixed until idx0 is reassigned.
+        uint64_t* slot0 = (uint64_t*) &l0[idx0 & 0xFFFF0];
+        cl = slot0[0];
+        ch = slot0[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        slot0[0] = al0;
+        slot0[1] = ah0 ^ tweak1_2_0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+
+        // Multiply step for hash 1.
+        uint64_t* slot1 = (uint64_t*) &l1[idx1 & 0xFFFF0];
+        cl = slot1[0];
+        ch = slot1[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        slot1[0] = al1;
+        slot1[1] = ah1 ^ tweak1_2_1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    // The low two bits of each state's first byte pick the final hash function.
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32);
+}
--- a/algo/cryptonight-lite/cryptonight_lite_softaes.h
+++ b/algo/cryptonight-lite/cryptonight_lite_softaes.h
@@ -0,0 +1,255 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_LITE_SOFTAES_H
+#define XMRIG_CRYPTONIGHT_LITE_SOFTAES_H
+
+
+#include <x86intrin.h>
+#include <stdint.h>
+
+
+#include "crypto/soft_aes.h"
+
+
+// Prefix-XOR over the four 32-bit lanes of tmp1, lowest lane first:
+// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
+// Built from three successive 4-byte left shifts, each folded into the result.
+static inline __m128i sl_xor(__m128i tmp1)
+{
+    __m128i shifted = _mm_slli_si128(tmp1, 0x04);
+    __m128i folded  = _mm_xor_si128(tmp1, shifted);
+
+    shifted = _mm_slli_si128(shifted, 0x04);
+    folded  = _mm_xor_si128(folded, shifted);
+
+    shifted = _mm_slli_si128(shifted, 0x04);
+    return _mm_xor_si128(folded, shifted);
+}
+
+
+// One key-expansion step: updates the key pair (*xout0, *xout2) in place.
+// *xout0 is derived first from the old *xout2 and rcon; *xout2 is then derived
+// from the freshly written *xout0 with a zero round constant.
+static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t rcon)
+{
+    // 0xFF broadcasts the 4th 32-bit element of the assist word (see PSHUFD).
+    const __m128i assist0 = _mm_shuffle_epi32(soft_aeskeygenassist(*xout2, rcon), 0xFF);
+    *xout0 = _mm_xor_si128(sl_xor(*xout0), assist0);
+
+    // 0xAA broadcasts the 3rd 32-bit element of the assist word (see PSHUFD).
+    const __m128i assist2 = _mm_shuffle_epi32(soft_aeskeygenassist(*xout0, 0x00), 0xAA);
+    *xout2 = _mm_xor_si128(sl_xor(*xout2), assist2);
+}
+
+
+// Applies one soft_aesenc round with the same `key` to each of the eight
+// blocks x0..x7, in that order, updating every block in place.
+static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
+{
+    __m128i* const blocks[8] = { x0, x1, x2, x3, x4, x5, x6, x7 };
+
+    for (size_t n = 0; n < 8; n++) {
+        *blocks[n] = soft_aesenc(*blocks[n], key);
+    }
+}
+
+
+// Derives ten round keys k0..k9 from the two 16-byte words at memory[0] and
+// memory[1]. k0/k1 are those words unchanged; each following pair is produced
+// by one aes_genkey_sub step with round constants 0x1, 0x2, 0x4 and 0x8.
+static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
+{
+    __m128i* const keys[10] = { k0, k1, k2, k3, k4, k5, k6, k7, k8, k9 };
+
+    __m128i even = _mm_load_si128(memory);
+    __m128i odd  = _mm_load_si128(memory + 1);
+
+    *keys[0] = even;
+    *keys[1] = odd;
+
+    uint8_t rcon = 0x1;
+    for (int pair = 1; pair < 5; pair++) {
+        aes_genkey_sub(&even, &odd, rcon);
+        *keys[2 * pair]     = even;
+        *keys[2 * pair + 1] = odd;
+
+        // Round constants double each step: 0x1, 0x2, 0x4, 0x8.
+        rcon = (uint8_t) (rcon << 1);
+    }
+}
+
+
+// Fills the scratchpad at `output` (MEMORY_LITE bytes) from the state at `input`.
+//
+// Ten round keys come from input[0..1]; the eight 16-byte words input[4..11]
+// are then pushed through all ten rounds again and again, and after every pass
+// the eight results are written to the next eight 16-byte slots of `output`.
+static inline void cn_explode_scratchpad(const __m128i* input, __m128i* output)
+{
+    // 8 working blocks + 10 round keys.
+    __m128i block[8];
+    __m128i key[10];
+
+    aes_genkey(input, &key[0], &key[1], &key[2], &key[3], &key[4], &key[5], &key[6], &key[7], &key[8], &key[9]);
+
+    for (size_t n = 0; n < 8; n++) {
+        block[n] = _mm_load_si128(input + 4 + n);
+    }
+
+    for (size_t i = 0; i < MEMORY_LITE / sizeof(__m128i); i += 8) {
+        for (size_t r = 0; r < 10; r++) {
+            aes_round(key[r], &block[0], &block[1], &block[2], &block[3], &block[4], &block[5], &block[6], &block[7]);
+        }
+
+        for (size_t n = 0; n < 8; n++) {
+            _mm_store_si128(output + i + n, block[n]);
+        }
+    }
+}
+
+
+// Folds the scratchpad at `input` (MEMORY_LITE bytes) back into the state at `output`.
+//
+// Ten round keys come from output[2..3]; the eight 16-byte words output[4..11]
+// are the accumulator. For every group of eight scratchpad words the group is
+// XORed into the accumulator, which then goes through all ten rounds. The final
+// accumulator is written back to output[4..11].
+static inline void cn_implode_scratchpad(const __m128i* input, __m128i* output)
+{
+    // 8 accumulator blocks + 10 round keys.
+    __m128i block[8];
+    __m128i key[10];
+
+    aes_genkey(output + 2, &key[0], &key[1], &key[2], &key[3], &key[4], &key[5], &key[6], &key[7], &key[8], &key[9]);
+
+    for (size_t n = 0; n < 8; n++) {
+        block[n] = _mm_load_si128(output + 4 + n);
+    }
+
+    for (size_t i = 0; __builtin_expect(i < MEMORY_LITE / sizeof(__m128i), 1); i += 8) {
+        for (size_t n = 0; n < 8; n++) {
+            block[n] = _mm_xor_si128(_mm_load_si128(input + i + n), block[n]);
+        }
+
+        for (size_t r = 0; r < 10; r++) {
+            aes_round(key[r], &block[0], &block[1], &block[2], &block[3], &block[4], &block[5], &block[6], &block[7]);
+        }
+    }
+
+    for (size_t n = 0; n < 8; n++) {
+        _mm_store_si128(output + 4 + n, block[n]);
+    }
+}
+
+
+#if defined(__x86_64__)
+#   define EXTRACT64(X) _mm_cvtsi128_si64(X)
+
+// Full 64x64 -> 128 bit unsigned multiply: returns the low 64 bits of a * b
+// and stores the high 64 bits in *hi.
+static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi)
+{
+    const unsigned __int128 product = (unsigned __int128) a * (unsigned __int128) b;
+
+    *hi = (uint64_t) (product >> 64);
+    return (uint64_t) product;
+}
+#elif defined(__i386__)
+#   define HI32(X) \
+    _mm_srli_si128((X), 4)
+
+
+#   define EXTRACT64(X) \
+    ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
+    ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32))
+
+// Full 64x64 -> 128 bit unsigned multiply built from 32-bit halves: returns the
+// low 64 bits of the product and stores the high 64 bits in *product_hi.
+static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *product_hi) {
+    // With multiplier = m_hi * 2^32 + m_lo and multiplicand = n_hi * 2^32 + n_lo:
+    // product = m_hi * n_hi * 2^64 + (m_hi * n_lo + m_lo * n_hi) * 2^32 + m_lo * n_lo
+    const uint64_t m_hi = multiplier >> 32;
+    const uint64_t m_lo = multiplier & 0xFFFFFFFF;
+    const uint64_t n_hi = multiplicand >> 32;
+    const uint64_t n_lo = multiplicand & 0xFFFFFFFF;
+
+    const uint64_t low_term    = m_lo * n_lo;
+    const uint64_t cross_first = m_hi * n_lo;
+
+    // The sum of the two cross terms can wrap; remember the lost bit.
+    const uint64_t cross_sum   = cross_first + (m_lo * n_hi);
+    const uint64_t cross_carry = cross_sum < cross_first ? 1 : 0;
+
+    // Adding the shifted cross terms to the low term can wrap as well.
+    const uint64_t product_lo = low_term + (cross_sum << 32);
+    const uint64_t low_carry  = product_lo < low_term ? 1 : 0;
+
+    *product_hi = (m_hi * n_hi) + (cross_sum >> 32) + (cross_carry << 32) + low_carry;
+
+    return product_lo;
+}
+#endif
+
+
+// Stores the 128-bit value `tmp` to mem_out[0..1]. The low 64 bits are stored
+// unchanged; in the high 64 bits, bits 28-29 are XORed with a 2-bit value
+// looked up in the constant 0x7531, indexed by bits 0, 4 and 5 of byte 3 of
+// that high half.
+static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
+{
+    // Bring the upper 64 bits of tmp down into the low half so EXTRACT64 can read them.
+    const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
+
+    mem_out[0] = EXTRACT64(tmp);
+
+    uint64_t high = EXTRACT64(upper);
+
+    static const uint16_t table = 0x7531;
+    const uint8_t selector = high >> 24;
+    const uint8_t shift    = (((selector >> 3) & 6) | (selector & 1)) << 1;
+
+    high ^= ((table >> shift) & 0x3) << 28;
+
+    mem_out[1] = high;
+}
+
+
+#endif /* XMRIG_CRYPTONIGHT_LITE_SOFTAES_H */
--- a/algo/cryptonight/bmi2/CMakeLists.txt
+++ b/algo/cryptonight/bmi2/CMakeLists.txt
@@ -1,2 +0,0 @@
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mbmi2")
-add_library(cryptonight_av3_aesni_bmi2 STATIC ../cryptonight_av3_aesni_bmi2.c)
--- a/algo/cryptonight/cryptonight.c
+++ b/algo/cryptonight/cryptonight.c
@@ -0,0 +1,407 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <mm_malloc.h>
+
+
+#ifndef BUILD_TEST
+#   include "xmrig.h"
+#endif
+
+#include "cpu.h"
+#include "crypto/c_blake256.h"
+#include "crypto/c_groestl.h"
+#include "crypto/c_jh.h"
+#include "crypto/c_skein.h"
+#include "cryptonight_test.h"
+#include "cryptonight.h"
+#include "options.h"
+#include "persistent_memory.h"
+
+
+static cn_hash_fun asm_func_map[AV_MAX][VARIANT_MAX][ASM_MAX]; /* static storage: zero-initialized (all NULL); "= {}" is not ISO C before C23 */
+
+
+void cryptonight_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av1_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av1_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av2_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av2_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av2_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av3_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av3_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av3_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av4_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av4_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av4_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+
+void cryptonight_r_av1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_r_av2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_r_av3(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_r_av4(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+
+
+#ifndef XMRIG_NO_AEON
+void cryptonight_lite_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av1_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av2_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av2_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av3_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av3_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av4_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av4_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+#endif
+
+
+#ifndef XMRIG_NO_ASM
+void cryptonight_single_hash_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_single_hash_asm_ryzen(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_single_hash_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_double_hash_asm(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+
+void cryptonight_r_av1_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_r_av1_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_r_av2_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_r_av2_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+#endif
+
+
+static inline bool verify(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue)
+{
+    cn_hash_fun func = cryptonight_hash_fn(opt_algo, opt_av, variant);
+    if (func == NULL) {
+        return false;
+    }
+
+    func(test_input, 76, output, ctx);
+
+    return memcmp(output, referenceValue, opt_double_hash ? 64 : 32) == 0;
+}
+
+
+static inline bool verify2(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue)
+{
+    cn_hash_fun func = cryptonight_hash_fn(opt_algo, opt_av, variant);
+    if (func == NULL) {
+        return false;
+    }
+
+    if (opt_double_hash) {
+        uint8_t input[128];
+
+        for (size_t i = 0; i < (sizeof(cn_r_test_input) / sizeof(cn_r_test_input[0])); ++i) {
+            const size_t size = cn_r_test_input[i].size;
+            memcpy(input,        cn_r_test_input[i].data, size);
+            memcpy(input + size, cn_r_test_input[i].data, size);
+
+            ctx[0]->height = ctx[1]->height = cn_r_test_input[i].height;
+
+            func(input, size, output, ctx);
+
+            if (memcmp(output, referenceValue + i * 32, 32) != 0 || memcmp(output + 32, referenceValue + i * 32, 32) != 0) {
+                return false;
+            }
+        }
+    }
+    else {
+        for (size_t i = 0; i < (sizeof(cn_r_test_input) / sizeof(cn_r_test_input[0])); ++i) {
+            ctx[0]->height = cn_r_test_input[i].height;
+
+            func(cn_r_test_input[i].data, cn_r_test_input[i].size, output, ctx);
+
+            if (memcmp(output, referenceValue + i * 32, 32) != 0) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+
+static bool self_test(void) { /* hashes the built-in test vectors with the selected algo/av and compares them with the reference outputs */
+    struct cryptonight_ctx *ctx[2]; /* one context per hash computed by a single call */
+    uint8_t output[64];             /* room for two 32-byte digests (double-hash mode) */
+
+    const size_t count = opt_double_hash ? 2 : 1;
+    const size_t size  = opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE;
+    bool result = false; /* stays false when no branch below handles opt_algo */
+
+    for (size_t i = 0; i < count; ++i) {
+        ctx[i]         = _mm_malloc(sizeof(struct cryptonight_ctx), 16); /* NOTE(review): allocation results are not checked for NULL */
+        ctx[i]->memory = _mm_malloc(size, 16);
+
+        init_cn_r(ctx[i]);
+    }
+
+    if (opt_algo == ALGO_CRYPTONIGHT) {
+        result = verify(VARIANT_0,  output, ctx, test_output_v0) &&
+                 verify(VARIANT_1,  output, ctx, test_output_v1) &&
+                 verify(VARIANT_2,  output, ctx, test_output_v2) &&
+                 verify2(VARIANT_4, output, ctx, test_output_r);
+    }
+#   ifndef XMRIG_NO_AEON
+    else {
+        result = verify(VARIANT_0, output, ctx, test_output_v0_lite) &&
+                 verify(VARIANT_1, output, ctx, test_output_v1_lite);
+    }
+#   endif
+
+
+    for (size_t i = 0; i < count; ++i) {
+        _mm_free(ctx[i]->memory);
+        _mm_free(ctx[i]);
+    }
+
+    return result;
+}
+
+
+#ifndef XMRIG_NO_ASM
+cn_hash_fun cryptonight_hash_asm_fn(enum AlgoVariant av, enum Variant variant, enum Assembly assembly)
+{
+    if (assembly == ASM_AUTO) {
+        assembly = (enum Assembly) cpu_info.assembly;
+    }
+
+    if (assembly == ASM_NONE) {
+        return NULL;
+    }
+
+    return asm_func_map[av][variant][assembly];
+}
+#endif
+
+
+cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant)
+{
+    assert(av > AV_AUTO && av < AV_MAX);
+    assert(variant > VARIANT_AUTO && variant < VARIANT_MAX);
+    /* Returns the hash routine for (algorithm, av, variant): a registered assembly routine first, else the C table. */
+#   ifndef XMRIG_NO_ASM
+    if (algorithm == ALGO_CRYPTONIGHT) {
+        cn_hash_fun fun = cryptonight_hash_asm_fn(av, variant, opt_assembly);
+        if (fun) {
+            return fun;
+        }
+    }
+#   endif
+    /* Flattened lookup indexed by VARIANT_MAX * 4 * algorithm + 4 * variant + (av - 1); NULL = no implementation. */
+    static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = {
+        cryptonight_av1_v0,
+        cryptonight_av2_v0,
+        cryptonight_av3_v0,
+        cryptonight_av4_v0,
+        cryptonight_av1_v1,
+        cryptonight_av2_v1,
+        cryptonight_av3_v1,
+        cryptonight_av4_v1,
+        cryptonight_av1_v2,
+        cryptonight_av2_v2,
+        cryptonight_av3_v2,
+        cryptonight_av4_v2,
+
+        cryptonight_r_av1,
+        cryptonight_r_av2,
+        cryptonight_r_av3,
+        cryptonight_r_av4,
+
+#       ifndef XMRIG_NO_AEON
+        cryptonight_lite_av1_v0,
+        cryptonight_lite_av2_v0,
+        cryptonight_lite_av3_v0,
+        cryptonight_lite_av4_v0,
+        cryptonight_lite_av1_v1,
+        cryptonight_lite_av2_v1,
+        cryptonight_lite_av3_v1,
+        cryptonight_lite_av4_v1,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+#       else
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+#       endif
+    };
+
+#   ifndef NDEBUG
+    const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1;
+
+    assert(index < sizeof(func_table) / sizeof(func_table[0])); /* check the bound before the table is read */
+
+    cn_hash_fun func = func_table[index];
+    assert(func != NULL);
+
+    return func;
+#   else
+    return func_table[VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1];
+#   endif
+}
+
+
+bool cryptonight_init(int av)
+{
+    opt_double_hash = av == AV_DOUBLE || av == AV_DOUBLE_SOFT;
+    /* Register the assembly routines per (av, variant, asm flavour), then run the self-test and return its result. */
+#   ifndef XMRIG_NO_ASM
+    asm_func_map[AV_SINGLE][VARIANT_2][ASM_INTEL]     = cryptonight_single_hash_asm_intel;
+    asm_func_map[AV_SINGLE][VARIANT_2][ASM_RYZEN]     = cryptonight_single_hash_asm_ryzen; /* was the Intel routine; the Ryzen one is declared in this file but was never registered */
+    asm_func_map[AV_SINGLE][VARIANT_2][ASM_BULLDOZER] = cryptonight_single_hash_asm_bulldozer;
+
+    asm_func_map[AV_DOUBLE][VARIANT_2][ASM_INTEL]     = cryptonight_double_hash_asm;
+    asm_func_map[AV_DOUBLE][VARIANT_2][ASM_RYZEN]     = cryptonight_double_hash_asm;
+    asm_func_map[AV_DOUBLE][VARIANT_2][ASM_BULLDOZER] = cryptonight_double_hash_asm;
+
+    asm_func_map[AV_SINGLE][VARIANT_4][ASM_INTEL]     = cryptonight_r_av1_asm_intel;
+    asm_func_map[AV_SINGLE][VARIANT_4][ASM_RYZEN]     = cryptonight_r_av1_asm_intel; /* no Ryzen-specific cn/r routine is declared; the Intel one is reused */
+    asm_func_map[AV_SINGLE][VARIANT_4][ASM_BULLDOZER] = cryptonight_r_av1_asm_bulldozer;
+
+    asm_func_map[AV_DOUBLE][VARIANT_4][ASM_INTEL]     = cryptonight_r_av2_asm_intel;
+    asm_func_map[AV_DOUBLE][VARIANT_4][ASM_RYZEN]     = cryptonight_r_av2_asm_intel; /* same: Intel routine reused for Ryzen */
+    asm_func_map[AV_DOUBLE][VARIANT_4][ASM_BULLDOZER] = cryptonight_r_av2_asm_bulldozer;
+#   endif
+
+    return self_test();
+}
+
+
+static inline void do_blake_hash(const void* input, size_t len, char* output) {
+    blake256_hash((uint8_t*)output, input, len);
+}
+
+
+static inline void do_groestl_hash(const void* input, size_t len, char* output) {
+    groestl(input, len * 8, (uint8_t*)output);
+}
+
+
+static inline void do_jh_hash(const void* input, size_t len, char* output) {
+    jh_hash(32 * 8, input, 8 * len, (uint8_t*)output);
+}
+
+
+static inline void do_skein_hash(const void* input, size_t len, char* output) {
+    skein_hash(8 * 32, input, 8 * len, (uint8_t*)output);
+}
+
+
+void (* const extra_hashes[4])(const void *, size_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
+
+
+static inline enum Variant cryptonight_variant(uint8_t version) /* maps a version byte (callers pass blob[0]) to the variant to hash with */
+{
+    if (opt_variant != VARIANT_AUTO) { /* an explicitly configured variant always wins */
+        return opt_variant;
+    }
+
+    if (opt_algo == ALGO_CRYPTONIGHT_LITE) { /* auto mode on the lite algorithm always yields variant 1 */
+        return VARIANT_1;
+    }
+
+    if (version >= 10) { /* thresholds are tested from newest to oldest */
+        return VARIANT_4;
+    }
+
+    if (version >= 8) {
+        return VARIANT_2;
+    }
+
+    return version == 7 ? VARIANT_1 : VARIANT_0; /* exactly 7 -> variant 1; anything lower -> variant 0 */
+}
+
+
+#ifndef BUILD_TEST
+int scanhash_cryptonight(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
+    uint32_t *nonceptr   = (uint32_t*) (((char*) blob) + 39);
+    enum Variant variant = cryptonight_variant(blob[0]);
+
+    do {
+        cryptonight_hash_fn(opt_algo, opt_av, variant)(blob, blob_size, (uint8_t *) hash, ctx);
+
+        (*hashes_done)++;
+
+        if (unlikely(hash[7] < target)) {
+            return 1;
+        }
+
+        (*nonceptr)++;
+    } while (likely(((*nonceptr) < max_nonce && !work_restart[thr_id].restart)));
+
+    return 0;
+}
+
+
+int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
+    int rc               = 0; /* result: 1 = first digest met the target, 2 = second digest did, 0 = none */
+    uint32_t *nonceptr0  = (uint32_t*) (((char*) blob) + 39); /* 32-bit nonce at byte offset 39 of the first blob */
+    uint32_t *nonceptr1  = (uint32_t*) (((char*) blob) + 39 + blob_size); /* same offset, blob_size bytes further on - presumably the second blob stored right after the first */
+    enum Variant variant = cryptonight_variant(blob[0]);
+
+    do {
+        cryptonight_hash_fn(opt_algo, opt_av, variant)(blob, blob_size, (uint8_t *) hash, ctx);
+        (*hashes_done) += 2; /* each call accounts for two hashes */
+
+        if (unlikely(hash[7] < target)) { /* 32-bit word 7: bytes 28..31 of the output */
+            return rc |= 1; /* NOTE(review): returns before hash[15] is examined, so rc can never be 3 */
+        }
+
+        if (unlikely(hash[15] < target)) { /* 32-bit word 15: bytes 60..63 of the output */
+            return rc |= 2;
+        }
+
+        if (rc) { /* NOTE(review): unreachable - rc is still 0 here because both hits return above */
+            break;
+        }
+
+        (*nonceptr0)++; /* both nonces advance in lockstep */
+        (*nonceptr1)++;
+    } while (likely(((*nonceptr0) < max_nonce && !work_restart[thr_id].restart))); /* only the first nonce is compared with max_nonce */
+
+    return rc;
+}
+#endif
--- a/algo/cryptonight/cryptonight.h
+++ b/algo/cryptonight/cryptonight.h
@@ -4,8 +4,10 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@@ -21,52 +23,59 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __CRYPTONIGHT_H__
-#define __CRYPTONIGHT_H__
+#ifndef XMRIG_CRYPTONIGHT_H
+#define XMRIG_CRYPTONIGHT_H
+

 #include <stddef.h>
 #include <stdint.h>
+#include <stdbool.h>

-#define MEMORY          (1 << 21) /* 2 MiB */
-#define MEMORY_M128I    (MEMORY >> 4) // 2 MiB / 16 = 128 ki * __m128i
-#define ITER            (1 << 20)
-#define AES_BLOCK_SIZE  16
-#define AES_KEY_SIZE    32 /*16*/
-#define INIT_SIZE_BLK   8
-#define INIT_SIZE_BYTE  (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128
-#define INIT_SIZE_M128I (INIT_SIZE_BYTE >> 4) // 8

-#pragma pack(push, 1)
-union hash_state {
-  uint8_t b[200];
-  uint64_t w[25];
-};
-#pragma pack(pop)
+#include "options.h"

-#pragma pack(push, 1)
-union cn_slow_hash_state {
-    union hash_state hs;
-    struct {
-        uint8_t k[64];
-        uint8_t init[INIT_SIZE_BYTE];
-    };
-};
-#pragma pack(pop)
+
+#define MEMORY      2097152 /* 2 MiB */
+#define MEMORY_LITE 1048576 /* 1 MiB */
+
+
+#if defined _MSC_VER || defined XMRIG_ARM
+#define ABI_ATTRIBUTE
+#else
+#define ABI_ATTRIBUTE __attribute__((ms_abi))
+#endif
+
+
+struct cryptonight_ctx;
+typedef void(*cn_mainloop_fun_ms_abi)(struct cryptonight_ctx*) ABI_ATTRIBUTE;
+typedef void(*cn_mainloop_double_fun_ms_abi)(struct cryptonight_ctx*, struct cryptonight_ctx*) ABI_ATTRIBUTE;


 struct cryptonight_ctx {
-    union cn_slow_hash_state state;
-    uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(16)));
-    uint64_t a[2] __attribute__((aligned(16)));
-    uint64_t b[2] __attribute__((aligned(16)));
-    uint64_t c[2] __attribute__((aligned(16)));
+    uint8_t state[224] __attribute__((aligned(16)));
+    uint8_t *memory    __attribute__((aligned(16)));
+
+    uint8_t unused[40];
+    const uint32_t *saes_table;
+
+    cn_mainloop_fun_ms_abi generated_code;
+    cn_mainloop_double_fun_ms_abi generated_code_double;
+    uint64_t generated_code_height;
+    uint64_t generated_code_double_height;
+    uint64_t height;
 };


+typedef void (*cn_hash_fun)(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+
+
 extern void (* const extra_hashes[4])(const void *, size_t, char *);

-void cryptonight_init(int variant);
-void cryptonight_hash(void* output, const void* input, size_t input_len);
-int scanhash_cryptonight(int thr_id, uint32_t *hash, uint32_t *restrict pdata, const uint32_t *restrict ptarget, uint32_t max_nonce, unsigned long *restrict hashes_done, const char *memory, struct cryptonight_ctx *persistentctx);
+cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant);

-#endif /* __CRYPTONIGHT_H__ */
+bool cryptonight_init(int av);
+int scanhash_cryptonight(int thr_id, uint32_t *hash, uint8_t *blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *hashes_done, struct cryptonight_ctx **ctx);
+int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *hashes_done, struct cryptonight_ctx **ctx);
+
+
+#endif /* XMRIG_CRYPTONIGHT_H */
--- a/algo/cryptonight/cryptonight_aesni.h
+++ b/algo/cryptonight/cryptonight_aesni.h
@@ -0,0 +1,274 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2016-2017 XMRig       <support@xmrig.com>
+ *
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_AESNI_H
+#define XMRIG_CRYPTONIGHT_AESNI_H
+
+
+#include <x86intrin.h>
+#include <stdint.h>
+
+
+#define aes_genkey_sub(imm8) \
+    __m128i xout1 = _mm_aeskeygenassist_si128(*xout2, (imm8)); \
+    xout1  = _mm_shuffle_epi32(xout1, 0xFF); \
+    *xout0 = sl_xor(*xout0); \
+    *xout0 = _mm_xor_si128(*xout0, xout1); \
+    xout1  = _mm_aeskeygenassist_si128(*xout0, 0x00);\
+    xout1  = _mm_shuffle_epi32(xout1, 0xAA); \
+    *xout2 = sl_xor(*xout2); \
+    *xout2 = _mm_xor_si128(*xout2, xout1); \
+
+
+// This will shift and xor tmp1 into itself as 4 32-bit vals such as
+// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
+static inline __m128i sl_xor(__m128i tmp1)
+{
+    __m128i tmp4;
+    tmp4 = _mm_slli_si128(tmp1, 0x04);
+    tmp1 = _mm_xor_si128(tmp1, tmp4);
+    tmp4 = _mm_slli_si128(tmp4, 0x04);
+    tmp1 = _mm_xor_si128(tmp1, tmp4);
+    tmp4 = _mm_slli_si128(tmp4, 0x04);
+    tmp1 = _mm_xor_si128(tmp1, tmp4);
+    return tmp1;
+}
+
+
+static inline void aes_genkey_sub1(__m128i* xout0, __m128i* xout2)
+{
+   aes_genkey_sub(0x1)
+}
+
+
+static inline void aes_genkey_sub2(__m128i* xout0, __m128i* xout2)
+{
+   aes_genkey_sub(0x2)
+}
+
+
+static inline void aes_genkey_sub4(__m128i* xout0, __m128i* xout2)
+{
+   aes_genkey_sub(0x4)
+}
+
+
+static inline void aes_genkey_sub8(__m128i* xout0, __m128i* xout2)
+{
+   aes_genkey_sub(0x8)
+}
+
+
+static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
+{
+    *x0 = _mm_aesenc_si128(*x0, key);
+    *x1 = _mm_aesenc_si128(*x1, key);
+    *x2 = _mm_aesenc_si128(*x2, key);
+    *x3 = _mm_aesenc_si128(*x3, key);
+    *x4 = _mm_aesenc_si128(*x4, key);
+    *x5 = _mm_aesenc_si128(*x5, key);
+    *x6 = _mm_aesenc_si128(*x6, key);
+    *x7 = _mm_aesenc_si128(*x7, key);
+}
+
+
+static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
+{
+    __m128i xout0 = _mm_load_si128(memory);
+    __m128i xout2 = _mm_load_si128(memory + 1);
+    *k0 = xout0;
+    *k1 = xout2;
+
+     aes_genkey_sub1(&xout0, &xout2);
+    *k2 = xout0;
+    *k3 = xout2;
+
+     aes_genkey_sub2(&xout0, &xout2);
+    *k4 = xout0;
+    *k5 = xout2;
+
+     aes_genkey_sub4(&xout0, &xout2);
+    *k6 = xout0;
+    *k7 = xout2;
+
+     aes_genkey_sub8(&xout0, &xout2);
+    *k8 = xout0;
+    *k9 = xout2;
+}
+
+
+static inline void cn_explode_scratchpad(const __m128i* input, __m128i* output)
+{
+    // This is more than we have registers, compiler will assign 2 keys on the stack
+    __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
+    __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    aes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+
+    xin0 = _mm_load_si128(input + 4);
+    xin1 = _mm_load_si128(input + 5);
+    xin2 = _mm_load_si128(input + 6);
+    xin3 = _mm_load_si128(input + 7);
+    xin4 = _mm_load_si128(input + 8);
+    xin5 = _mm_load_si128(input + 9);
+    xin6 = _mm_load_si128(input + 10);
+    xin7 = _mm_load_si128(input + 11);
+
+    for (size_t i = 0; __builtin_expect(i < MEMORY / sizeof(__m128i), 1); i += 8) {
+        aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+
+        _mm_store_si128(output + i + 0, xin0);
+        _mm_store_si128(output + i + 1, xin1);
+        _mm_store_si128(output + i + 2, xin2);
+        _mm_store_si128(output + i + 3, xin3);
+        _mm_store_si128(output + i + 4, xin4);
+        _mm_store_si128(output + i + 5, xin5);
+        _mm_store_si128(output + i + 6, xin6);
+        _mm_store_si128(output + i + 7, xin7);
+    }
+}
+
+
+static inline void cn_implode_scratchpad(const __m128i* input, __m128i* output)
+{
+    // This is more than we have registers, compiler will assign 2 keys on the stack
+    __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
+    __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    aes_genkey(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+
+    xout0 = _mm_load_si128(output + 4);
+    xout1 = _mm_load_si128(output + 5);
+    xout2 = _mm_load_si128(output + 6);
+    xout3 = _mm_load_si128(output + 7);
+    xout4 = _mm_load_si128(output + 8);
+    xout5 = _mm_load_si128(output + 9);
+    xout6 = _mm_load_si128(output + 10);
+    xout7 = _mm_load_si128(output + 11);
+
+    for (size_t i = 0; __builtin_expect(i < MEMORY / sizeof(__m128i), 1); i += 8)
+    {
+        xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
+        xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
+        xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
+        xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
+        xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
+        xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
+        xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
+        xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+
+        aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+    }
+
+    _mm_store_si128(output + 4, xout0);
+    _mm_store_si128(output + 5, xout1);
+    _mm_store_si128(output + 6, xout2);
+    _mm_store_si128(output + 7, xout3);
+    _mm_store_si128(output + 8, xout4);
+    _mm_store_si128(output + 9, xout5);
+    _mm_store_si128(output + 10, xout6);
+    _mm_store_si128(output + 11, xout7);
+}
+
+
+#if defined(__x86_64__)
+#   define EXTRACT64(X) _mm_cvtsi128_si64(X)
+
+static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi)
+{
+    unsigned __int128 r = (unsigned __int128) a * (unsigned __int128) b;
+    *hi = r >> 64;
+    return (uint64_t) r;
+}
+#elif defined(__i386__)
+#   define HI32(X) \
+    _mm_srli_si128((X), 4)
+
+
+#   define EXTRACT64(X) \
+    ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
+    ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32))
+
+static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *product_hi) {
+    // multiplier   = ab = a * 2^32 + b
+    // multiplicand = cd = c * 2^32 + d
+    // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
+    uint64_t a = multiplier >> 32;
+    uint64_t b = multiplier & 0xFFFFFFFF;
+    uint64_t c = multiplicand >> 32;
+    uint64_t d = multiplicand & 0xFFFFFFFF;
+
+    //uint64_t ac = a * c;
+    uint64_t ad = a * d;
+    //uint64_t bc = b * c;
+    uint64_t bd = b * d;
+
+    uint64_t adbc = ad + (b * c);
+    uint64_t adbc_carry = adbc < ad ? 1 : 0;
+
+    // multiplier * multiplicand = product_hi * 2^64 + product_lo
+    uint64_t product_lo = bd + (adbc << 32);
+    uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
+    *product_hi = (a * c) + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
+
+    return product_lo;
+}
+#endif
+
+
+static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
+{
+    mem_out[0] = EXTRACT64(tmp);
+
+    tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
+    uint64_t vh = EXTRACT64(tmp);
+
+    uint8_t x = vh >> 24;
+    static const uint16_t table = 0x7531;
+    const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
+    vh ^= ((table >> index) & 0x3) << 28;
+
+    mem_out[1] = vh;
+}
+
+
+#endif /* XMRIG_CRYPTONIGHT_AESNI_H */
--- a/algo/cryptonight/cryptonight_av1.c
+++ b/algo/cryptonight/cryptonight_av1.c
@@ -0,0 +1,261 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_aesni.h"
+#include "cryptonight_monero.h"
+
+/* Single-hash `_v0` path on ctx[0]: keccak, scratchpad explode, 0x80000 AES/multiply rounds, implode, final hash. */
+void cryptonight_av1_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;  // scratchpad base; written below through casts despite the const
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];  // same value as al0
+
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);  // mask keeps the offset 16-byte aligned and below 0x200000
+        cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));  // one AES round keyed with (ah0:al0)
+
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+        idx0 = EXTRACT64(cx);  // low 64 bits of cx select the second scratchpad address
+        bx0 = cx;
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);  // 128-bit product of idx0 and cl
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;  // address for the next round's load
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);  // low 2 bits of state[0] pick one of four entries
+}
+
+/* Single-hash `_v1` path on ctx[0]: same round structure as `_v0`, with the tweaked store and tweak1_2_0 XOR. */
+void cryptonight_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    if (size < 43) {  // inputs shorter than 43 bytes are rejected: 32 zero bytes are written to output
+        memset(output, 0, 32);
+        return;
+    }
+
+    keccak(input, size, ctx[0]->state, 200);
+
+    VARIANT1_INIT(0);  // macro defined outside this file; presumably declares tweak1_2_0 used below (confirm)
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;  // scratchpad base; written below through casts despite the const
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];  // same value as al0
+
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);  // mask keeps the offset 16-byte aligned and below 0x200000
+        cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));  // one AES round keyed with (ah0:al0)
+
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));  // tweaked store of bx0 ^ cx
+
+        idx0 = EXTRACT64(cx);  // low 64 bits of cx select the second scratchpad address
+        bx0 = cx;
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);  // 128-bit product of idx0 and cl
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0;  // only the stored copy is XOR-ed; ah0 itself is unchanged
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;  // address for the next round's load
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);  // low 2 bits of state[0] pick one of four entries
+}
+
+/* Single-hash `_v2` path on ctx[0]: `_v0` round structure plus the VARIANT2_* macro steps and a second bx register. */
+void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;  // scratchpad base; written below through casts despite the const
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    VARIANT2_INIT(0);  // macros defined outside this file; they may declare locals used by the other VARIANT2_* steps
+    VARIANT2_SET_ROUNDING_MODE();
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+
+    uint64_t idx0 = al0;
+
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx        = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);  // offset stays 16-byte aligned, below 0x200000
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+
+        cx = _mm_aesenc_si128(cx, ax0);  // one AES round keyed with (ah0:al0)
+
+        VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+        // EXTRACT64 (as in _v0/_v1) instead of the raw _mm_cvtsi128_si64, which only exists on 64-bit targets.
+        idx0 = EXTRACT64(cx);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+
+        VARIANT2_INTEGER_MATH(0, cl, cx);  // runs before the multiply and receives cl by name, so it may modify it
+        lo = _umul128(idx0, cl, &hi);  // 128-bit product of idx0 and cl
+        VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;  // address for the next round's load
+
+        bx1 = bx0;  // keep the previous two AES outputs: bx1 is one round older than bx0
+        bx0 = cx;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);  // low 2 bits of state[0] pick one of four entries
+}
+
+/* Wrappers that run the main loop in external assembly routines; compiled out when XMRIG_NO_ASM is defined. */
+#ifndef XMRIG_NO_ASM
+extern void cnv2_mainloop_ivybridge_asm(struct cryptonight_ctx *ctx);
+extern void cnv2_mainloop_ryzen_asm(struct cryptonight_ctx *ctx);
+extern void cnv2_mainloop_bulldozer_asm(struct cryptonight_ctx *ctx);
+extern void cnv2_double_mainloop_sandybridge_asm(struct cryptonight_ctx* ctx0, struct cryptonight_ctx* ctx1);
+
+/* Single hash on ctx[0]; the main loop is delegated to cnv2_mainloop_ivybridge_asm. */
+void cryptonight_single_hash_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    cnv2_mainloop_ivybridge_asm(ctx[0]);
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    keccakf((uint64_t*) ctx[0]->state, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+/* Single hash on ctx[0]; the main loop is delegated to cnv2_mainloop_ryzen_asm. */
+void cryptonight_single_hash_asm_ryzen(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    cnv2_mainloop_ryzen_asm(ctx[0]);
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    keccakf((uint64_t*) ctx[0]->state, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+/* Single hash on ctx[0]; the main loop is delegated to cnv2_mainloop_bulldozer_asm. */
+void cryptonight_single_hash_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    cnv2_mainloop_bulldozer_asm(ctx[0]);
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    keccakf((uint64_t*) ctx[0]->state, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+/* Two hashes at once: inputs of `size` bytes sit back to back at `input`; results go to output and output + 32. */
+void cryptonight_double_hash_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+    cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory);
+
+    cnv2_double_mainloop_sandybridge_asm(ctx[0], ctx[1]);  // one call runs the main loop for both contexts
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state);
+
+    keccakf((uint64_t*) ctx[0]->state, 24);
+    keccakf((uint64_t*) ctx[1]->state, 24);
+
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
+#endif
--- a/algo/cryptonight/cryptonight_av1_aesni.c
+++ b/algo/cryptonight/cryptonight_av1_aesni.c
@@ -1,216 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-#include <string.h>
-
-#include "cryptonight.h"
-#include "crypto/c_keccak.h"
-
-
-static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
-{
-    __m128i tmp4;
-    *tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
-    tmp4 = _mm_slli_si128(*tmp1, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    *tmp1 = _mm_xor_si128(*tmp1, *tmp2);
-}
-
-static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
-{
-    __m128i tmp2, tmp4;
-
-    tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
-    tmp2 = _mm_shuffle_epi32(tmp4, 0xAA);
-    tmp4 = _mm_slli_si128(*tmp3, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp2);
-}
-
-// Special thanks to Intel for helping me
-// with ExpandAESKey256() and its subroutines
-static inline void ExpandAESKey256(char *keybuf)
-{
-    __m128i tmp1, tmp2, tmp3, *keys;
-
-    keys = (__m128i *)keybuf;
-
-    tmp1 = _mm_load_si128((__m128i *)keybuf);
-    tmp3 = _mm_load_si128((__m128i *)(keybuf+0x10));
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[2] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[3] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[4] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[5] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[6] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[7] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[8] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[9] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[10] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[11] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[12] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[13] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[14] = tmp1;
-}
-
-void cryptonight_av1_aesni(void *restrict output, const void *restrict input, const char *restrict memory, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *)input, 76, (uint8_t *) &ctx->state.hs, 200);
-    uint8_t ExpandedKey[256];
-    size_t i, j;
-
-    memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, ctx->state.hs.b, AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
-
-    __m128i *longoutput, *expkey, *xmminput;
-    longoutput = (__m128i *) memory;
-    expkey = (__m128i *)ExpandedKey;
-    xmminput = (__m128i *)ctx->text;
-
-    for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE)
-    {
-        for(j = 0; j < 10; j++)
-        {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
-            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
-            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
-            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
-            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
-            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
-            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
-            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
-        }
-        _mm_store_si128(&(longoutput[(i >> 4)]), xmminput[0]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 1]), xmminput[1]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 2]), xmminput[2]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 3]), xmminput[3]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 4]), xmminput[4]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 5]), xmminput[5]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 6]), xmminput[6]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]);
-    }
-
-    for (i = 0; i < 2; i++)
-    {
-        ctx->a[i] = ((uint64_t *)ctx->state.k)[i] ^  ((uint64_t *)ctx->state.k)[i+4];
-        ctx->b[i] = ((uint64_t *)ctx->state.k)[i+2] ^  ((uint64_t *)ctx->state.k)[i+6];
-    }
-
-    __m128i a_x = _mm_load_si128((__m128i *) &memory[ctx->a[0] & 0x1FFFF0]);
-    __m128i b_x = _mm_load_si128((__m128i *) ctx->b);
-
-    uint64_t c[2] __attribute((aligned(16)));
-    uint64_t d[2] __attribute((aligned(16)));
-
-    for (i = 0; __builtin_expect(i < 0x80000, 1); i++) {
-        __m128i c_x = _mm_aesenc_si128(a_x, _mm_load_si128((__m128i *) ctx->a));
-        _mm_store_si128((__m128i *) c, c_x);
-
-        uint64_t *restrict d_ptr = (uint64_t *) &memory[c[0] & 0x1FFFF0];
-        _mm_store_si128((__m128i *) &memory[ctx->a[0] & 0x1FFFF0], _mm_xor_si128(b_x, c_x));
-        b_x = c_x;
-
-        d[0] = d_ptr[0];
-        d[1] = d_ptr[1];
-
-        {
-            unsigned __int128 res = (unsigned __int128) c[0] * d[0];
-
-            d_ptr[0] = ctx->a[0] += res >> 64;
-            d_ptr[1] = ctx->a[1] += (uint64_t) res;
-        }
-
-        ctx->a[0] ^= d[0];
-        ctx->a[1] ^= d[1];
-
-        a_x = _mm_load_si128((__m128i *) &memory[ctx->a[0] & 0x1FFFF0]);
-    }
-
-    memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, &ctx->state.hs.b[32], AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
-
-    for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE) {
-        xmminput[0] = _mm_xor_si128(longoutput[(i >> 4)], xmminput[0]);
-        xmminput[1] = _mm_xor_si128(longoutput[(i >> 4) + 1], xmminput[1]);
-        xmminput[2] = _mm_xor_si128(longoutput[(i >> 4) + 2], xmminput[2]);
-        xmminput[3] = _mm_xor_si128(longoutput[(i >> 4) + 3], xmminput[3]);
-        xmminput[4] = _mm_xor_si128(longoutput[(i >> 4) + 4], xmminput[4]);
-        xmminput[5] = _mm_xor_si128(longoutput[(i >> 4) + 5], xmminput[5]);
-        xmminput[6] = _mm_xor_si128(longoutput[(i >> 4) + 6], xmminput[6]);
-        xmminput[7] = _mm_xor_si128(longoutput[(i >> 4) + 7], xmminput[7]);
-
-        for(j = 0; j < 10; j++)
-        {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
-            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
-            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
-            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
-            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
-            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
-            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
-            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
-        }
-
-    }
-
-    memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
-    keccakf((uint64_t *) &ctx->state.hs, 24);
-    extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
-}
--- a/algo/cryptonight/cryptonight_av1_aesni32.c
+++ b/algo/cryptonight/cryptonight_av1_aesni32.c
@@ -1,239 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
- 
-#include <x86intrin.h>
-#include <string.h>
-
-#include "cryptonight.h"
-#include "crypto/c_keccak.h"
-
-
-static inline uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *product_hi) {
-  // multiplier   = ab = a * 2^32 + b
-  // multiplicand = cd = c * 2^32 + d
-  // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
-  uint64_t a = multiplier >> 32;
-  uint64_t b = multiplier & 0xFFFFFFFF;
-  uint64_t c = multiplicand >> 32;
-  uint64_t d = multiplicand & 0xFFFFFFFF;
-
-  //uint64_t ac = a * c;
-  uint64_t ad = a * d;
-  //uint64_t bc = b * c;
-  uint64_t bd = b * d;
-
-  uint64_t adbc = ad + (b * c);
-  uint64_t adbc_carry = adbc < ad ? 1 : 0;
-
-  // multiplier * multiplicand = product_hi * 2^64 + product_lo
-  uint64_t product_lo = bd + (adbc << 32);
-  uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
-  *product_hi = (a * c) + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
-
-  return product_lo;
-}
-
-
-static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
-{
-    __m128i tmp4;
-    *tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
-    tmp4 = _mm_slli_si128(*tmp1, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    *tmp1 = _mm_xor_si128(*tmp1, *tmp2);
-}
-
-static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
-{
-    __m128i tmp2, tmp4;
-
-    tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
-    tmp2 = _mm_shuffle_epi32(tmp4, 0xAA);
-    tmp4 = _mm_slli_si128(*tmp3, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp2);
-}
-
-// Special thanks to Intel for helping me
-// with ExpandAESKey256() and its subroutines
-static inline void ExpandAESKey256(char *keybuf)
-{
-    __m128i tmp1, tmp2, tmp3, *keys;
-
-    keys = (__m128i *)keybuf;
-
-    tmp1 = _mm_load_si128((__m128i *)keybuf);
-    tmp3 = _mm_load_si128((__m128i *)(keybuf+0x10));
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[2] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[3] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[4] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[5] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[6] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[7] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[8] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[9] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[10] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[11] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[12] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[13] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[14] = tmp1;
-}
-
-void cryptonight_av1_aesni32(void *restrict output, const void *restrict input, const char *restrict memory, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *)input, 76, (uint8_t *) &ctx->state.hs, 200);
-    uint8_t ExpandedKey[256];
-    size_t i, j;
-
-    memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, ctx->state.hs.b, AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
-
-    __m128i *longoutput, *expkey, *xmminput;
-    longoutput = (__m128i *) memory;
-    expkey = (__m128i *)ExpandedKey;
-    xmminput = (__m128i *)ctx->text;
-
-    for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE)
-    {
-        for(j = 0; j < 10; j++)
-        {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
-            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
-            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
-            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
-            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
-            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
-            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
-            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
-        }
-        _mm_store_si128(&(longoutput[(i >> 4)]), xmminput[0]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 1]), xmminput[1]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 2]), xmminput[2]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 3]), xmminput[3]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 4]), xmminput[4]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 5]), xmminput[5]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 6]), xmminput[6]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]);
-    }
-
-    for (i = 0; i < 2; i++)
-    {
-        ctx->a[i] = ((uint64_t *)ctx->state.k)[i] ^  ((uint64_t *)ctx->state.k)[i+4];
-        ctx->b[i] = ((uint64_t *)ctx->state.k)[i+2] ^  ((uint64_t *)ctx->state.k)[i+6];
-    }
-
-    __m128i a_x = _mm_load_si128((__m128i *) &memory[ctx->a[0] & 0x1FFFF0]);
-    __m128i b_x = _mm_load_si128((__m128i *) ctx->b);
-
-    uint64_t c[2] __attribute((aligned(16)));
-    uint64_t d[2] __attribute((aligned(16)));
-    uint64_t hi;
-
-    for (i = 0; __builtin_expect(i < 0x80000, 1); i++) {
-        __m128i c_x = _mm_aesenc_si128(a_x, _mm_load_si128((__m128i *) ctx->a));
-        _mm_store_si128((__m128i *) c, c_x);
-
-        uint64_t *restrict d_ptr = (uint64_t *) &memory[c[0] & 0x1FFFF0];
-        _mm_store_si128((__m128i *) &memory[ctx->a[0] & 0x1FFFF0], _mm_xor_si128(b_x, c_x));
-        b_x = c_x;
-
-        d[0] = d_ptr[0];
-        d[1] = d_ptr[1];
-
-        d_ptr[1] = ctx->a[1] += mul128(c[0], d[0], &hi);
-        d_ptr[0] = ctx->a[0] += hi;
-
-        ctx->a[0] ^= d[0];
-        ctx->a[1] ^= d[1];
-
-        a_x = _mm_load_si128((__m128i *) &memory[ctx->a[0] & 0x1FFFF0]);
-    }
-
-    memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, &ctx->state.hs.b[32], AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
-
-    for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE) {
-        xmminput[0] = _mm_xor_si128(longoutput[(i >> 4)], xmminput[0]);
-        xmminput[1] = _mm_xor_si128(longoutput[(i >> 4) + 1], xmminput[1]);
-        xmminput[2] = _mm_xor_si128(longoutput[(i >> 4) + 2], xmminput[2]);
-        xmminput[3] = _mm_xor_si128(longoutput[(i >> 4) + 3], xmminput[3]);
-        xmminput[4] = _mm_xor_si128(longoutput[(i >> 4) + 4], xmminput[4]);
-        xmminput[5] = _mm_xor_si128(longoutput[(i >> 4) + 5], xmminput[5]);
-        xmminput[6] = _mm_xor_si128(longoutput[(i >> 4) + 6], xmminput[6]);
-        xmminput[7] = _mm_xor_si128(longoutput[(i >> 4) + 7], xmminput[7]);
-
-        for(j = 0; j < 10; j++)
-        {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
-            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
-            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
-            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
-            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
-            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
-            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
-            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
-        }
-
-    }
-
-    memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
-    keccakf((uint64_t *) &ctx->state.hs, 24);
-    extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
-}
--- a/algo/cryptonight/cryptonight_av2.c
+++ b/algo/cryptonight/cryptonight_av2.c
@@ -0,0 +1,304 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_aesni.h"
+#include "cryptonight_monero.h"
+/* Double-hash CryptoNight variant 0 built on the _mm_aesenc_si128 intrinsic: hashes two size-byte inputs stored back to back at input. */
+/* ctx[0] and ctx[1] supply the state and scratchpad of each lane; lane 0's result goes to output, lane 1's to output + 32. */
+void cryptonight_av2_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200); /* second input starts right after the first */
+    /* l0/l1: per-lane scratchpad bytes; h0/h1: per-lane state viewed as 64-bit words. */
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+    /* NOTE(review): helper is defined elsewhere; presumably expands each state into its scratchpad - confirm. */
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+    /* a = state words 0..1 XOR words 4..5, kept as two 64-bit halves (al = low, ah = high). */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+    /* b = state words 2..3 XOR words 6..7, packed into one 128-bit register. */
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+    /* The first scratchpad index is the same value as the low half of a. */
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+    /* 0x80000 iterations; the 0x1FFFF0 mask yields a 16-byte-aligned offset below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+        /* One AES round on the loaded block, with a as the round key. */
+        cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+        /* Write b XOR c back to the slot that was just read (the cast drops the const qualifier of l0/l1). */
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
+        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
+        /* NOTE(review): EXTRACT64 is defined elsewhere; presumably the low 64 bits of c - confirm. */
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+        /* c becomes b for the next iteration. */
+        bx0 = cx0;
+        bx1 = cx1;
+        /* Lane 0, second half: multiply, add the product into a, store a, then XOR a with the old slot contents. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi); /* presumably a 64x64->128 multiply: low half returned, high half in hi */
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0; /* the next index comes from the low half of a */
+        /* Lane 1: the same sequence on the second scratchpad. */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+    /* NOTE(review): helper is defined elsewhere; presumably folds each scratchpad back into its state - confirm. */
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+    /* The low two bits of state byte 0 select the entry of extra_hashes that produces each final digest. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
+/* Double-hash CryptoNight variant 1 built on _mm_aesenc_si128: same flow as cryptonight_av2_v0 plus the variant 1 tweaks. */
+/* Inputs shorter than 43 bytes are not hashed: all 64 output bytes are zeroed and the function returns early. */
+void cryptonight_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    if (size < 43) {
+        memset(output, 0, 64); /* 64 bytes cover both lanes (output and output + 32) */
+        return;
+    }
+
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200); /* second input starts right after the first */
+    /* NOTE(review): macro is defined elsewhere; presumably declares tweak1_2_0 / tweak1_2_1 used in the loop - confirm. */
+    VARIANT1_INIT(0);
+    VARIANT1_INIT(1);
+    /* l0/l1: per-lane scratchpad bytes; h0/h1: per-lane state viewed as 64-bit words. */
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+    /* NOTE(review): helper is defined elsewhere; presumably expands each state into its scratchpad - confirm. */
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+    /* a = state words 0..1 XOR words 4..5, kept as two 64-bit halves (al = low, ah = high). */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+    /* b = state words 2..3 XOR words 6..7, packed into one 128-bit register. */
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+    /* The first scratchpad index is the same value as the low half of a. */
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+    /* 0x80000 iterations; the 0x1FFFF0 mask yields a 16-byte-aligned offset below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+        /* One AES round on the loaded block, with a as the round key. */
+        cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+        /* b XOR c is handed to cryptonight_monero_tweak together with the slot address, where v0 uses a plain store. */
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
+        cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
+        /* NOTE(review): EXTRACT64 is defined elsewhere; presumably the low 64 bits of c - confirm. */
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+        /* c becomes b for the next iteration. */
+        bx0 = cx0;
+        bx1 = cx1;
+        /* Lane 0, second half: multiply, add the product into a, store a, then XOR a with the old slot contents. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi); /* presumably a 64x64->128 multiply: low half returned, high half in hi */
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; /* only the stored copy is tweaked; ah0 itself is unchanged */
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0; /* the next index comes from the low half of a */
+        /* Lane 1: the same sequence on the second scratchpad. */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1 ^ tweak1_2_1; /* only the stored copy is tweaked; ah1 itself is unchanged */
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+    /* NOTE(review): helper is defined elsewhere; presumably folds each scratchpad back into its state - confirm. */
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+    /* The low two bits of state byte 0 select the entry of extra_hashes that produces each final digest. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
+/* Double-hash CryptoNight variant 2 built on _mm_aesenc_si128: hashes two size-byte inputs stored back to back at input. */
+/* Each lane keeps two b registers (current and previous) and runs the VARIANT2_* macro steps inside the main loop. */
+void cryptonight_av2_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200); /* second input starts right after the first */
+    /* l0/l1: per-lane scratchpad bytes; h0/h1: per-lane state viewed as 64-bit words. */
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+    /* NOTE(review): VARIANT2_* macros are defined elsewhere; presumably they set up per-lane variant 2 state - confirm. */
+    VARIANT2_INIT(0);
+    VARIANT2_INIT(1);
+    VARIANT2_SET_ROUNDING_MODE();
+    /* NOTE(review): helper is defined elsewhere; presumably expands each state into its scratchpad - confirm. */
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+    /* a = state words 0..1 XOR words 4..5, kept as two 64-bit halves (al = low, ah = high). */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+    /* bxN0 = state words 2..3 XOR 6..7; bxN1 = state words 8..9 XOR 10..11 (N = lane). */
+    __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+    __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+    __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
+    /* The first scratchpad index is the low half of a. */
+    uint64_t idx0 = al0;
+    uint64_t idx1 = al1;
+    /* 0x80000 iterations; the 0x1FFFF0 mask yields a 16-byte-aligned offset below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx0       = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1       = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+        /* a is packed once per iteration: it is the AES round key and is also passed to the shuffle macros. */
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+        const __m128i ax1 = _mm_set_epi64x(ah1, al1);
+
+        cx0 = _mm_aesenc_si128(cx0, ax0);
+        cx1 = _mm_aesenc_si128(cx1, ax1);
+        /* The shuffle macro runs before b XOR c is written back to the slot that was just read. */
+        VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
+
+        VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11);
+        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
+        /* The next index is the low 64 bits of c. */
+        idx0 = _mm_cvtsi128_si64(cx0);
+        idx1 = _mm_cvtsi128_si64(cx1);
+        /* Lane 0, second half: cl/ch are the two 64-bit words of the slot addressed by the new index. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        /* NOTE(review): VARIANT2_INTEGER_MATH may modify cl before the multiply - confirm against the macro definition. */
+        VARIANT2_INTEGER_MATH(0, cl, cx0);
+        lo = _umul128(idx0, cl, &hi); /* presumably a 64x64->128 multiply: low half returned, high half in hi */
+        VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0; /* the next index comes from the low half of a */
+        /* Lane 1: the same sequence on the second scratchpad. */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+
+        VARIANT2_INTEGER_MATH(1, cl, cx1);
+        lo = _umul128(idx1, cl, &hi);
+        VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
+
+        al1 ^= cl;
+        ah1 ^= ch;
+        idx1 = al1;
+        /* Rotate the b registers: the previous-b slot takes the current b ... */
+        bx01 = bx00;
+        bx11 = bx10;
+        /* ... and the current b takes this iteration's c. */
+        bx00 = cx0;
+        bx10 = cx1;
+    }
+    /* NOTE(review): helper is defined elsewhere; presumably folds each scratchpad back into its state - confirm. */
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+    /* The low two bits of state byte 0 select the entry of extra_hashes that produces each final digest. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
--- a/algo/cryptonight/cryptonight_av2_aesni_wolf.c
+++ b/algo/cryptonight/cryptonight_av2_aesni_wolf.c
@@ -1,237 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-
-#include "cryptonight.h"
-#include "crypto/c_keccak.h"
-
-
-static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
-{
-    __m128i tmp4;
-    *tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
-    tmp4 = _mm_slli_si128(*tmp1, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    *tmp1 = _mm_xor_si128(*tmp1, *tmp2);
-}
-
-static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
-{
-    __m128i tmp2, tmp4;
-
-    tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
-    tmp2 = _mm_shuffle_epi32(tmp4, 0xAA);
-    tmp4 = _mm_slli_si128(*tmp3, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp2);
-}
-
-// Special thanks to Intel for helping me
-// with ExpandAESKey256() and its subroutines
-static inline void ExpandAESKey256(char *keybuf)
-{
-    __m128i tmp1, tmp2, tmp3, *keys;
-
-    keys = (__m128i *)keybuf;
-
-    tmp1 = _mm_load_si128((__m128i *)keybuf);
-    tmp3 = _mm_load_si128((__m128i *)(keybuf+0x10));
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[2] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[3] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[4] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[5] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[6] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[7] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[8] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[9] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[10] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[11] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[12] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[13] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[14] = tmp1;
-}
-
-void cryptonight_av2_aesni_wolf(void *restrict output, const void *restrict input, const char *restrict memory, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *) input, 76, (uint8_t *) &ctx->state.hs, 200);
-    uint8_t ExpandedKey[256];
-    size_t i, j;
-
-    memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, ctx->state.hs.b, AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
-
-    __m128i *longoutput, *expkey, *xmminput;
-    longoutput = (__m128i *)memory;
-    expkey = (__m128i *)ExpandedKey;
-    xmminput = (__m128i *)ctx->text;
-
-    for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE)
-    {
-        for(j = 0; j < 10; j++)
-        {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
-            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
-            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
-            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
-            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
-            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
-            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
-            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
-        }
-        _mm_store_si128(&(longoutput[(i >> 4)]), xmminput[0]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 1]), xmminput[1]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 2]), xmminput[2]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 3]), xmminput[3]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 4]), xmminput[4]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 5]), xmminput[5]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 6]), xmminput[6]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]);
-    }
-
-    for (i = 0; i < 2; i++)
-    {
-        ctx->a[i] = ((uint64_t *)ctx->state.k)[i] ^  ((uint64_t *)ctx->state.k)[i+4];
-        ctx->b[i] = ((uint64_t *)ctx->state.k)[i+2] ^  ((uint64_t *)ctx->state.k)[i+6];
-    }
-
-    __m128i b_x = _mm_load_si128((__m128i *)ctx->b);
-    uint64_t a[2] __attribute((aligned(16))), b[2] __attribute((aligned(16)));
-    a[0] = ctx->a[0];
-    a[1] = ctx->a[1];
-
-    for(i = 0; __builtin_expect(i < 0x80000, 1); i++)
-    {
-        __m128i c_x = _mm_load_si128((__m128i *)&memory[a[0] & 0x1FFFF0]);
-        __m128i a_x = _mm_load_si128((__m128i *)a);
-        uint64_t c[2];
-        c_x = _mm_aesenc_si128(c_x, a_x);
-
-        _mm_store_si128((__m128i *)c, c_x);
-        __builtin_prefetch(&memory[c[0] & 0x1FFFF0], 0, 1);
-
-        b_x = _mm_xor_si128(b_x, c_x);
-        _mm_store_si128((__m128i *)&memory[a[0] & 0x1FFFF0], b_x);
-
-        uint64_t *nextblock = (uint64_t *)&memory[c[0] & 0x1FFFF0];
-        uint64_t b[2];
-        b[0] = nextblock[0];
-        b[1] = nextblock[1];
-
-        {
-          uint64_t hi, lo;
-         // hi,lo = 64bit x 64bit multiply of c[0] and b[0]
-
-          __asm__("mulq %3\n\t"
-              : "=d" (hi),
-            "=a" (lo)
-              : "%a" (c[0]),
-            "rm" (b[0])
-              : "cc" );
-
-          a[0] += hi;
-          a[1] += lo;
-        }
-
-        uint64_t *dst = (uint64_t *) &memory[c[0] & 0x1FFFF0];
-        dst[0] = a[0];
-        dst[1] = a[1];
-
-        a[0] ^= b[0];
-        a[1] ^= b[1];
-        b_x = c_x;
-        __builtin_prefetch(&memory[a[0] & 0x1FFFF0], 0, 3);
-    }
-
-    memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, &ctx->state.hs.b[32], AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
-
-    //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
-    //    aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]);
-
-    for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE)
-    {
-        xmminput[0] = _mm_xor_si128(longoutput[(i >> 4)], xmminput[0]);
-        xmminput[1] = _mm_xor_si128(longoutput[(i >> 4) + 1], xmminput[1]);
-        xmminput[2] = _mm_xor_si128(longoutput[(i >> 4) + 2], xmminput[2]);
-        xmminput[3] = _mm_xor_si128(longoutput[(i >> 4) + 3], xmminput[3]);
-        xmminput[4] = _mm_xor_si128(longoutput[(i >> 4) + 4], xmminput[4]);
-        xmminput[5] = _mm_xor_si128(longoutput[(i >> 4) + 5], xmminput[5]);
-        xmminput[6] = _mm_xor_si128(longoutput[(i >> 4) + 6], xmminput[6]);
-        xmminput[7] = _mm_xor_si128(longoutput[(i >> 4) + 7], xmminput[7]);
-
-        for(j = 0; j < 10; j++)
-        {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
-            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
-            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
-            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
-            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
-            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
-            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
-            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
-        }
-
-    }
-
-    memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
-    keccakf((uint64_t *) &ctx->state.hs, 24);
-    extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
-}
--- a/algo/cryptonight/cryptonight_av3.c
+++ b/algo/cryptonight/cryptonight_av3.c
@@ -0,0 +1,193 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_monero.h"
+#include "cryptonight_softaes.h"
+/* Single-hash CryptoNight variant 0 that calls soft_aesenc for the AES round; only ctx[0] is used. */
+/* Hashes size bytes at input and passes output to the selected extra_hashes function as the destination. */
+void cryptonight_av3_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+    /* NOTE(review): helper is defined elsewhere; presumably expands the state into the scratchpad - confirm. */
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+    /* l0: scratchpad bytes; h0: state viewed as 64-bit words. */
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    /* a = state words 0..1 XOR 4..5 (al = low, ah = high); b = state words 2..3 XOR 6..7. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    /* The first scratchpad index is the same value as the low half of a. */
+    uint64_t idx0 = h0[0] ^ h0[4];
+    /* 0x80000 iterations; the 0x1FFFF0 mask yields a 16-byte-aligned offset below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); /* presumably a software AES round keyed with a - confirm */
+        /* Write b XOR c back to the slot that was just read; c then becomes the next b. */
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+        idx0 = EXTRACT64(cx); /* EXTRACT64 is defined elsewhere; presumably the low 64 bits of c */
+        bx0 = cx;
+        /* Second half: multiply, add the product into a, store a, then XOR a with the old slot contents. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi); /* presumably a 64x64->128 multiply: low half returned, high half in hi */
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0; /* the next index comes from the low half of a */
+    }
+    /* NOTE(review): helper is defined elsewhere; presumably folds the scratchpad back into the state - confirm. */
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); /* low two bits of state byte 0 pick the final hash */
+}
+/* Single-hash CryptoNight variant 1 that calls soft_aesenc for the AES round; only ctx[0] is used. */
+/* Inputs shorter than 43 bytes are not hashed: the first 32 bytes of output are zeroed and the function returns early. */
+void cryptonight_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    if (size < 43) {
+        memset(output, 0, 32);
+        return;
+    }
+
+    keccak(input, size, ctx[0]->state, 200);
+    /* NOTE(review): macro is defined elsewhere; presumably declares tweak1_2_0 used in the loop - confirm. */
+    VARIANT1_INIT(0);
+    /* NOTE(review): helper is defined elsewhere; presumably expands the state into the scratchpad - confirm. */
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+    /* l0: scratchpad bytes; h0: state viewed as 64-bit words. */
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    /* a = state words 0..1 XOR 4..5 (al = low, ah = high); b = state words 2..3 XOR 6..7. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    /* The first scratchpad index is the same value as the low half of a. */
+    uint64_t idx0 = h0[0] ^ h0[4];
+    /* 0x80000 iterations; the 0x1FFFF0 mask yields a 16-byte-aligned offset below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); /* presumably a software AES round keyed with a - confirm */
+        /* b XOR c is handed to cryptonight_monero_tweak together with the slot address, where v0 uses a plain store. */
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+        /* NOTE(review): EXTRACT64 is defined elsewhere; presumably the low 64 bits of c. c then becomes the next b. */
+        idx0 = EXTRACT64(cx);
+        bx0 = cx;
+        /* Second half: multiply, add the product into a, store a, then XOR a with the old slot contents. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi); /* presumably a 64x64->128 multiply: low half returned, high half in hi */
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; /* only the stored copy is tweaked; ah0 itself is unchanged */
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0; /* the next index comes from the low half of a */
+    }
+    /* NOTE(review): helper is defined elsewhere; presumably folds the scratchpad back into the state - confirm. */
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); /* low two bits of state byte 0 pick the final hash */
+}
+/* Single-hash CryptoNight variant 2 that calls soft_aesenc for the AES round; only ctx[0] is used. */
+/* Keeps two b registers (bx0 current, bx1 previous) and runs the VARIANT2_* macro steps inside the main loop. */
+void cryptonight_av3_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+    /* NOTE(review): helper is defined elsewhere; presumably expands the state into the scratchpad - confirm. */
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+    /* l0: scratchpad bytes; h0: state viewed as 64-bit words. */
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    /* NOTE(review): VARIANT2_* macros are defined elsewhere; presumably they set up variant 2 state - confirm. */
+    VARIANT2_INIT(0);
+    VARIANT2_SET_ROUNDING_MODE();
+    /* a = state words 0..1 XOR 4..5; bx0 = words 2..3 XOR 6..7; bx1 = words 8..9 XOR 10..11. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+    /* The first scratchpad index is the low half of a. */
+    uint64_t idx0 = al0;
+    /* 0x80000 iterations; the 0x1FFFF0 mask yields a 16-byte-aligned offset below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx        = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+        /* a is the round key here and is also passed to the shuffle macros below. */
+        cx = soft_aesenc(cx, ax0);
+        /* The shuffle macro runs before b XOR c is written back to the slot that was just read. */
+        VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+        /* The next index is the low 64 bits of c. */
+        idx0 = _mm_cvtsi128_si64(cx);
+        /* Second half: cl/ch are the two 64-bit words of the slot addressed by the new index. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        /* NOTE(review): VARIANT2_INTEGER_MATH may modify cl before the multiply - confirm against the macro definition. */
+        VARIANT2_INTEGER_MATH(0, cl, cx);
+        lo = _umul128(idx0, cl, &hi); /* presumably a 64x64->128 multiply: low half returned, high half in hi */
+        VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0; /* the next index comes from the low half of a */
+        /* Rotate the b registers: bx1 takes the current b, bx0 takes this iteration's c. */
+        bx1 = bx0;
+        bx0 = cx;
+    }
+    /* NOTE(review): helper is defined elsewhere; presumably folds the scratchpad back into the state - confirm. */
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); /* low two bits of state byte 0 pick the final hash */
+}
--- a/algo/cryptonight/cryptonight_av3_aesni_bmi2.c
+++ b/algo/cryptonight/cryptonight_av3_aesni_bmi2.c
@@ -1,214 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-#include <string.h>
-
-#include "cryptonight.h"
-#include "crypto/c_keccak.h"
-
-
-static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
-{
-    __m128i tmp4;
-    *tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
-    tmp4 = _mm_slli_si128(*tmp1, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    *tmp1 = _mm_xor_si128(*tmp1, *tmp2);
-}
-
-static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
-{
-    __m128i tmp2, tmp4;
-
-    tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
-    tmp2 = _mm_shuffle_epi32(tmp4, 0xAA);
-    tmp4 = _mm_slli_si128(*tmp3, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    tmp4 = _mm_slli_si128(tmp4, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp2);
-}
-
-// Special thanks to Intel for helping me
-// with ExpandAESKey256() and its subroutines
-static inline void ExpandAESKey256(char *keybuf)
-{
-    __m128i tmp1, tmp2, tmp3, *keys;
-
-    keys = (__m128i *)keybuf;
-
-    tmp1 = _mm_load_si128((__m128i *)keybuf);
-    tmp3 = _mm_load_si128((__m128i *)(keybuf+0x10));
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[2] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[3] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[4] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[5] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[6] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[7] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[8] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[9] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[10] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[11] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[12] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[13] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[14] = tmp1;
-}
-
-void cryptonight_av3_aesni_bmi2(void *restrict output, const void *restrict input, const char *restrict memory, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *) input, 76, (uint8_t *) &ctx->state.hs, 200);
-    uint8_t ExpandedKey[256];
-    size_t i, j;
-
-    memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, ctx->state.hs.b, AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
-
-    __m128i *longoutput, *expkey, *xmminput;
-    longoutput = (__m128i *) memory;
-    expkey = (__m128i *)ExpandedKey;
-    xmminput = (__m128i *)ctx->text;
-
-    for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE)
-    {
-        for(j = 0; j < 10; j++)
-        {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
-            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
-            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
-            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
-            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
-            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
-            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
-            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
-        }
-        _mm_store_si128(&(longoutput[(i >> 4)]), xmminput[0]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 1]), xmminput[1]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 2]), xmminput[2]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 3]), xmminput[3]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 4]), xmminput[4]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 5]), xmminput[5]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 6]), xmminput[6]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]);
-    }
-
-    for (i = 0; i < 2; i++)
-    {
-        ctx->a[i] = ((uint64_t *)ctx->state.k)[i] ^  ((uint64_t *)ctx->state.k)[i+4];
-        ctx->b[i] = ((uint64_t *)ctx->state.k)[i+2] ^  ((uint64_t *)ctx->state.k)[i+6];
-    }
-
-    __m128i a_x = _mm_load_si128((__m128i *) &memory[ctx->a[0] & 0x1FFFF0]);
-    __m128i b_x = _mm_load_si128((__m128i *) ctx->b);
-
-    uint64_t c[2] __attribute((aligned(16)));
-    uint64_t d[2] __attribute((aligned(16)));
-    uint64_t hi;
-
-    for (i = 0; __builtin_expect(i < 0x80000, 1); i++) {
-        __m128i c_x = _mm_aesenc_si128(a_x, _mm_load_si128((__m128i *) ctx->a));
-        _mm_store_si128((__m128i *) c, c_x);
-
-        uint64_t *restrict d_ptr = (uint64_t *) &memory[c[0] & 0x1FFFF0];
-        _mm_store_si128((__m128i *) &memory[ctx->a[0] & 0x1FFFF0], _mm_xor_si128(b_x, c_x));
-        b_x = c_x;
-
-        d[0] = d_ptr[0];
-        d[1] = d_ptr[1];
-
-        d_ptr[1] = ctx->a[1] += _mulx_u64(c[0], d[0], &hi);
-        d_ptr[0] = ctx->a[0] += hi;
-
-        ctx->a[0] ^= d[0];
-        ctx->a[1] ^= d[1];
-
-        a_x = _mm_load_si128((__m128i *) &memory[ctx->a[0] & 0x1FFFF0]);
-
-    }
-
-    memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, &ctx->state.hs.b[32], AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
-
-    for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE) {
-        xmminput[0] = _mm_xor_si128(longoutput[(i >> 4)], xmminput[0]);
-        xmminput[1] = _mm_xor_si128(longoutput[(i >> 4) + 1], xmminput[1]);
-        xmminput[2] = _mm_xor_si128(longoutput[(i >> 4) + 2], xmminput[2]);
-        xmminput[3] = _mm_xor_si128(longoutput[(i >> 4) + 3], xmminput[3]);
-        xmminput[4] = _mm_xor_si128(longoutput[(i >> 4) + 4], xmminput[4]);
-        xmminput[5] = _mm_xor_si128(longoutput[(i >> 4) + 5], xmminput[5]);
-        xmminput[6] = _mm_xor_si128(longoutput[(i >> 4) + 6], xmminput[6]);
-        xmminput[7] = _mm_xor_si128(longoutput[(i >> 4) + 7], xmminput[7]);
-
-        for(j = 0; j < 10; j++)
-        {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
-            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
-            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
-            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
-            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
-            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
-            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
-            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
-        }
-
-    }
-
-    memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
-    keccakf((uint64_t *) &ctx->state.hs, 24);
-    extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
-}
--- a/algo/cryptonight/cryptonight_av4.c
+++ b/algo/cryptonight/cryptonight_av4.c
@@ -0,0 +1,304 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_monero.h"
+#include "cryptonight_softaes.h"
+
+/* Double hash: two inputs of `size` bytes packed back to back, one context per input, no variant tweaks in the main loop. */
+void cryptonight_av4_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);  /* lane 0 input occupies [0, size) */
+    keccak(input + size, size, ctx[1]->state, 200);  /* lane 1 input follows directly at [size, 2*size) */
+
+    const uint8_t* l0 = ctx[0]->memory;  /* per-lane scratchpads; written below through casts despite the const */
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;  /* 64-bit word views of the two hash states */
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);  /* helper defined elsewhere; presumably fills the scratchpad from the state */
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4];  /* a = state words 0..1 XOR words 4..5, kept as low (al) and high (ah) halves */
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);  /* b = state words 2..3 XOR words 6..7 */
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];  /* first scratchpad index; same expression as al0 */
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {  /* 0x80000 = 524288 rounds, both lanes advanced per round */
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);  /* mask keeps the offset 16-byte aligned and below 0x200000 */
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));  /* a = (ah:al) is passed as the key operand; soft_aesenc is defined elsewhere */
+        cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
+
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));  /* write b XOR c back to the slot just read */
+        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
+
+        idx0 = EXTRACT64(cx0);  /* next index is taken from c; EXTRACT64 is defined elsewhere, presumably the low 64 bits */
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;  /* this round's c becomes b for the next round */
+        bx1 = cx1;
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];  /* cl/ch: the two 64-bit halves of the block at the new index */
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);  /* lo is the return value, hi comes back through the pointer */
+
+        al0 += hi;  /* cross-over: hi is added to the low half of a, lo to the high half */
+        ah0 += lo;
+
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;  /* store the updated a into the scratchpad before mixing in cl/ch */
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        ah0 ^= ch;  /* fold the previously loaded block into a */
+        al0 ^= cl;
+        idx0 = al0;  /* low half of a is the index for the next round */
+
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];  /* same multiply/add/store/xor sequence for lane 1 */
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);  /* helper defined elsewhere; presumably folds the scratchpad back into the state */
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);  /* low two bits of state[0] select one of four extra_hashes entries */
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);  /* lane 1 result is written 32 bytes after lane 0's */
+}
+
+/* Double hash, variant 1: two inputs of `size` bytes packed back to back; scratchpad writes go through the variant-1 tweaks. */
+void cryptonight_av4_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    if (size < 43) {  /* NOTE(review): 43 presumably covers the input bytes VARIANT1_INIT reads - confirm against the macro */
+        memset(output, 0, 64);  /* too-short input: zero both 32-byte output slots and return without hashing */
+        return;
+    }
+
+    keccak(input,        size, ctx[0]->state, 200);  /* lane 0 input occupies [0, size) */
+    keccak(input + size, size, ctx[1]->state, 200);  /* lane 1 input follows directly at [size, 2*size) */
+
+    VARIANT1_INIT(0);  /* macro defined elsewhere; presumably declares tweak1_2_0 used in the loop below */
+    VARIANT1_INIT(1);  /* likewise presumably declares tweak1_2_1 */
+
+    const uint8_t* l0 = ctx[0]->memory;  /* per-lane scratchpads; written below through casts despite the const */
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;  /* 64-bit word views of the two hash states */
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);  /* helper defined elsewhere; presumably fills the scratchpad from the state */
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4];  /* a = state words 0..1 XOR words 4..5, kept as low (al) and high (ah) halves */
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);  /* b = state words 2..3 XOR words 6..7 */
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];  /* first scratchpad index; same expression as al0 */
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {  /* 0x80000 = 524288 rounds, both lanes advanced per round */
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);  /* mask keeps the offset 16-byte aligned and below 0x200000 */
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));  /* a = (ah:al) is passed as the key operand; soft_aesenc is defined elsewhere */
+        cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
+
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));  /* b XOR c is handed to the tweak helper together with the slot address instead of being stored directly */
+        cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
+
+        idx0 = EXTRACT64(cx0);  /* next index is taken from c; EXTRACT64 is defined elsewhere, presumably the low 64 bits */
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;  /* this round's c becomes b for the next round */
+        bx1 = cx1;
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];  /* cl/ch: the two 64-bit halves of the block at the new index */
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);  /* lo is the return value, hi comes back through the pointer */
+
+        al0 += hi;  /* cross-over: hi is added to the low half of a, lo to the high half */
+        ah0 += lo;
+
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0;  /* only the stored high half is XORed with the tweak; ah0 itself stays untweaked */
+
+        ah0 ^= ch;  /* fold the previously loaded block into a */
+        al0 ^= cl;
+        idx0 = al0;  /* low half of a is the index for the next round */
+
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];  /* same sequence for lane 1, using tweak1_2_1 */
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1 ^ tweak1_2_1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);  /* helper defined elsewhere; presumably folds the scratchpad back into the state */
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);  /* low two bits of state[0] select one of four extra_hashes entries */
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);  /* lane 1 result is written 32 bytes after lane 0's */
+}
+
+/* Double hash, variant 2: two inputs of `size` bytes packed back to back; each lane keeps two b values and runs the VARIANT2_* steps. */
+void cryptonight_av4_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);  /* lane 0 input occupies [0, size) */
+    keccak(input + size, size, ctx[1]->state, 200);  /* lane 1 input follows directly at [size, 2*size) */
+
+    const uint8_t* l0 = ctx[0]->memory;  /* per-lane scratchpads; written below through casts despite the const */
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;  /* 64-bit word views of the two hash states */
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    VARIANT2_INIT(0);  /* VARIANT2_* macros are defined elsewhere; the numeric argument presumably selects the lane - confirm before renaming any local */
+    VARIANT2_INIT(1);
+    VARIANT2_SET_ROUNDING_MODE();
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);  /* helper defined elsewhere; presumably fills the scratchpad from the state */
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4];  /* a = state words 0..1 XOR words 4..5, kept as low (al) and high (ah) halves */
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);  /* lane 0, current b: state words 2..3 XOR words 6..7 */
+    __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);  /* lane 0, older b: state words 8..9 XOR words 10..11 */
+    __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);  /* lane 1 equivalents */
+    __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
+
+    uint64_t idx0 = al0;  /* first scratchpad index is the low half of a */
+    uint64_t idx1 = al1;
+
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {  /* 0x80000 = 524288 rounds, both lanes advanced per round */
+        __m128i cx0       = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);  /* mask keeps the offset 16-byte aligned and below 0x200000 */
+        __m128i cx1       = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);  /* snapshot of a for this round; also passed to both shuffle macros */
+        const __m128i ax1 = _mm_set_epi64x(ah1, al1);
+
+        cx0 = soft_aesenc(cx0, ax0);  /* soft_aesenc is defined elsewhere; a is passed as the key operand */
+        cx1 = soft_aesenc(cx1, ax1);
+
+        VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01);  /* runs before the b XOR c store, while idx0 still addresses the block just read */
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));  /* write current b XOR c back to that block */
+
+        VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11);
+        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
+
+        idx0 = _mm_cvtsi128_si64(cx0);  /* next index is the low 64 bits of c */
+        idx1 = _mm_cvtsi128_si64(cx1);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];  /* cl/ch: the two 64-bit halves of the block at the new index */
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+
+        VARIANT2_INTEGER_MATH(0, cl, cx0);  /* invoked before the multiply; NOTE(review): presumably modifies cl - confirm in the macro definition */
+        lo = _umul128(idx0, cl, &hi);  /* lo is the return value, hi comes back through the pointer */
+        VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo);  /* second shuffle step also receives the product halves */
+
+        al0 += hi;  /* cross-over: hi is added to the low half of a, lo to the high half */
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;  /* store the updated a into the scratchpad before mixing in cl/ch */
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;  /* fold the loaded block into a */
+        ah0 ^= ch;
+        idx0 = al0;  /* low half of a is the index for the next round */
+
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];  /* same sequence for lane 1 */
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+
+        VARIANT2_INTEGER_MATH(1, cl, cx1);
+        lo = _umul128(idx1, cl, &hi);
+        VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
+
+        al1 ^= cl;
+        ah1 ^= ch;
+        idx1 = al1;
+
+        bx01 = bx00;  /* shift the b history: current b becomes the older b ... */
+        bx11 = bx10;
+
+        bx00 = cx0;  /* ... and this round's c becomes the current b */
+        bx10 = cx1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);  /* helper defined elsewhere; presumably folds the scratchpad back into the state */
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);  /* low two bits of state[0] select one of four extra_hashes entries */
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);  /* lane 1 result is written 32 bytes after lane 0's */
+}
--- a/algo/cryptonight/cryptonight_av4_legacy.c
+++ b/algo/cryptonight/cryptonight_av4_legacy.c
@@ -1,151 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-#include <string.h>
-
-#include "cryptonight.h"
-#include "compat.h"
-#include "crypto/c_keccak.h"
-#include "crypto/aesb.h"
-#include "crypto/oaes_lib.h"
-
-
-static inline uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *product_hi) {
-  // multiplier   = ab = a * 2^32 + b
-  // multiplicand = cd = c * 2^32 + d
-  // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
-  uint64_t a = multiplier >> 32;
-  uint64_t b = multiplier & 0xFFFFFFFF;
-  uint64_t c = multiplicand >> 32;
-  uint64_t d = multiplicand & 0xFFFFFFFF;
-
-  //uint64_t ac = a * c;
-  uint64_t ad = a * d;
-  //uint64_t bc = b * c;
-  uint64_t bd = b * d;
-
-  uint64_t adbc = ad + (b * c);
-  uint64_t adbc_carry = adbc < ad ? 1 : 0;
-
-  // multiplier * multiplicand = product_hi * 2^64 + product_lo
-  uint64_t product_lo = bd + (adbc << 32);
-  uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
-  *product_hi = (a * c) + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
-
-  return product_lo;
-}
-
-
-static inline void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
-    uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
-    hi += ((uint64_t*) c)[0];
-
-    ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
-    ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
-    ((uint64_t*) dst)[0] = hi;
-    ((uint64_t*) dst)[1] = lo;
-}
-
-
-static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
-    ((uint64_t*) a)[0] ^= ((uint64_t*) b)[0];
-    ((uint64_t*) a)[1] ^= ((uint64_t*) b)[1];
-}
-
-
-static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
-    ((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0];
-    ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
-}
-
-
-void cryptonight_av4_legacy(void *restrict output, const void *restrict input, const char *restrict memory, struct cryptonight_ctx *restrict ctx) {
-    oaes_ctx *aes_ctx = (oaes_ctx*) oaes_alloc();
-    size_t i, j;
-    keccak((const uint8_t *)input, 76, (uint8_t *) &ctx->state.hs, 200);
-    memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-
-    oaes_key_import_data(aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
-
-   for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
-        aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 0], aes_ctx->key->exp_data);
-        aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 1], aes_ctx->key->exp_data);
-        aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 2], aes_ctx->key->exp_data);
-        aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 3], aes_ctx->key->exp_data);
-        aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 4], aes_ctx->key->exp_data);
-        aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 5], aes_ctx->key->exp_data);
-        aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 6], aes_ctx->key->exp_data);
-        aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 7], aes_ctx->key->exp_data);
-        memcpy((void *) &memory[i], ctx->text, INIT_SIZE_BYTE);
-    }
-
-    xor_blocks_dst(&ctx->state.k[0],  &ctx->state.k[32], (uint8_t*) ctx->a);
-    xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], (uint8_t*) ctx->b);
-
-    for (i = 0; likely(i < ITER / 4); ++i) {
-        /* Dependency chain: address -> read value ------+
-         * written value <-+ hard function (AES or MUL) <+
-         * next address  <-+
-         */
-        /* Iteration 1 */
-        j = ctx->a[0] & 0x1FFFF0;
-        aesb_single_round((const uint8_t*) &memory[j], (uint8_t *) ctx->c, (const uint8_t *) ctx->a);
-        xor_blocks_dst((const uint8_t*) ctx->c, (const uint8_t*) ctx->b, (uint8_t*) &memory[j]);
-        /* Iteration 2 */
-        mul_sum_xor_dst((const uint8_t*) ctx->c, (uint8_t*) ctx->a, (uint8_t*) &memory[ctx->c[0] & 0x1FFFF0]);
-        /* Iteration 3 */
-        j = ctx->a[0] & 0x1FFFF0;
-        aesb_single_round(&memory[j], (uint8_t *) ctx->b, (uint8_t *) ctx->a);
-        xor_blocks_dst((const uint8_t*) ctx->b, (const uint8_t*) ctx->c, (uint8_t*) &memory[j]);
-        /* Iteration 4 */
-        mul_sum_xor_dst((const uint8_t*) ctx->b, (uint8_t*) ctx->a, (uint8_t*) &memory[ctx->b[0] & 0x1FFFF0]);
-    }
-
-    memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-    oaes_key_import_data(aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
-
-    for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
-        xor_blocks(&ctx->text[0 * AES_BLOCK_SIZE], &memory[i + 0 * AES_BLOCK_SIZE]);
-        aesb_pseudo_round_mut(&ctx->text[0 * AES_BLOCK_SIZE], aes_ctx->key->exp_data);
-        xor_blocks(&ctx->text[1 * AES_BLOCK_SIZE], &memory[i + 1 * AES_BLOCK_SIZE]);
-        aesb_pseudo_round_mut(&ctx->text[1 * AES_BLOCK_SIZE], aes_ctx->key->exp_data);
-        xor_blocks(&ctx->text[2 * AES_BLOCK_SIZE], &memory[i + 2 * AES_BLOCK_SIZE]);
-        aesb_pseudo_round_mut(&ctx->text[2 * AES_BLOCK_SIZE], aes_ctx->key->exp_data);
-        xor_blocks(&ctx->text[3 * AES_BLOCK_SIZE], &memory[i + 3 * AES_BLOCK_SIZE]);
-        aesb_pseudo_round_mut(&ctx->text[3 * AES_BLOCK_SIZE], aes_ctx->key->exp_data);
-        xor_blocks(&ctx->text[4 * AES_BLOCK_SIZE], &memory[i + 4 * AES_BLOCK_SIZE]);
-        aesb_pseudo_round_mut(&ctx->text[4 * AES_BLOCK_SIZE], aes_ctx->key->exp_data);
-        xor_blocks(&ctx->text[5 * AES_BLOCK_SIZE], &memory[i + 5 * AES_BLOCK_SIZE]);
-        aesb_pseudo_round_mut(&ctx->text[5 * AES_BLOCK_SIZE], aes_ctx->key->exp_data);
-        xor_blocks(&ctx->text[6 * AES_BLOCK_SIZE], &memory[i + 6 * AES_BLOCK_SIZE]);
-        aesb_pseudo_round_mut(&ctx->text[6 * AES_BLOCK_SIZE], aes_ctx->key->exp_data);
-        xor_blocks(&ctx->text[7 * AES_BLOCK_SIZE], &memory[i + 7 * AES_BLOCK_SIZE]);
-        aesb_pseudo_round_mut(&ctx->text[7 * AES_BLOCK_SIZE], aes_ctx->key->exp_data);
-    }
-
-    memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
-    keccakf((uint64_t *) &ctx->state.hs, 24);
-    extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
-    oaes_free((OAES_CTX **) &aes_ctx);
-}
--- a/algo/cryptonight/cryptonight_av5_aesni_experimental.c
+++ b/algo/cryptonight/cryptonight_av5_aesni_experimental.c
@@ -1,248 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-#include <string.h>
-
-#include "cryptonight.h"
-#include "crypto/c_keccak.h"
-
-
-static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
-{
-    __m128i tmp4;
-    *tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
-    tmp4  = _mm_slli_si128(*tmp1, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    tmp4  = _mm_slli_si128(tmp4, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    tmp4  = _mm_slli_si128(tmp4, 0x04);
-    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
-    *tmp1 = _mm_xor_si128(*tmp1, *tmp2);
-}
-
-static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
-{
-    __m128i tmp2, tmp4;
-
-    tmp4  = _mm_aeskeygenassist_si128(*tmp1, 0x00);
-    tmp2  = _mm_shuffle_epi32(tmp4, 0xAA);
-    tmp4  = _mm_slli_si128(*tmp3, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    tmp4  = _mm_slli_si128(tmp4, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    tmp4  = _mm_slli_si128(tmp4, 0x04);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
-    *tmp3 = _mm_xor_si128(*tmp3, tmp2);
-}
-
-// Special thanks to Intel for helping me
-// with ExpandAESKey256() and its subroutines
-static inline void ExpandAESKey256(char *keybuf)
-{
-    __m128i tmp1, tmp2, tmp3, *keys;
-
-    keys = (__m128i *)keybuf;
-
-    tmp1 = _mm_load_si128((__m128i *)keybuf);
-    tmp3 = _mm_load_si128((__m128i *)(keybuf+0x10));
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[2] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[3] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[4] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[5] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[6] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[7] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[8] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[9] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[10] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[11] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[12] = tmp1;
-    ExpandAESKey256_sub2(&tmp1, &tmp3);
-    keys[13] = tmp3;
-
-    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
-    ExpandAESKey256_sub1(&tmp1, &tmp2);
-    keys[14] = tmp1;
-}
-
-void cryptonight_av5_aesni_experimental(void *restrict output, const void *restrict input, const char *restrict memory, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *)input, 76, (uint8_t *) &ctx->state.hs, 200);
-    uint8_t ExpandedKey[256];
-    size_t i, j;
-
-    memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, ctx->state.hs.b, AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
-
-    __m128i *longoutput, *expkey, *xmminput;
-    longoutput = (__m128i *) memory;
-    expkey     = (__m128i *) ExpandedKey;
-    xmminput   = (__m128i *)ctx->text;
-
-    // prefetch expkey, all of xmminput and enough longoutput for 4 loops
-    _mm_prefetch(xmminput,     _MM_HINT_T0 );
-    _mm_prefetch(xmminput + 4, _MM_HINT_T0 );
-
-    for (i = 0; i < 64; i += 16) {
-       _mm_prefetch(longoutput + i,      _MM_HINT_T0);
-       _mm_prefetch(longoutput + i +  4, _MM_HINT_T0);
-       _mm_prefetch(longoutput + i +  8, _MM_HINT_T0);
-       _mm_prefetch(longoutput + i + 12, _MM_HINT_T0);
-    }
-
-    _mm_prefetch(expkey,     _MM_HINT_T0);
-    _mm_prefetch(expkey + 4, _MM_HINT_T0);
-    _mm_prefetch(expkey + 8, _MM_HINT_T0);
-
-    for (i = 0; __builtin_expect(i < MEMORY_M128I, 1); i += INIT_SIZE_M128I) {
-        __builtin_prefetch(longoutput + i + 64, 1, 0);
-        __builtin_prefetch(longoutput + i + 68, 1, 0);
-
-        for(j = 0; j < 10; j++) {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
-            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
-            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
-            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
-            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
-            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
-            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
-            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
-        }
-
-    _mm_store_si128(&(longoutput[i     ]), xmminput[0]);
-    _mm_store_si128(&(longoutput[i + 1 ]), xmminput[1]);
-    _mm_store_si128(&(longoutput[i + 2 ]), xmminput[2]);
-    _mm_store_si128(&(longoutput[i + 3 ]), xmminput[3]);
-    _mm_store_si128(&(longoutput[i + 4 ]), xmminput[4]);
-    _mm_store_si128(&(longoutput[i + 5 ]), xmminput[5]);
-    _mm_store_si128(&(longoutput[i + 6 ]), xmminput[6]);
-    _mm_store_si128(&(longoutput[i + 7 ]), xmminput[7]);
-    }
-
-    ctx->a[0] = ((uint64_t *) ctx->state.k)[0] ^ ((uint64_t *) ctx->state.k)[4];
-    ctx->b[0] = ((uint64_t *) ctx->state.k)[2] ^ ((uint64_t *) ctx->state.k)[6];
-    ctx->a[1] = ((uint64_t *) ctx->state.k)[1] ^ ((uint64_t *) ctx->state.k)[5];
-    ctx->b[1] = ((uint64_t *) ctx->state.k)[3] ^ ((uint64_t *) ctx->state.k)[7];
-
-    __m128i a_x = _mm_load_si128((__m128i *) &memory[ctx->a[0] & 0x1FFFF0]);
-    __m128i b_x = _mm_load_si128((__m128i *) ctx->b);
-
-    uint64_t c[2] __attribute((aligned(16)));
-    uint64_t d[2] __attribute((aligned(16)));
-
-    for (i = 0; __builtin_expect(i < 0x80000, 1); i++) {
-        __m128i c_x = _mm_aesenc_si128(a_x, _mm_load_si128((__m128i *) ctx->a));
-        _mm_store_si128((__m128i *) c, c_x);
-
-        uint64_t *restrict d_ptr = (uint64_t *) &memory[c[0] & 0x1FFFF0];
-        _mm_store_si128((__m128i *) &memory[ctx->a[0] & 0x1FFFF0], _mm_xor_si128(b_x, c_x));
-        b_x = c_x;
-
-        d[0] = d_ptr[0];
-        d[1] = d_ptr[1];
-
-        {
-            unsigned __int128 res = (unsigned __int128) c[0] * d[0];
-
-            d_ptr[0] = ctx->a[0] += res >> 64;
-            d_ptr[1] = ctx->a[1] += (uint64_t) res;
-        }
-
-        ctx->a[0] ^= d[0];
-        ctx->a[1] ^= d[1];
-
-        a_x = _mm_load_si128((__m128i *) &memory[ctx->a[0] & 0x1FFFF0]);
-    }
-
-    memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, &ctx->state.hs.b[32], AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
-
-    _mm_prefetch(xmminput,     _MM_HINT_T0 );
-    _mm_prefetch(xmminput + 4, _MM_HINT_T0 );
-
-    for (i = 0; i < 64; i += 16) {
-       _mm_prefetch(longoutput + i,      _MM_HINT_T0);
-       _mm_prefetch(longoutput + i +  4, _MM_HINT_T0);
-       _mm_prefetch(longoutput + i +  8, _MM_HINT_T0);
-       _mm_prefetch(longoutput + i + 12, _MM_HINT_T0);
-    }
-
-    _mm_prefetch(expkey,     _MM_HINT_T0);
-    _mm_prefetch(expkey + 4, _MM_HINT_T0);
-    _mm_prefetch(expkey + 8, _MM_HINT_T0);
-
-    for (i = 0; __builtin_expect(i < MEMORY_M128I, 1); i += INIT_SIZE_M128I) {
-        _mm_prefetch(longoutput + i + 64, _MM_HINT_T0);
-        _mm_prefetch(longoutput + i + 68, _MM_HINT_T0);
-
-        xmminput[0] = _mm_xor_si128(longoutput[i    ], xmminput[0]);
-        xmminput[1] = _mm_xor_si128(longoutput[i + 1], xmminput[1]);
-        xmminput[2] = _mm_xor_si128(longoutput[i + 2], xmminput[2]);
-        xmminput[3] = _mm_xor_si128(longoutput[i + 3], xmminput[3]);
-        xmminput[4] = _mm_xor_si128(longoutput[i + 4], xmminput[4]);
-        xmminput[5] = _mm_xor_si128(longoutput[i + 5], xmminput[5]);
-        xmminput[6] = _mm_xor_si128(longoutput[i + 6], xmminput[6]);
-        xmminput[7] = _mm_xor_si128(longoutput[i + 7], xmminput[7]);
-
-        for(j = 0; j < 10; j++) {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
-            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
-            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
-            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
-            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
-            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
-            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
-            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
-        }
-
-    }
-
-    memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
-    keccakf((uint64_t *) &ctx->state.hs, 24);
-    extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
-}
--- a/algo/cryptonight/cryptonight_common.c
+++ b/algo/cryptonight/cryptonight_common.c
@@ -1,142 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#include <stdlib.h>
-
-#ifndef BUILD_TEST
-# include "xmrig.h"
-#endif
-
-#include "crypto/c_groestl.h"
-#include "crypto/c_blake256.h"
-#include "crypto/c_jh.h"
-#include "crypto/c_skein.h"
-#include "cryptonight.h"
-#include "options.h"
-
-
-#if defined(__x86_64__)
-  void cryptonight_av1_aesni(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
-  void cryptonight_av2_aesni_wolf(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
-  void cryptonight_av3_aesni_bmi2(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
-  void cryptonight_av5_aesni_experimental(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
-#elif defined(__i386__)
-  void cryptonight_av1_aesni32(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
-#endif
-
-void cryptonight_av4_legacy(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
-
-void (*cryptonight_hash_ctx)(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx) = NULL;
-
-
-void cryptonight_init(int variant)
-{
-    switch (variant) {
-        #if defined(__x86_64__)
-        case XMR_VARIANT_AESNI:
-            cryptonight_hash_ctx = cryptonight_av1_aesni;
-            break;
-
-        case XMR_VARIANT_AESNI_WOLF:
-            cryptonight_hash_ctx = cryptonight_av2_aesni_wolf;
-            break;
-
-        case XMR_VARIANT_AESNI_BMI2:
-            cryptonight_hash_ctx = cryptonight_av3_aesni_bmi2;
-            break;
-
-        case XMR_VARIANT_EXPERIMENTAL:
-            cryptonight_hash_ctx = cryptonight_av5_aesni_experimental;
-            break;
-        #elif defined(__i386__)
-        case XMR_VARIANT_AESNI:
-            cryptonight_hash_ctx = cryptonight_av1_aesni32;
-            break;
-        #endif
-
-        case XMR_VARIANT_LEGACY:
-             cryptonight_hash_ctx = cryptonight_av4_legacy;
-             break;
-
-        default:
-            break;
-    }
-
-}
-
-
-static inline void do_blake_hash(const void* input, size_t len, char* output) {
-    blake256_hash((uint8_t*)output, input, len);
-}
-
-
-static inline void do_groestl_hash(const void* input, size_t len, char* output) {
-    groestl(input, len * 8, (uint8_t*)output);
-}
-
-
-static inline void do_jh_hash(const void* input, size_t len, char* output) {
-    jh_hash(32 * 8, input, 8 * len, (uint8_t*)output);
-}
-
-
-static inline void do_skein_hash(const void* input, size_t len, char* output) {
-    skein_hash(8 * 32, input, 8 * len, (uint8_t*)output);
-}
-
-
-void (* const extra_hashes[4])(const void *, size_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
-
-
-void cryptonight_hash(void* output, const void* input, size_t len) {
-    uint8_t *memory __attribute((aligned(16))) = (uint8_t *) malloc(MEMORY);
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-
-    cryptonight_hash_ctx(output, input, memory, ctx);
-
-    free(memory);
-    free(ctx);
-}
-
-
-#ifndef BUILD_TEST
-int scanhash_cryptonight(int thr_id, uint32_t *hash, uint32_t *restrict pdata, const uint32_t *restrict ptarget, uint32_t max_nonce, unsigned long *restrict hashes_done, const char *restrict memory, struct cryptonight_ctx *persistentctx) {
-    uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
-    uint32_t n = *nonceptr - 1;
-    const uint32_t first_nonce = n + 1;
-
-    do {
-        *nonceptr = ++n;
-        cryptonight_hash_ctx(hash, pdata, memory, persistentctx);
-
-        if (unlikely(hash[7] < ptarget[7])) {
-            *hashes_done = n - first_nonce + 1;
-            return true;
-        }
-    } while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
-
-    *hashes_done = n - first_nonce + 1;
-    return 0;
-}
-#endif
--- a/algo/cryptonight/cryptonight_monero.h
+++ b/algo/cryptonight/cryptonight_monero.h
@@ -0,0 +1,150 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_MONERO_H
+#define XMRIG_CRYPTONIGHT_MONERO_H
+
+
+#include <fenv.h>
+#include <math.h>
+#include <stdint.h>
+#include <x86intrin.h>
+
+/* Square-root helper used by VARIANT2_INTEGER_MATH: takes sqrt of the double whose bits are (n0 >> 12) + (1023 << 52). */
+static inline __m128i int_sqrt_v2(const uint64_t n0)
+{
+    __m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52)));
+    x = _mm_sqrt_sd(_mm_setzero_pd(), x);
+    uint64_t r = (uint64_t)(_mm_cvtsi128_si64(_mm_castpd_si128(x)));
+    /* From here the result is handled as a raw 64-bit pattern: s drops 20 low bits, r only 19. */
+    const uint64_t s = r >> 20;
+    r >>= 19;
+    /* Correction: r is incremented when x2 < n0 (borrow fed into an add-with-carry, or a plain compare). */
+    uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
+#   if (defined(_MSC_VER) || __GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ > 1)) && (defined(__x86_64__) || defined(_M_AMD64))
+    _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r);
+#   else
+    if (x2 < n0) ++r;
+#   endif
+
+    return _mm_cvtsi64_si128(r);
+}
+
+/* Declares tweak1_2_<part>: the 8 bytes at input + 35 + part * size XORed with 64-bit word 24 of ctx[part]->state. */
+#   define VARIANT1_INIT(part) \
+    uint64_t tweak1_2_##part = (*(const uint64_t*)(input + 35 + part * size) ^ \
+                               *((const uint64_t*)(ctx[part]->state) + 24));
+/* Declares division_result_xmm_<part> and sqrt_result_xmm_<part>, seeded from h<part>[12] and h<part>[13]. */
+#   define VARIANT2_INIT(part) \
+    __m128i division_result_xmm_##part = _mm_cvtsi64_si128(h##part[12]); \
+    __m128i sqrt_result_xmm_##part = _mm_cvtsi64_si128(h##part[13]);
+/* Switches the floating-point rounding mode to round-down (_control87 under MSVC, fesetround elsewhere). */
+#ifdef _MSC_VER
+#   define VARIANT2_SET_ROUNDING_MODE() { _control87(RC_DOWN, MCW_RC); }
+#else
+#   define VARIANT2_SET_ROUNDING_MODE() { fesetround(FE_DOWNWARD); }
+#endif
+/* XORs the stored division/sqrt results into cl, then computes new ones from the two 64-bit halves of cx. */
+#   define VARIANT2_INTEGER_MATH(part, cl, cx) \
+    { \
+        const uint64_t sqrt_result = (uint64_t)(_mm_cvtsi128_si64(sqrt_result_xmm_##part)); \
+        const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \
+        cl ^= (uint64_t)(_mm_cvtsi128_si64(division_result_xmm_##part)) ^ (sqrt_result << 32); \
+        const uint32_t d = (uint32_t)(cx_0 + (sqrt_result << 1)) | 0x80000001UL; /* bits 31 and 0 forced on: d is odd and non-zero */ \
+        const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
+        const uint64_t division_result = (uint32_t)(cx_1 / d) + ((cx_1 % d) << 32); /* quotient low, remainder high */ \
+        division_result_xmm_##part = _mm_cvtsi64_si128((int64_t)(division_result)); \
+        sqrt_result_xmm_##part = int_sqrt_v2(cx_0 + division_result); \
+    }
+/* Rotates the 16-byte chunks at offset^0x10/0x20/0x30: slots receive chunk3 + _b1, chunk1 + _b, chunk2 + _a (64-bit adds). */
+#   define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \
+    { \
+        const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
+        const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
+        const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
+    }
+/* Same chunk rotation as VARIANT2_SHUFFLE; additionally XORs the three chunks (as loaded, before the stores) into _c. */
+#   define VARIANT4_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \
+    { \
+        const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
+        const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
+        const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
+        _c = _mm_xor_si128(_mm_xor_si128(_c, chunk3), _mm_xor_si128(chunk1, chunk2)); \
+    }
+/* Chunk rotation that first XORs chunk1 with (hi in the low qword, lo in the high qword) and XORs the 0x20 chunk into hi/lo. */
+#   define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \
+    { \
+        const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \
+        const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
+        hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \
+        lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1]; \
+        const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
+    }
+
+/* NOINLINE: compiler-specific "never inline" attribute; empty on unknown compilers, and only defined if not already set. */
+#ifndef NOINLINE
+#ifdef __GNUC__
+#define NOINLINE __attribute__ ((noinline))
+#elif defined(_MSC_VER)
+#define NOINLINE __declspec(noinline)
+#else
+#define NOINLINE
+#endif
+#endif
+
+#include "variant4_random_math.h"
+/* Declares r<part>[9] and code<part>[256]; fills r[0..3] from the halves of h<part>[12..13] and builds the program for ctx[part]->height. */
+#define VARIANT4_RANDOM_MATH_INIT(part) \
+  uint32_t r##part[9]; \
+  struct V4_Instruction code##part[256]; \
+  { \
+    r##part[0] = (uint32_t)(h##part[12]); \
+    r##part[1] = (uint32_t)(h##part[12] >> 32); \
+    r##part[2] = (uint32_t)(h##part[13]); \
+    r##part[3] = (uint32_t)(h##part[13] >> 32); \
+  } \
+  v4_random_math_init(code##part, ctx[part]->height);
+/* XORs (r0 + r1) | ((r2 + r3) << 32) into cl, loads r[4..8] from al, ah, bx0 and bx1, then calls v4_random_math on r. */
+#define VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1) \
+  { \
+    cl ^= (r##part[0] + r##part[1]) | ((uint64_t)(r##part[2] + r##part[3]) << 32); \
+    r##part[4] = (uint32_t)(al); \
+    r##part[5] = (uint32_t)(ah); \
+    r##part[6] = (uint32_t)(_mm_cvtsi128_si32(bx0)); \
+    r##part[7] = (uint32_t)(_mm_cvtsi128_si32(bx1)); \
+    r##part[8] = (uint32_t)(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \
+    v4_random_math(code##part, r##part); \
+  }
+
+#endif /* XMRIG_CRYPTONIGHT_MONERO_H */
--- a/algo/cryptonight/cryptonight_r_av1.c
+++ b/algo/cryptonight/cryptonight_r_av1.c
@@ -0,0 +1,143 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_aesni.h"
+#include "cryptonight_monero.h"
+
+/* CryptoNight-R, one hash per call, AES-NI round: digests `size` bytes of `input` into `output` using ctx[0]. */
+void cryptonight_r_av1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    /* VARIANT2_INIT(0) is deliberately not used: nothing below reads its division/sqrt locals. */
+    VARIANT2_SET_ROUNDING_MODE();
+    VARIANT4_RANDOM_MATH_INIT(0);
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+
+    uint64_t idx0 = al0;
+    /* 0x80000 rounds; the 0x1FFFF0 mask yields a 16-byte aligned offset below 2 MiB. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx        = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+
+        cx = _mm_aesenc_si128(cx, ax0);
+        /* First shuffle of the neighbouring chunks; VARIANT4_SHUFFLE also XORs them into cx. */
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+
+        idx0 = _mm_cvtsi128_si64(cx);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        /* Random math mixes r0 into cl and calls v4_random_math; r0[0..3] are then XORed into al0/ah0. */
+        VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
+        al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
+        ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
+
+        lo = _umul128(idx0, cl, &hi);
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+        /* Shift the b history: bx1 takes the previous bx0, bx0 takes this round's cx. */
+        bx1 = bx0;
+        bx0 = cx;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+#ifndef XMRIG_NO_ASM
+void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
+
+
+/*
+ * Shared body of the single-hash CryptoNight-R variants that execute generated machine code.
+ * `assembly` is passed through to v4_compile_code() and is the only thing the public wrappers vary.
+ */
+static void cryptonight_r_av1_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx, enum Assembly assembly)
+{
+    /* The program is generated from the height, so it is rebuilt only when the height changes. */
+    if (ctx[0]->generated_code_height != ctx[0]->height) {
+        struct V4_Instruction code[256];
+        const int code_size = v4_random_math_init(code, ctx[0]->height);
+
+        v4_compile_code(code, code_size, (void*)(ctx[0]->generated_code), assembly);
+        ctx[0]->generated_code_height = ctx[0]->height;
+    }
+
+    keccak(input, size, ctx[0]->state, 200);
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    /* Execute the code stored in the context; it receives the context itself as its argument. */
+    ctx[0]->generated_code(ctx[0]);
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    keccakf((uint64_t*) ctx[0]->state, 24);
+    /* The low two bits of the first state byte index the extra_hashes table. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+/* Single-hash CryptoNight-R running code generated with ASM_INTEL. */
+void cryptonight_r_av1_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    cryptonight_r_av1_asm(input, size, output, ctx, ASM_INTEL);
+}
+
+
+/* Single-hash CryptoNight-R running code generated with ASM_BULLDOZER. */
+void cryptonight_r_av1_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    cryptonight_r_av1_asm(input, size, output, ctx, ASM_BULLDOZER);
+}
+#endif
--- a/algo/cryptonight/cryptonight_r_av2.c
+++ b/algo/cryptonight/cryptonight_r_av2.c
@@ -0,0 +1,202 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_aesni.h"
+#include "cryptonight_monero.h"
+
+/* CryptoNight-R, two hashes per call, AES-NI round: inputs at input and input + size, digests at output and output + 32. */
+void cryptonight_r_av2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    /* VARIANT2_INIT(0) / VARIANT2_INIT(1) are deliberately not used: */
+    /* nothing below reads the division/sqrt locals they would declare. */
+    VARIANT2_SET_ROUNDING_MODE();
+    VARIANT4_RANDOM_MATH_INIT(0);
+    VARIANT4_RANDOM_MATH_INIT(1);
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+    __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+    __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
+
+    uint64_t idx0 = al0;
+    uint64_t idx1 = al1;
+    /* 0x80000 rounds over both scratchpads; the 0x1FFFF0 mask yields a 16-byte aligned offset below 2 MiB. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx0       = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1       = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+        const __m128i ax1 = _mm_set_epi64x(ah1, al1);
+
+        cx0 = _mm_aesenc_si128(cx0, ax0);
+        cx1 = _mm_aesenc_si128(cx1, ax1);
+
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
+
+        VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
+        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
+
+        idx0 = _mm_cvtsi128_si64(cx0);
+        idx1 = _mm_cvtsi128_si64(cx1);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        /* Second half of the round for hash 0: random math, multiply, shuffle, write-back. */
+        VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
+        al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
+        ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
+
+        lo = _umul128(idx0, cl, &hi);
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+        /* Same steps for hash 1; hi, lo, cl and ch are reused. */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+
+        VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
+        al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32);
+        ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32);
+
+        lo = _umul128(idx1, cl, &hi);
+        VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
+
+        al1 ^= cl;
+        ah1 ^= ch;
+        idx1 = al1;
+        /* Shift the b history of both hashes: bx?1 takes the previous bx?0, bx?0 takes this round's cx. */
+        bx01 = bx00;
+        bx11 = bx10;
+
+        bx00 = cx0;
+        bx10 = cx1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
+
+
+#ifndef XMRIG_NO_ASM
+void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
+
+
+/*
+ * Shared body of the double-hash CryptoNight-R variants that execute generated machine code.
+ * `assembly` is passed through to v4_compile_code_double() and is the only thing the wrappers vary.
+ */
+static void cryptonight_r_av2_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx, enum Assembly assembly)
+{
+    /* Only ctx[0] holds the generated code and the height it was built for; rebuild on height change. */
+    if (ctx[0]->generated_code_height != ctx[0]->height) {
+        struct V4_Instruction code[256];
+        const int code_size = v4_random_math_init(code, ctx[0]->height);
+
+        v4_compile_code_double(code, code_size, (void*)(ctx[0]->generated_code_double), assembly);
+        ctx[0]->generated_code_height = ctx[0]->height;
+    }
+
+    /* The two inputs of `size` bytes each are stored back to back. */
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+    cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory);
+
+    /* A single call receives both contexts. */
+    ctx[0]->generated_code_double(ctx[0], ctx[1]);
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state);
+
+    keccakf((uint64_t *) ctx[0]->state, 24);
+    keccakf((uint64_t *) ctx[1]->state, 24);
+
+    /* Digests are written to output and output + 32. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
+
+
+/*
+ * Double-hash CryptoNight-R running code generated with ASM_INTEL.
+ */
+void cryptonight_r_av2_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    cryptonight_r_av2_asm(input, size, output, ctx, ASM_INTEL);
+}
+
+
+/*
+ * Double-hash CryptoNight-R running code generated with ASM_BULLDOZER.
+ */
+void cryptonight_r_av2_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    cryptonight_r_av2_asm(input, size, output, ctx, ASM_BULLDOZER);
+}
+#endif
--- a/algo/cryptonight/cryptonight_r_av3.c
+++ b/algo/cryptonight/cryptonight_r_av3.c
@@ -0,0 +1,112 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_monero.h"
+#include "cryptonight_softaes.h"
+
+
+#ifndef XMRIG_NO_ASM
+void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
+#endif
+
+/* CryptoNight-R, one hash per call, software AES: generated code unless XMRIG_NO_ASM is defined, else the C loop below. */
+void cryptonight_r_av3(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+#   ifndef XMRIG_NO_ASM
+    if (ctx[0]->generated_code_height != ctx[0]->height) {
+        struct V4_Instruction code[256];
+        const int code_size = v4_random_math_init(code, ctx[0]->height);
+
+        v4_soft_aes_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_NONE);
+        ctx[0]->generated_code_height = ctx[0]->height;
+    }
+    /* The context carries a pointer to the soft-AES table before the generated code is called. */
+    ctx[0]->saes_table = (const uint32_t*)saes_table;
+    ctx[0]->generated_code(ctx[0]);
+#   else
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    /* VARIANT2_INIT(0) is deliberately not used: nothing below reads its division/sqrt locals. */
+    VARIANT2_SET_ROUNDING_MODE();
+    VARIANT4_RANDOM_MATH_INIT(0);
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+
+    uint64_t idx0 = al0;
+    /* 0x80000 rounds; the 0x1FFFF0 mask yields a 16-byte aligned offset below 2 MiB. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx        = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+
+        cx = soft_aesenc(cx, ax0);
+        /* First shuffle of the neighbouring chunks; VARIANT4_SHUFFLE also XORs them into cx. */
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+
+        idx0 = _mm_cvtsi128_si64(cx);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        /* Random math mixes r0 into cl and calls v4_random_math; r0[0..3] are then XORed into al0/ah0. */
+        VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
+        al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
+        ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
+
+        lo = _umul128(idx0, cl, &hi);
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+        /* Shift the b history: bx1 takes the previous bx0, bx0 takes this round's cx. */
+        bx1 = bx0;
+        bx0 = cx;
+    }
+#   endif
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    keccakf((uint64_t *) ctx[0]->state, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
--- a/algo/cryptonight/cryptonight_r_av4.c
+++ b/algo/cryptonight/cryptonight_r_av4.c
@@ -0,0 +1,143 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_monero.h"
+#include "cryptonight_softaes.h"
+
+/* CryptoNight-R, two hashes per call, software AES round: inputs at input and input + size, digests at output and output + 32. */
+void cryptonight_r_av4(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    /* VARIANT2_INIT(0) / VARIANT2_INIT(1) are deliberately not used: */
+    /* nothing below reads the division/sqrt locals they would declare. */
+    VARIANT2_SET_ROUNDING_MODE();
+    VARIANT4_RANDOM_MATH_INIT(0);
+    VARIANT4_RANDOM_MATH_INIT(1);
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+    __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+    __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
+
+    uint64_t idx0 = al0;
+    uint64_t idx1 = al1;
+    /* 0x80000 rounds over both scratchpads; the 0x1FFFF0 mask yields a 16-byte aligned offset below 2 MiB. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx0       = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1       = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+        const __m128i ax1 = _mm_set_epi64x(ah1, al1);
+
+        cx0 = soft_aesenc(cx0, ax0);
+        cx1 = soft_aesenc(cx1, ax1);
+
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
+
+        VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
+        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
+
+        idx0 = _mm_cvtsi128_si64(cx0);
+        idx1 = _mm_cvtsi128_si64(cx1);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        /* Second half of the round for hash 0: random math, multiply, shuffle, write-back. */
+        VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
+        al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
+        ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
+
+        lo = _umul128(idx0, cl, &hi);
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+        /* Same steps for hash 1; hi, lo, cl and ch are reused. */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+
+        VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
+        al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32);
+        ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32);
+
+        lo = _umul128(idx1, cl, &hi);
+        VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
+
+        al1 ^= cl;
+        ah1 ^= ch;
+        idx1 = al1;
+        /* Shift the b history of both hashes: bx?1 takes the previous bx?0, bx?0 takes this round's cx. */
+        bx01 = bx00;
+        bx11 = bx10;
+
+        bx00 = cx0;
+        bx10 = cx1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
--- a/algo/cryptonight/cryptonight_softaes.h
+++ b/algo/cryptonight/cryptonight_softaes.h
@@ -0,0 +1,255 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_SOFTAES_H
+#define XMRIG_CRYPTONIGHT_SOFTAES_H
+
+
+#include <x86intrin.h>
+#include <stdint.h>
+
+
+#include "crypto/soft_aes.h"
+
+
+// Running XOR across the four 32-bit lanes, lowest lane first:
+// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
+static inline __m128i sl_xor(__m128i tmp1)
+{
+    // Copies of the input moved up by 4, 8 and 12 bytes (one, two and three lanes).
+    const __m128i up1 = _mm_slli_si128(tmp1, 0x04);
+    const __m128i up2 = _mm_slli_si128(up1, 0x04);
+    const __m128i up3 = _mm_slli_si128(up2, 0x04);
+
+    __m128i acc = _mm_xor_si128(tmp1, up1);
+    acc = _mm_xor_si128(acc, up2);
+    return _mm_xor_si128(acc, up3);
+}
+
+
+// Updates *xout0 and then *xout2 in place; the second step consumes the freshly updated *xout0.
+static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t rcon)
+{
+    // Step 1: *xout0 <- sl_xor(*xout0) ^ broadcast(4th elem of keygenassist(*xout2, rcon)), see PSHUFD 0xFF
+    const __m128i from_x2 = _mm_shuffle_epi32(soft_aeskeygenassist(*xout2, rcon), 0xFF);
+    *xout0 = _mm_xor_si128(sl_xor(*xout0), from_x2);
+
+    // Step 2: *xout2 <- sl_xor(*xout2) ^ broadcast(3rd elem of keygenassist(new *xout0, 0)), see PSHUFD 0xAA
+    const __m128i from_x0 = _mm_shuffle_epi32(soft_aeskeygenassist(*xout0, 0x00), 0xAA);
+    *xout2 = _mm_xor_si128(sl_xor(*xout2), from_x0);
+}
+
+
+static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
+{
+    // One soft_aesenc() round with the same key on each of the eight blocks, x0 first and x7 last.
+    __m128i* const blocks[8] = { x0, x1, x2, x3, x4, x5, x6, x7 };
+
+    for (int n = 0; n < 8; ++n)
+    {
+        __m128i* const blk = blocks[n];
+        *blk = soft_aesenc(*blk, key);
+    }
+}
+
+
+static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
+{
+    // The first two 16-byte blocks of memory are used verbatim as k0 and k1.
+    __m128i even = _mm_load_si128(memory);
+    __m128i odd  = _mm_load_si128(memory + 1);
+    *k0 = even;
+    *k1 = odd;
+
+    // Remaining keys come in pairs: every aes_genkey_sub() call updates (even, odd) in place
+    // and the updated values are stored as the next two keys.
+    __m128i* const pairs[4][2] = {
+        { k2, k3 },
+        { k4, k5 },
+        { k6, k7 },
+        { k8, k9 },
+    };
+    uint8_t rcon = 0x1; // doubles per step: 0x1, 0x2, 0x4, 0x8
+    for (int n = 0; n < 4; ++n, rcon = (uint8_t) (rcon << 1)) {
+        aes_genkey_sub(&even, &odd, rcon);
+        *pairs[n][0] = even;
+        *pairs[n][1] = odd;
+    }
+}
+
+
+static inline void cn_explode_scratchpad(const __m128i* input, __m128i* output)
+{
+    // Fills output from input. 8 blocks + 10 keys is more than we have registers, compiler will assign 2 keys on the stack
+    __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
+    __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    aes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); // keys are derived from input[0] and input[1]
+
+    xin0 = _mm_load_si128(input + 4); // input[4..11] is the initial 8-block state (aligned loads)
+    xin1 = _mm_load_si128(input + 5);
+    xin2 = _mm_load_si128(input + 6);
+    xin3 = _mm_load_si128(input + 7);
+    xin4 = _mm_load_si128(input + 8);
+    xin5 = _mm_load_si128(input + 9);
+    xin6 = _mm_load_si128(input + 10);
+    xin7 = _mm_load_si128(input + 11);
+
+    for (size_t i = 0; i < MEMORY / sizeof(__m128i); i += 8) { // i counts 16-byte blocks; MEMORY is defined outside this chunk
+        aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); // ten rounds (k0..k9) re-encrypt the state each pass
+        aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+        aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+
+        _mm_store_si128(output + i + 0, xin0); // the state after each pass becomes the next 8 output blocks (aligned stores)
+        _mm_store_si128(output + i + 1, xin1);
+        _mm_store_si128(output + i + 2, xin2);
+        _mm_store_si128(output + i + 3, xin3);
+        _mm_store_si128(output + i + 4, xin4);
+        _mm_store_si128(output + i + 5, xin5);
+        _mm_store_si128(output + i + 6, xin6);
+        _mm_store_si128(output + i + 7, xin7);
+    }
+}
+
+
+static inline void cn_implode_scratchpad(const __m128i* input, __m128i* output)
+{
+    // Folds input into output[4..11]. 8 blocks + 10 keys is more than we have registers, compiler will assign 2 keys on the stack
+    __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
+    __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    aes_genkey(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); // keys are derived from output[2] and output[3]
+
+    xout0 = _mm_load_si128(output + 4); // output[4..11] is the initial 8-block state (aligned loads)
+    xout1 = _mm_load_si128(output + 5);
+    xout2 = _mm_load_si128(output + 6);
+    xout3 = _mm_load_si128(output + 7);
+    xout4 = _mm_load_si128(output + 8);
+    xout5 = _mm_load_si128(output + 9);
+    xout6 = _mm_load_si128(output + 10);
+    xout7 = _mm_load_si128(output + 11);
+
+    for (size_t i = 0; __builtin_expect(i < MEMORY / sizeof(__m128i), 1); i += 8) // i counts 16-byte blocks; MEMORY is defined outside this chunk
+    {
+        xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0); // XOR the next 8 input blocks into the state
+        xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
+        xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
+        xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
+        xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
+        xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
+        xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
+        xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+
+        aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); // then ten rounds (k0..k9) over the state
+        aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+        aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+    }
+
+    _mm_store_si128(output + 4, xout0); // final state overwrites output[4..11]; the rest of output is not written
+    _mm_store_si128(output + 5, xout1);
+    _mm_store_si128(output + 6, xout2);
+    _mm_store_si128(output + 7, xout3);
+    _mm_store_si128(output + 8, xout4);
+    _mm_store_si128(output + 9, xout5);
+    _mm_store_si128(output + 10, xout6);
+    _mm_store_si128(output + 11, xout7);
+}
+
+
+#if defined(__x86_64__)
+#   define EXTRACT64(X) _mm_cvtsi128_si64(X)
+
+static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi)
+{
+    const unsigned __int128 wide = (unsigned __int128) a * b; // full 64x64 -> 128-bit product
+    *hi = (uint64_t) (wide >> 64);                            // upper half goes out through hi
+    return (uint64_t) wide;                                   // lower half is the return value
+}
+#elif defined(__i386__)
+#   define HI32(X) \
+    _mm_srli_si128((X), 4)
+
+
+#   define EXTRACT64(X) \
+    ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
+    ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32))
+
+static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *product_hi) {
+    // 64x64 -> 128-bit multiply built from 32-bit halves, without a 128-bit integer type.
+    // Returns the low 64 bits of the product and stores the high 64 bits in *product_hi.
+    // multiplier   = ab = a * 2^32 + b
+    // multiplicand = cd = c * 2^32 + d
+    // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
+    const uint64_t a = multiplier >> 32;
+    const uint64_t b = multiplier & 0xFFFFFFFF;
+    const uint64_t c = multiplicand >> 32;
+    const uint64_t d = multiplicand & 0xFFFFFFFF;
+
+    const uint64_t ad = a * d;
+    const uint64_t bd = b * d;
+
+    // a*d + b*c can wrap past 64 bits; the lost bit is worth 2^96, i.e. 2^32 inside product_hi
+    const uint64_t adbc = ad + (b * c);
+    const uint64_t adbc_carry = adbc < ad ? 1 : 0;
+
+    // multiplier * multiplicand = product_hi * 2^64 + product_lo
+    const uint64_t product_lo = bd + (adbc << 32);
+    const uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
+    *product_hi = (a * c) + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
+    return product_lo;
+}
+#endif
+
+
+static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
+{
+    // The low 64 bits are stored untouched.
+    mem_out[0] = EXTRACT64(tmp);
+
+    // Move the high 64 bits down, then flip bits 28-29 using a 2-bit lookup keyed on bits 24-31.
+    const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
+    const uint64_t vh = EXTRACT64(upper);
+
+    static const uint16_t table = 0x7531;
+    const uint8_t x = vh >> 24;
+    const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
+    mem_out[1] = vh ^ (((table >> index) & 0x3) << 28);
+}
+
+
+#endif /* XMRIG_CRYPTONIGHT_SOFTAES_H */
--- a/algo/cryptonight/cryptonight_test.h
+++ b/algo/cryptonight/cryptonight_test.h
@@ -0,0 +1,129 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_TEST_H
+#define XMRIG_CRYPTONIGHT_TEST_H
+
+
+#include <stddef.h>
+#include <stdint.h>
+
+const static uint8_t test_input[152] = {
+    0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00,
+    0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B,
+    0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62,
+    0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92,
+    0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01,
+    0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19,
+    0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9,
+    0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F,
+    0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46,
+    0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02
+};
+
+
+const static uint8_t test_output_v0[64] = {
+    0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7,
+    0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00,
+    0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66,
+    0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F
+};
+
+
+// Cryptonight variant 1 (Monero v7)
+const static uint8_t test_output_v1[64] = {
+    0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9,
+    0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9,
+    0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D,
+    0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22
+};
+
+
+// Cryptonight variant 2 (Monero v8)
+const static uint8_t test_output_v2[64] = {
+    0x97, 0x37, 0x82, 0x82, 0xCF, 0x10, 0xE7, 0xAD, 0x03, 0x3F, 0x7B, 0x80, 0x74, 0xC4, 0x0E, 0x14,
+    0xD0, 0x6E, 0x7F, 0x60, 0x9D, 0xDD, 0xDA, 0x78, 0x76, 0x80, 0xB5, 0x8C, 0x05, 0xF4, 0x3D, 0x21,
+    0x87, 0x1F, 0xCD, 0x68, 0x23, 0xF6, 0xA8, 0x79, 0xBB, 0x3F, 0x33, 0x95, 0x1C, 0x8E, 0x8E, 0x89,
+    0x1D, 0x40, 0x43, 0x88, 0x0B, 0x02, 0xDF, 0xA1, 0xBB, 0x3B, 0xE4, 0x98, 0xB5, 0x0E, 0x75, 0x78
+};
+
+
+struct cn_r_test_input_data // one input record for the "cn/r" test vectors
+{
+    uint64_t height;  // block height paired with this input
+    size_t size;      // number of meaningful leading bytes in data
+    uint8_t data[64]; // input bytes; shorter inputs leave the tail zero-initialized
+};
+
+
+const static struct cn_r_test_input_data cn_r_test_input[] = {
+    { 1806260, 44, { 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74 } },
+    { 1806261, 50, { 0x4c, 0x6f, 0x72, 0x65, 0x6d, 0x20, 0x69, 0x70, 0x73, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x73, 0x69, 0x74, 0x20, 0x61, 0x6d, 0x65, 0x74, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x63, 0x74, 0x65, 0x74, 0x75, 0x72, 0x20, 0x61, 0x64, 0x69, 0x70, 0x69, 0x73, 0x63, 0x69, 0x6e, 0x67 } },
+    { 1806262, 48, { 0x65, 0x6c, 0x69, 0x74, 0x2c, 0x20, 0x73, 0x65, 0x64, 0x20, 0x64, 0x6f, 0x20, 0x65, 0x69, 0x75, 0x73, 0x6d, 0x6f, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x63, 0x69, 0x64, 0x69, 0x64, 0x75, 0x6e, 0x74, 0x20, 0x75, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x65 } },
+    { 1806263, 48, { 0x65, 0x74, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x6d, 0x61, 0x67, 0x6e, 0x61, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x61, 0x2e, 0x20, 0x55, 0x74, 0x20, 0x65, 0x6e, 0x69, 0x6d, 0x20, 0x61, 0x64, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x20, 0x76, 0x65, 0x6e, 0x69, 0x61, 0x6d, 0x2c } },
+    { 1806264, 46, { 0x71, 0x75, 0x69, 0x73, 0x20, 0x6e, 0x6f, 0x73, 0x74, 0x72, 0x75, 0x64, 0x20, 0x65, 0x78, 0x65, 0x72, 0x63, 0x69, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x6c, 0x6c, 0x61, 0x6d, 0x63, 0x6f, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x69, 0x73, 0x20, 0x6e, 0x69, 0x73, 0x69 } },
+    { 1806265, 45, { 0x75, 0x74, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x69, 0x70, 0x20, 0x65, 0x78, 0x20, 0x65, 0x61, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x64, 0x6f, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x71, 0x75, 0x61, 0x74, 0x2e, 0x20, 0x44, 0x75, 0x69, 0x73, 0x20, 0x61, 0x75, 0x74, 0x65 } },
+    { 1806266, 47, { 0x69, 0x72, 0x75, 0x72, 0x65, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x20, 0x72, 0x65, 0x70, 0x72, 0x65, 0x68, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x76, 0x6f, 0x6c, 0x75, 0x70, 0x74, 0x61, 0x74, 0x65, 0x20, 0x76, 0x65, 0x6c, 0x69, 0x74 } },
+    { 1806267, 44, { 0x65, 0x73, 0x73, 0x65, 0x20, 0x63, 0x69, 0x6c, 0x6c, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x65, 0x75, 0x20, 0x66, 0x75, 0x67, 0x69, 0x61, 0x74, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x20, 0x70, 0x61, 0x72, 0x69, 0x61, 0x74, 0x75, 0x72, 0x2e } },
+    { 1806268, 47, { 0x45, 0x78, 0x63, 0x65, 0x70, 0x74, 0x65, 0x75, 0x72, 0x20, 0x73, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x63, 0x63, 0x61, 0x65, 0x63, 0x61, 0x74, 0x20, 0x63, 0x75, 0x70, 0x69, 0x64, 0x61, 0x74, 0x61, 0x74, 0x20, 0x6e, 0x6f, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x69, 0x64, 0x65, 0x6e, 0x74, 0x2c } },
+    { 1806269, 62, { 0x73, 0x75, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x63, 0x75, 0x6c, 0x70, 0x61, 0x20, 0x71, 0x75, 0x69, 0x20, 0x6f, 0x66, 0x66, 0x69, 0x63, 0x69, 0x61, 0x20, 0x64, 0x65, 0x73, 0x65, 0x72, 0x75, 0x6e, 0x74, 0x20, 0x6d, 0x6f, 0x6c, 0x6c, 0x69, 0x74, 0x20, 0x61, 0x6e, 0x69, 0x6d, 0x20, 0x69, 0x64, 0x20, 0x65, 0x73, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x75, 0x6d, 0x2e } },
+};
+
+
+// "cn/r"
+const static uint8_t test_output_r[] = {
+    0xf7, 0x59, 0x58, 0x8a, 0xd5, 0x7e, 0x75, 0x84, 0x67, 0x29, 0x54, 0x43, 0xa9, 0xbd, 0x71, 0x49, 0x0a, 0xbf, 0xf8, 0xe9, 0xda, 0xd1, 0xb9, 0x5b, 0x6b, 0xf2, 0xf5, 0xd0, 0xd7, 0x83, 0x87, 0xbc,
+    0x5b, 0xb8, 0x33, 0xde, 0xca, 0x2b, 0xdd, 0x72, 0x52, 0xa9, 0xcc, 0xd7, 0xb4, 0xce, 0x0b, 0x6a, 0x48, 0x54, 0x51, 0x57, 0x94, 0xb5, 0x6c, 0x20, 0x72, 0x62, 0xf7, 0xa5, 0xb9, 0xbd, 0xb5, 0x66,
+    0x1e, 0xe6, 0x72, 0x8d, 0xa6, 0x0f, 0xbd, 0x8d, 0x7d, 0x55, 0xb2, 0xb1, 0xad, 0xe4, 0x87, 0xa3, 0xcf, 0x52, 0xa2, 0xc3, 0xac, 0x6f, 0x52, 0x0d, 0xb1, 0x2c, 0x27, 0xd8, 0x92, 0x1f, 0x6c, 0xab,
+    0x69, 0x69, 0xfe, 0x2d, 0xdf, 0xb7, 0x58, 0x43, 0x8d, 0x48, 0x04, 0x9f, 0x30, 0x2f, 0xc2, 0x10, 0x8a, 0x4f, 0xcc, 0x93, 0xe3, 0x76, 0x69, 0x17, 0x0e, 0x6d, 0xb4, 0xb0, 0xb9, 0xb4, 0xc4, 0xcb,
+    0x7f, 0x30, 0x48, 0xb4, 0xe9, 0x0d, 0x0c, 0xbe, 0x7a, 0x57, 0xc0, 0x39, 0x4f, 0x37, 0x33, 0x8a, 0x01, 0xfa, 0xe3, 0xad, 0xfd, 0xc0, 0xe5, 0x12, 0x6d, 0x86, 0x3a, 0x89, 0x5e, 0xb0, 0x4e, 0x02,
+    0x1d, 0x29, 0x04, 0x43, 0xa4, 0xb5, 0x42, 0xaf, 0x04, 0xa8, 0x2f, 0x6b, 0x24, 0x94, 0xa6, 0xee, 0x7f, 0x20, 0xf2, 0x75, 0x4c, 0x58, 0xe0, 0x84, 0x90, 0x32, 0x48, 0x3a, 0x56, 0xe8, 0xe2, 0xef,
+    0xc4, 0x3c, 0xc6, 0x56, 0x74, 0x36, 0xa8, 0x6a, 0xfb, 0xd6, 0xaa, 0x9e, 0xaa, 0x7c, 0x27, 0x6e, 0x98, 0x06, 0x83, 0x03, 0x34, 0xb6, 0x14, 0xb2, 0xbe, 0xe2, 0x3c, 0xc7, 0x66, 0x34, 0xf6, 0xfd,
+    0x87, 0xbe, 0x24, 0x79, 0xc0, 0xc4, 0xe8, 0xed, 0xfd, 0xfa, 0xa5, 0x60, 0x3e, 0x93, 0xf4, 0x26, 0x5b, 0x3f, 0x82, 0x24, 0xc1, 0xc5, 0x94, 0x6f, 0xeb, 0x42, 0x48, 0x19, 0xd1, 0x89, 0x90, 0xa4,
+    0xdd, 0x9d, 0x6a, 0x6d, 0x8e, 0x47, 0x46, 0x5c, 0xce, 0xac, 0x08, 0x77, 0xef, 0x88, 0x9b, 0x93, 0xe7, 0xeb, 0xa9, 0x79, 0x55, 0x7e, 0x39, 0x35, 0xd7, 0xf8, 0x6d, 0xce, 0x11, 0xb0, 0x70, 0xf3,
+    0x75, 0xc6, 0xf2, 0xae, 0x49, 0xa2, 0x05, 0x21, 0xde, 0x97, 0x28, 0x5b, 0x43, 0x1e, 0x71, 0x71, 0x25, 0x84, 0x7f, 0xb8, 0x93, 0x5e, 0xd8, 0x4a, 0x61, 0xe7, 0xf8, 0xd3, 0x6a, 0x2c, 0x3d, 0x8e,
+};
+
+
+#ifndef XMRIG_NO_AEON
+const static uint8_t test_output_v0_lite[64] = {
+    0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
+    0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88,
+    0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE,
+    0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD
+};
+
+
+// AEON v7
+const static uint8_t test_output_v1_lite[64] = {
+    0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22,
+    0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41,
+    0x87, 0xC4, 0xE5, 0x70, 0x65, 0x3E, 0xB4, 0xC2, 0xB4, 0x2B, 0x7A, 0x0D, 0x54, 0x65, 0x59, 0x45,
+    0x2D, 0xFA, 0xB5, 0x73, 0xB8, 0x2E, 0xC5, 0x2F, 0x15, 0x2B, 0x7F, 0xF9, 0x8E, 0x79, 0x44, 0x6F
+};
+#endif
+
+
+#endif /* XMRIG_CRYPTONIGHT_TEST_H */
--- a/algo/cryptonight/variant4_random_math.h
+++ b/algo/cryptonight/variant4_random_math.h
@@ -0,0 +1,449 @@
+#ifndef VARIANT4_RANDOM_MATH_H
+#define VARIANT4_RANDOM_MATH_H
+
+
+#include <stdint.h>
+#include <string.h>
+#include <stdbool.h>
+
+
+#include "crypto/c_blake256.h"
+
+
+enum V4_Settings // tuning constants used by the random program generator (v4_random_math_init)
+{
+    // Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications
+    TOTAL_LATENCY = 15 * 3,
+    
+    // Always generate at least 60 instructions
+    NUM_INSTRUCTIONS_MIN = 60,
+
+    // Never generate more than 70 instructions (final RET instruction doesn't count here)
+    NUM_INSTRUCTIONS_MAX = 70,
+
+    // Available ALUs for MUL
+    // Modern CPUs typically have only 1 ALU which can do multiplications
+    ALU_COUNT_MUL = 1,
+
+    // Total available ALUs
+    // Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code
+    ALU_COUNT = 3, // also the second dimension of the generator's alu_busy table
+};
+
+enum V4_InstructionList // opcodes of the random math program; "a" is the destination register, "b" the source
+{
+    MUL,    // a*b
+    ADD,    // a+b + C, C is an unsigned 32-bit constant
+    SUB,    // a-b
+    ROR,    // rotate right "a" by "b & 31" bits
+    ROL,    // rotate left "a" by "b & 31" bits
+    XOR,    // a^b
+    RET,    // finish execution
+    V4_INSTRUCTION_COUNT = RET, // number of arithmetic opcodes above (RET itself is not counted)
+};
+
+// V4_InstructionDefinition is used to generate code from random data
+// Every random sequence of bytes is a valid code
+//
+// There are 9 registers in total:
+// - 4 variable registers
+// - 5 constant registers initialized from loop variables
+// This is why dst_index is 2 bits
+enum V4_InstructionDefinition // bit layout of one random byte as decoded by v4_random_math_init
+{
+    V4_OPCODE_BITS = 3,    // lowest 3 bits: raw opcode selector
+    V4_DST_INDEX_BITS = 2, // next 2 bits: destination register index
+    V4_SRC_INDEX_BITS = 3, // top 3 bits: source register index
+};
+
+struct V4_Instruction // one decoded instruction of the random math program
+{
+    uint8_t opcode;    // a V4_InstructionList value
+    uint8_t dst_index; // index into r[] of the register that is read and overwritten
+    uint8_t src_index; // index into r[] of the register supplying the second operand
+    uint32_t C;        // constant added by ADD; not read for the other opcodes
+};
+
+#ifndef FORCEINLINE
+#ifdef __GNUC__
+#define FORCEINLINE __attribute__((always_inline)) inline
+#elif _MSC_VER
+#define FORCEINLINE __forceinline
+#else
+#define FORCEINLINE inline
+#endif
+#endif
+
+#ifndef UNREACHABLE_CODE
+#ifdef __GNUC__
+#define UNREACHABLE_CODE __builtin_unreachable()
+#elif _MSC_VER
+#define UNREACHABLE_CODE __assume(false)
+#else
+#define UNREACHABLE_CODE
+#endif
+#endif
+
+#define SWAP32LE(x) x
+#define SWAP64LE(x) x
+#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length))
+
+// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU:
+// every switch-case will point to the same destination on every iteration of Cryptonight main loop
+//
+// This is about as fast as it can get without using low-level machine code generation
+// Runs code[0], code[1], ... against registers r[0..8] in place; stops at the first RET or after 70 instructions
+static void v4_random_math(const struct V4_Instruction* code, uint32_t r[9])
+{
+#define REG_BITS 32 // register width in bits; rotation counts are reduced modulo this value
+#define V4_EXEC(i) \
+    { \
+        const struct V4_Instruction* op = code + i; \
+        const uint32_t src = r[op->src_index]; \
+        uint32_t *dst = r + op->dst_index; \
+        switch (op->opcode) \
+        { \
+        case MUL: \
+            *dst *= src; \
+            break; \
+        case ADD: \
+            *dst += src + op->C; \
+            break; \
+        case SUB: \
+            *dst -= src; \
+            break; \
+        case ROR: \
+            { \
+                const uint32_t shift = src % REG_BITS; \
+                *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \
+            } \
+            break; \
+        case ROL: \
+            { \
+                const uint32_t shift = src % REG_BITS; \
+                *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \
+            } \
+            break; \
+        case XOR: \
+            *dst ^= src; \
+            break; \
+        case RET: \
+            return; \
+        default: \
+            UNREACHABLE_CODE; \
+            break; \
+        } \
+    }
+
+#define V4_EXEC_10(j) \
+    V4_EXEC(j + 0) \
+    V4_EXEC(j + 1) \
+    V4_EXEC(j + 2) \
+    V4_EXEC(j + 3) \
+    V4_EXEC(j + 4) \
+    V4_EXEC(j + 5) \
+    V4_EXEC(j + 6) \
+    V4_EXEC(j + 7) \
+    V4_EXEC(j + 8) \
+    V4_EXEC(j + 9)
+
+    // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency
+    // I've checked all block heights < 10,000,000 and here is the distribution of program sizes:
+    //
+    // 60      27960
+    // 61      105054
+    // 62      2452759
+    // 63      5115997
+    // 64      1022269
+    // 65      1109635
+    // 66      153145
+    // 67      8550
+    // 68      4529
+    // 69      102
+
+    // Unroll 70 instructions here
+    V4_EXEC_10(0);      // instructions 0-9
+    V4_EXEC_10(10);     // instructions 10-19
+    V4_EXEC_10(20);     // instructions 20-29
+    V4_EXEC_10(30);     // instructions 30-39
+    V4_EXEC_10(40);     // instructions 40-49
+    V4_EXEC_10(50);     // instructions 50-59
+    V4_EXEC_10(60);     // instructions 60-69
+
+#undef V4_EXEC_10
+#undef V4_EXEC
+#undef REG_BITS
+}
+
+// Refill step: when fewer than bytes_needed bytes remain, re-hash data in place and rewind *data_index to 0
+static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size)
+{
+    const bool enough_left = (*data_index + bytes_needed <= data_size);
+    if (enough_left) return;
+
+    hash_extra_blake(data, data_size, (char*) data);
+    *data_index = 0;
+}
+
+// Generates as many random math operations as possible with given latency and ALU restrictions
+// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions
+static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height)
+{
+    // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle
+    // These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake
+    //
+    // AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors
+    // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors
+    // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same
+    // Source: https://www.agner.org/optimize/instruction_tables.pdf
+    const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 };
+
+    // Instruction latencies for theoretical ASIC implementation
+    const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 };
+
+    // Available ALUs for each instruction
+    const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT };
+
+    int8_t data[32];
+    memset(data, 0, sizeof(data));
+    uint64_t tmp = SWAP64LE(height);
+    memcpy(data, &tmp, sizeof(uint64_t));
+    data[20] = -38;
+
+    // Set data_index past the last byte in data
+    // to trigger full data update with blake hash
+    // before we start using it
+    size_t data_index = sizeof(data);
+
+    int code_size;
+
+    // There is a small chance (1.8%) that register R8 won't be used in the generated program
+    // So we keep track of it and try again if it's not used
+    bool r8_used;
+    do {
+        int latency[9];
+        int asic_latency[9];
+
+        // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution
+        // byte 0: current value of the destination register
+        // byte 1: instruction opcode
+        // byte 2: current value of the source register
+        //
+        // Registers R4-R8 are constant and are treated as having the same value because when we do
+        // the same operation twice with two constant source registers, it can be optimized into a single operation
+        uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
+
+        bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT];
+        bool is_rotation[V4_INSTRUCTION_COUNT];
+        bool rotated[4];
+        int rotate_count = 0;
+
+        memset(latency, 0, sizeof(latency));
+        memset(asic_latency, 0, sizeof(asic_latency));
+        memset(alu_busy, 0, sizeof(alu_busy));
+        memset(is_rotation, 0, sizeof(is_rotation));
+        memset(rotated, 0, sizeof(rotated));
+        is_rotation[ROR] = true;
+        is_rotation[ROL] = true;
+
+        int num_retries = 0;
+        code_size = 0;
+
+        int total_iterations = 0;
+        r8_used = false;
+
+        // Generate random code to achieve minimal required latency for our abstract CPU
+        // Try to get this latency for all 4 registers
+        while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64))
+        {
+            // Fail-safe to guarantee loop termination
+            ++total_iterations;
+            if (total_iterations > 256)
+                break;
+
+            check_data(&data_index, 1, data, sizeof(data));
+
+            const uint8_t c = ((uint8_t*)data)[data_index++];
+
+            // MUL = opcodes 0-2
+            // ADD = opcode 3
+            // SUB = opcode 4
+            // ROR/ROL = opcode 5, shift direction is selected randomly
+            // XOR = opcodes 6-7
+            uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1);
+            if (opcode == 5)
+            {
+                check_data(&data_index, 1, data, sizeof(data));
+                opcode = (data[data_index++] >= 0) ? ROR : ROL;
+            }
+            else if (opcode >= 6)
+            {
+                opcode = XOR;
+            }
+            else
+            {
+                opcode = (opcode <= 2) ? MUL : (opcode - 2);
+            }
+
+            uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1);
+            uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1);
+
+            const int a = dst_index;
+            int b = src_index;
+
+            // Don't do ADD/SUB/XOR with the same register
+            if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
+            {
+                // a is always < 4, so we don't need to check bounds here
+                b = 8;
+                src_index = b;
+            }
+
+            // Don't do rotation with the same destination twice because it's equal to a single rotation
+            if (is_rotation[opcode] && rotated[a])
+            {
+                continue;
+            }
+
+            // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized:
+            // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations
+            // 2xXOR(a, b) = NOP
+            if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16)))
+            {
+                continue;
+            }
+
+            // Find which ALU is available (and when) for this instruction
+            int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b];
+            int alu_index = -1;
+            while (next_latency < TOTAL_LATENCY)
+            {
+                for (int i = op_ALUs[opcode] - 1; i >= 0; --i)
+                {
+                    if (!alu_busy[next_latency][i])
+                    {
+                        // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check
+                        if ((opcode == ADD) && alu_busy[next_latency + 1][i])
+                        {
+                            continue;
+                        }
+
+                        // Rotation can only start when previous rotation is finished, so do an additional availability check
+                        if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode]))
+                        {
+                            continue;
+                        }
+
+                        alu_index = i;
+                        break;
+                    }
+                }
+                if (alu_index >= 0)
+                {
+                    break;
+                }
+                ++next_latency;
+            }
+
+            // Don't generate instructions that leave some register unchanged for more than 7 cycles
+            if (next_latency > latency[a] + 7)
+            {
+                continue;
+            }
+
+            next_latency += op_latency[opcode];
+
+            if (next_latency <= TOTAL_LATENCY)
+            {
+                if (is_rotation[opcode])
+                {
+                    ++rotate_count;
+                }
+
+                // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined
+                alu_busy[next_latency - op_latency[opcode]][alu_index] = true;
+                latency[a] = next_latency;
+
+                // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple
+                asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode];
+
+                rotated[a] = is_rotation[opcode];
+
+                inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16);
+
+                code[code_size].opcode = opcode;
+                code[code_size].dst_index = dst_index;
+                code[code_size].src_index = src_index;
+                code[code_size].C = 0;
+
+                if (src_index == 8)
+                {
+                    r8_used = true;
+                }
+
+                if (opcode == ADD)
+                {
+                    // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
+                    alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true;
+
+                    // ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C"
+                    check_data(&data_index, sizeof(uint32_t), data, sizeof(data));
+                    uint32_t t;
+                    memcpy(&t, data + data_index, sizeof(uint32_t));
+                    code[code_size].C = SWAP32LE(t);
+                    data_index += sizeof(uint32_t);
+                }
+
+                ++code_size;
+                if (code_size >= NUM_INSTRUCTIONS_MIN)
+                {
+                    break;
+                }
+            }
+            else
+            {
+                ++num_retries;
+            }
+        }
+
+        // ASIC has more execution resources and can extract as much parallelism from the code as possible
+        // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
+        // Get this latency for at least 1 of the 4 registers
+        const int prev_code_size = code_size;
+        while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
+        {
+            int min_idx = 0;
+            int max_idx = 0;
+            for (int i = 1; i < 4; ++i)
+            {
+                if (asic_latency[i] < asic_latency[min_idx]) min_idx = i;
+                if (asic_latency[i] > asic_latency[max_idx]) max_idx = i;
+            }
+
+            const uint8_t pattern[3] = { ROR, MUL, MUL };
+            const uint8_t opcode = pattern[(code_size - prev_code_size) % 3];
+            latency[min_idx] = latency[max_idx] + op_latency[opcode];
+            asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode];
+
+            code[code_size].opcode = opcode;
+            code[code_size].dst_index = min_idx;
+            code[code_size].src_index = max_idx;
+            code[code_size].C = 0;
+            ++code_size;
+        }
+
+    // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
+    // It never does more than 4 iterations for all block heights < 10,000,000
+    }  while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
+
+    // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here
+    // Add final instruction to stop the interpreter
+    code[code_size].opcode = RET;
+    code[code_size].dst_index = 0;
+    code[code_size].src_index = 0;
+    code[code_size].C = 0;
+
+    return code_size;
+}
+
+#endif
--- a/cmake/asm.cmake
+++ b/cmake/asm.cmake
@@ -0,0 +1,27 @@
+if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)  # ASM helper library: only when WITH_ASM is on, XMRIG_ARM is off and pointers are 8 bytes
+    set(XMRIG_ASM_LIBRARY "xmrig-asm")  # name of the static library target created below
+
+    enable_language(ASM)  # required before .S sources can be added to a target
+
+    if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)  # GNU toolchain on Windows: use the win64 variant of the main loop
+        set(XMRIG_ASM_FILES
+            "crypto/asm/win64/cn_main_loop.S"
+            "crypto/asm/CryptonightR_template.S"
+        )
+    else()  # every other 64-bit, non-ARM configuration
+        set(XMRIG_ASM_FILES
+            "crypto/asm/cn_main_loop.S"
+            "crypto/asm/CryptonightR_template.S"
+        )
+    endif()
+
+    set_property(SOURCE ${XMRIG_ASM_FILES} PROPERTY C)  # NOTE(review): sets a source property literally named "C" with no value; looks like a no-op - confirm intent
+
+    add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILES})
+    set(XMRIG_ASM_SOURCES "crypto/CryptonightR_gen.c")  # C source exported next to the library; presumably added to the main target by the including file - confirm
+    set_property(TARGET ${XMRIG_ASM_LIBRARY} PROPERTY LINKER_LANGUAGE C)  # the target has only .S sources, so the linker language is stated explicitly
+else()  # ASM disabled or unsupported configuration
+    set(XMRIG_ASM_SOURCES "")  # both variables are defined as empty strings on this path
+    set(XMRIG_ASM_LIBRARY "")
+    add_definitions(/DXMRIG_NO_ASM)  # directory-scope define XMRIG_NO_ASM for the C/C++ sources
+endif()
--- a/compat/jansson/CMakeLists.txt
+++ b/compat/jansson/CMakeLists.txt
@@ -6,6 +6,8 @@ add_definitions(-DHAVE_CONFIG_H)
 # Add the lib sources.
 file(GLOB JANSSON_SRC *.c)

+set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Os")  # append -Os to the Release-configuration C flags used from this directory
+
 set(JANSSON_HDR_PRIVATE
   ${CMAKE_CURRENT_SOURCE_DIR}/hashtable.h
   ${CMAKE_CURRENT_SOURCE_DIR}/jansson_private.h
--- a/compat/jansson/dump.c
+++ b/compat/jansson/dump.c
@@ -9,13 +9,17 @@
 #define _GNU_SOURCE
 #endif

+#include "jansson_private.h"
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif

 #include "jansson.h"
-#include "jansson_private.h"
 #include "strbuffer.h"
 #include "utf.h"

@@ -25,11 +29,28 @@
 #define FLAGS_TO_INDENT(f)      ((f) & 0x1F)
 #define FLAGS_TO_PRECISION(f)   (((f) >> 11) & 0x1F)

+struct buffer { /* state handed to dump_to_buffer() through its void *data argument */
+    const size_t size; /* limit that dump_to_buffer() compares used + chunk size against */
+    size_t used; /* running total of bytes requested; grows even when a copy is skipped */
+    char *data; /* destination memory the chunks are copied into */
+};
+
 static int dump_to_strbuffer(const char *buffer, size_t size, void *data)
 {
    return strbuffer_append_bytes((strbuffer_t *)data, buffer, size);
 }

+static int dump_to_buffer(const char *buffer, size_t size, void *data) /* dump callback that appends a chunk to a fixed-size struct buffer */
+{
+    struct buffer *buf = (struct buffer *)data;
+
+    if(buf->used + size <= buf->size) /* copy only when the whole chunk still fits */
+        memcpy(&buf->data[buf->used], buffer, size);
+
+    buf->used += size; /* counted even when the copy was skipped, so used may end up larger than size */
+    return 0; /* never reports failure to the dumper */
+}
+
 static int dump_to_file(const char *buffer, size_t size, void *data)
 {
    FILE *dest = (FILE *)data;
@@ -38,6 +59,16 @@ static int dump_to_file(const char *buffer, size_t size, void *data)
    return 0;
 }

+static int dump_to_fd(const char *buffer, size_t size, void *data) /* dump callback that writes a chunk to the file descriptor data points to */
+{
+    int *dest = (int *)data; /* data is the address of an int descriptor, see json_dumpfd() */
+#ifdef HAVE_UNISTD_H
+    if(write(*dest, buffer, size) == (ssize_t)size) /* success only if one write() call accepted the whole chunk; a short write is not retried */
+        return 0;
+#endif
+    return -1; /* write error, short write, or HAVE_UNISTD_H not defined */
+}
+
 /* 32 spaces (the maximum indentation size) */
 static const char whitespace[] = "                                ";

@@ -168,6 +199,10 @@ static int compare_keys(const void *key1, const void *key2)
 static int do_dump(const json_t *json, size_t flags, int depth,
                   json_dump_callback_t dump, void *data)
 {
+    int embed = flags & JSON_EMBED;
+
+    flags &= ~JSON_EMBED;
+
    if(!json)
        return -1;

@@ -227,11 +262,11 @@ static int do_dump(const json_t *json, size_t flags, int depth,

            n = json_array_size(json);

-            if(dump("[", 1, data))
+            if(!embed && dump("[", 1, data))
                goto array_error;
            if(n == 0) {
                array->visited = 0;
-                return dump("]", 1, data);
+                return embed ? 0 : dump("]", 1, data);
            }
            if(dump_indent(flags, depth + 1, 0, dump, data))
                goto array_error;
@@ -255,7 +290,7 @@ static int do_dump(const json_t *json, size_t flags, int depth,
            }

            array->visited = 0;
-            return dump("]", 1, data);
+            return embed ? 0 : dump("]", 1, data);

        array_error:
            array->visited = 0;
@@ -286,11 +321,11 @@ static int do_dump(const json_t *json, size_t flags, int depth,

            iter = json_object_iter((json_t *)json);

-            if(dump("{", 1, data))
+            if(!embed && dump("{", 1, data))
                goto object_error;
            if(!iter) {
                object->visited = 0;
-                return dump("}", 1, data);
+                return embed ? 0 : dump("}", 1, data);
            }
            if(dump_indent(flags, depth + 1, 0, dump, data))
                goto object_error;
@@ -386,7 +421,7 @@ static int do_dump(const json_t *json, size_t flags, int depth,
            }

            object->visited = 0;
-            return dump("}", 1, data);
+            return embed ? 0 : dump("}", 1, data);

        object_error:
            object->visited = 0;
@@ -416,11 +451,26 @@ char *json_dumps(const json_t *json, size_t flags)
    return result;
 }

+size_t json_dumpb(const json_t *json, char *buffer, size_t size, size_t flags) /* dump json into a caller-supplied buffer of size bytes */
+{
+    struct buffer buf = { size, 0, buffer }; /* size limit, used = 0, destination */
+
+    if(json_dump_callback(json, dump_to_buffer, (void *)&buf, flags)) /* non-zero result from the dumper is reported as 0 */
+        return 0;
+
+    return buf.used; /* total bytes requested by the dumper; can exceed size because dump_to_buffer() keeps counting after it stops copying */
+}
+
 int json_dumpf(const json_t *json, FILE *output, size_t flags)
 {
    return json_dump_callback(json, dump_to_file, (void *)output, flags);
 }

+int json_dumpfd(const json_t *json, int output, size_t flags) /* dump json to the file descriptor output; returns json_dump_callback()'s result */
+{
+    return json_dump_callback(json, dump_to_fd, (void *)&output, flags); /* the address of the local copy of output is what dump_to_fd() dereferences */
+}
+
 int json_dump_file(const json_t *json, const char *path, size_t flags)
 {
    int result;
--- a/compat/jansson/hashtable_seed.c
+++ b/compat/jansson/hashtable_seed.c
@@ -168,12 +168,12 @@ static uint32_t generate_seed() {
    int done = 0;

 #if !defined(_WIN32) && defined(USE_URANDOM)
-    if (!done && seed_from_urandom(&seed) == 0)
+    if (seed_from_urandom(&seed) == 0)
        done = 1;
 #endif

 #if defined(_WIN32) && defined(USE_WINDOWS_CRYPTOAPI)
-    if (!done && seed_from_windows_cryptoapi(&seed) == 0)
+    if (seed_from_windows_cryptoapi(&seed) == 0)
        done = 1;
 #endif

--- a/compat/jansson/jansson.h
+++ b/compat/jansson/jansson.h
@@ -21,11 +21,11 @@ extern "C" {
 /* version */

 #define JANSSON_MAJOR_VERSION  2
-#define JANSSON_MINOR_VERSION  9
+#define JANSSON_MINOR_VERSION  10
 #define JANSSON_MICRO_VERSION  0

 /* Micro version is omitted if it's 0 */
-#define JANSSON_VERSION  "2.9"
+#define JANSSON_VERSION  "2.10"

 /* Version as a 3-byte hex number, e.g. 0x010201 == 1.2.1. Use this
   for numeric comparisons, e.g. #if JANSSON_VERSION_HEX >= ... */
@@ -273,6 +273,7 @@ typedef size_t (*json_load_callback_t)(void *buffer, size_t buflen, void *data);
 json_t *json_loads(const char *input, size_t flags, json_error_t *error);
 json_t *json_loadb(const char *buffer, size_t buflen, size_t flags, json_error_t *error);
 json_t *json_loadf(FILE *input, size_t flags, json_error_t *error);
+json_t *json_loadfd(int input, size_t flags, json_error_t *error);
 json_t *json_load_file(const char *path, size_t flags, json_error_t *error);
 json_t *json_load_callback(json_load_callback_t callback, void *data, size_t flags, json_error_t *error);

@@ -288,11 +289,14 @@ json_t *json_load_callback(json_load_callback_t callback, void *data, size_t fla
 #define JSON_ENCODE_ANY         0x200
 #define JSON_ESCAPE_SLASH       0x400
 #define JSON_REAL_PRECISION(n)  (((n) & 0x1F) << 11)
+#define JSON_EMBED              0x10000

 typedef int (*json_dump_callback_t)(const char *buffer, size_t size, void *data);

 char *json_dumps(const json_t *json, size_t flags);
+size_t json_dumpb(const json_t *json, char *buffer, size_t size, size_t flags);
 int json_dumpf(const json_t *json, FILE *output, size_t flags);
+int json_dumpfd(const json_t *json, int output, size_t flags);
 int json_dump_file(const json_t *json, const char *path, size_t flags);
 int json_dump_callback(const json_t *json, json_dump_callback_t callback, void *data, size_t flags);

--- a/compat/jansson/jansson_private.h
+++ b/compat/jansson/jansson_private.h
@@ -8,6 +8,7 @@
 #ifndef JANSSON_PRIVATE_H
 #define JANSSON_PRIVATE_H

+#include "jansson_private_config.h"
 #include <stddef.h>
 #include "jansson.h"
 #include "hashtable.h"
--- a/compat/jansson/load.c
+++ b/compat/jansson/load.c
@@ -9,15 +9,19 @@
 #define _GNU_SOURCE
 #endif

+#include "jansson_private.h"
+
 #include <errno.h>
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif

 #include "jansson.h"
-#include "jansson_private.h"
 #include "strbuffer.h"
 #include "utf.h"

@@ -340,7 +344,7 @@ static void lex_scan_string(lex_t *lex, json_error_t *error)
            /* control character */
            lex_unget_unsave(lex, c);
            if(c == '\n')
-                error_set(error, lex, "unexpected newline", c);
+                error_set(error, lex, "unexpected newline");
            else
                error_set(error, lex, "control character 0x%x", c);
            goto out;
@@ -914,7 +918,7 @@ static json_t *parse_json(lex_t *lex, size_t flags, json_error_t *error)
 typedef struct
 {
    const char *data;
-    int pos;
+    size_t pos;
 } string_data_t;

 static int string_get(void *data)
@@ -1028,6 +1032,45 @@ json_t *json_loadf(FILE *input, size_t flags, json_error_t *error)
    return result;
 }

+static int fd_get_func(int *fd) /* lexer input callback: fetch the next byte from the descriptor *fd */
+{
+    uint8_t c;
+#ifdef HAVE_UNISTD_H
+    if (read(*fd, &c, 1) == 1) /* exactly one byte read */
+        return c; /* returned as a value in 0..255 */
+#endif
+    return EOF; /* read() returned anything other than 1, or HAVE_UNISTD_H is not defined */
+}
+
+json_t *json_loadfd(int input, size_t flags, json_error_t *error) /* parse JSON read byte-by-byte from the file descriptor input; NULL on failure */
+{
+    lex_t lex;
+    const char *source; /* label passed to jsonp_error_init() */
+    json_t *result;
+
+#ifdef HAVE_UNISTD_H
+    if(input == STDIN_FILENO)
+        source = "<stdin>";
+    else
+#endif
+        source = "<stream>"; /* also the only label when HAVE_UNISTD_H is not defined */
+
+    jsonp_error_init(error, source); /* runs before the argument check below, so error is initialised on every path */
+
+    if (input < 0) { /* reject negative descriptors */
+        error_set(error, NULL, "wrong arguments");
+        return NULL;
+    }
+
+    if(lex_init(&lex, (get_func)fd_get_func, flags, &input)) /* the lexer pulls bytes through fd_get_func(&input) */
+        return NULL;
+
+    result = parse_json(&lex, flags, error);
+
+    lex_close(&lex); /* closed whether or not parsing succeeded */
+    return result;
+}
+
 json_t *json_load_file(const char *path, size_t flags, json_error_t *error)
 {
    json_t *result;
--- a/compat/libcpuid/CMakeLists.txt
+++ b/compat/libcpuid/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required (VERSION 2.8...3.10)  # range form: CMake >= 4.0 rejects a policy version below 3.5; CMake < 3.12 ignores the "...3.10" part
+project (cpuid C)  # C-only project
+
+add_definitions(/DVERSION="0.4.0")  # defines VERSION as the string literal "0.4.0" for sources in this directory
+
+set(HEADERS
+    libcpuid.h
+    libcpuid_types.h
+    libcpuid_constants.h
+    libcpuid_internal.h
+    amd_code_t.h
+    intel_code_t.h
+    recog_amd.h
+    recog_intel.h
+    asm-bits.h
+    libcpuid_util.h
+    )
+
+set(SOURCES
+    cpuid_main.c
+    asm-bits.c
+    recog_amd.c
+    recog_intel.c
+    libcpuid_util.c
+   )
+
+add_library(cpuid STATIC  # static library target named cpuid, built from the two lists above
+    ${HEADERS}
+    ${SOURCES}
+    )
--- a/compat/libcpuid/amd_code_t.h
+++ b/compat/libcpuid/amd_code_t.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2016  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file contains a list of internal codes we use in detection. It is
+ * of no external use and isn't a complete list of AMD products.
+ */
+	CODE2(OPTERON_800, 1000),
+	CODE(PHENOM),
+	CODE(PHENOM2),
+	CODE(FUSION_C),
+	CODE(FUSION_E),
+	CODE(FUSION_EA),
+	CODE(FUSION_Z),
+	CODE(FUSION_A),
+	
--- a/compat/libcpuid/asm-bits.c
+++ b/compat/libcpuid/asm-bits.c
@@ -0,0 +1,825 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "libcpuid.h"
+#include "asm-bits.h"
+
+int cpuid_exists_by_eflags(void) /* returns non-zero when bit 0x200000 (the ID bit, bit 21) of EFLAGS can be toggled; always 1 on x86_64, 0 on other platforms/compilers */
+{
+#if defined(PLATFORM_X64)
+	return 1; /* CPUID is always present on the x86_64 */
+#elif defined(PLATFORM_X86)
+#  if defined(COMPILER_GCC)
+	int result;
+	__asm __volatile(
+		"	pushfl\n" /* read the current EFLAGS ... */
+		"	pop	%%eax\n"
+		"	mov	%%eax,	%%ecx\n" /* ... and keep the original value in ECX */
+		"	xor	$0x200000,	%%eax\n" /* flip bit 21 */
+		"	push	%%eax\n"
+		"	popfl\n" /* try to write the modified value back */
+		"	pushfl\n"
+		"	pop	%%eax\n" /* read EFLAGS again */
+		"	xor	%%ecx,	%%eax\n" /* non-zero only if the bit really changed */
+		"	mov	%%eax,	%0\n"
+		"	push	%%ecx\n"
+		"	popfl\n" /* restore the original EFLAGS */
+		: "=m"(result)
+		: :"eax", "ecx", "memory");
+	return (result != 0);
+#  elif defined(COMPILER_MICROSOFT)
+	int result; /* same toggle-and-compare sequence as the GCC branch, in MSVC inline-asm syntax */
+	__asm {
+		pushfd
+		pop	eax
+		mov	ecx,	eax
+		xor	eax,	0x200000
+		push	eax
+		popfd
+		pushfd
+		pop	eax
+		xor	eax,	ecx
+		mov	result,	eax
+		push	ecx
+		popfd
+	};
+	return (result != 0);
+#  else
+	return 0; /* x86 with a compiler that is neither COMPILER_GCC nor COMPILER_MICROSOFT */
+#  endif /* COMPILER_MICROSOFT */
+#else
+	return 0; /* neither PLATFORM_X64 nor PLATFORM_X86 */
+#endif /* PLATFORM_X86 */
+}
+
+#ifdef INLINE_ASM_SUPPORTED
+/*
+ * Executes the CPUID instruction.
+ *
+ * regs points to four 32-bit values: on entry regs[0..3] are loaded into
+ * EAX, EBX, ECX and EDX; on return they hold the values CPUID left in
+ * those registers.
+ *
+ * With MSVC/AMD64, the exec_cpuid() and cpu_rdtsc() functions
+ * are implemented in separate .asm files. Otherwise, use inline assembly.
+ */
+void exec_cpuid(uint32_t *regs)
+{
+#ifdef COMPILER_GCC
+#	ifdef PLATFORM_X64
+	/*
+	 * Bind the four registers through operand constraints instead of
+	 * saving and restoring them with push/pop inside the asm body. The
+	 * compiler cannot see stack writes made by inline asm, so a push
+	 * could overwrite data it keeps below %rsp (the System V x86-64 red
+	 * zone). The "memory" clobber makes the statement act as a compiler
+	 * barrier for memory accesses.
+	 */
+	__asm __volatile(
+		"	cpuid\n"
+		: "+a"(regs[0]), "+b"(regs[1]), "+c"(regs[2]), "+d"(regs[3])
+		:
+		: "memory"
+	);
+#	else
+	/*
+	 * 32-bit path: EBX, ECX and EDX are saved and restored by hand with
+	 * push/pop; only EAX and EDI are declared as clobbered.
+	 */
+	__asm __volatile(
+		"	mov	%0,	%%edi\n"
+
+		"	push	%%ebx\n"
+		"	push	%%ecx\n"
+		"	push	%%edx\n"
+		
+		"	mov	(%%edi),	%%eax\n"
+		"	mov	4(%%edi),	%%ebx\n"
+		"	mov	8(%%edi),	%%ecx\n"
+		"	mov	12(%%edi),	%%edx\n"
+		
+		"	cpuid\n"
+		
+		"	mov	%%eax,	(%%edi)\n"
+		"	mov	%%ebx,	4(%%edi)\n"
+		"	mov	%%ecx,	8(%%edi)\n"
+		"	mov	%%edx,	12(%%edi)\n"
+		"	pop	%%edx\n"
+		"	pop	%%ecx\n"
+		"	pop	%%ebx\n"
+		:
+		:"m"(regs)
+		:"memory", "eax", "edi"
+	);
+#	endif /* PLATFORM_X64 */
+#else /* !COMPILER_GCC */
+#  ifdef COMPILER_MICROSOFT
+	/* MSVC inline assembly: same load / cpuid / store sequence through EDI. */
+	__asm {
+		push	ebx
+		push	ecx
+		push	edx
+		push	edi
+		mov	edi,	regs
+		
+		mov	eax,	[edi]
+		mov	ebx,	[edi+4]
+		mov	ecx,	[edi+8]
+		mov	edx,	[edi+12]
+		
+		cpuid
+		
+		mov	[edi],		eax
+		mov	[edi+4],	ebx
+		mov	[edi+8],	ecx
+		mov	[edi+12],	edx
+		
+		pop	edi
+		pop	edx
+		pop	ecx
+		pop	ebx
+	}
+#  else
+#    error "Unsupported compiler"
+#  endif /* COMPILER_MICROSOFT */
+#endif /* COMPILER_GCC */
+}
+#endif /* INLINE_ASM_SUPPORTED */
+
+#ifdef INLINE_ASM_SUPPORTED
+void cpu_rdtsc(uint64_t* result) /* executes RDTSC and stores the 64-bit value (EDX:EAX) in *result */
+{
+	uint32_t low_part, hi_part; /* EAX and EDX as left by RDTSC */
+#ifdef COMPILER_GCC
+	__asm __volatile (
+		"	rdtsc\n"
+		"	mov	%%eax,	%0\n" /* low 32 bits */
+		"	mov	%%edx,	%1\n" /* high 32 bits */
+		:"=m"(low_part), "=m"(hi_part)::"memory", "eax", "edx"
+	);
+#else
+#  ifdef COMPILER_MICROSOFT
+	__asm {
+		rdtsc
+		mov	low_part,	eax
+		mov	hi_part,	edx
+	};
+#  else
+#    error "Unsupported compiler"
+#  endif /* COMPILER_MICROSOFT */
+#endif /* COMPILER_GCC */
+	*result = (uint64_t)low_part + (((uint64_t) hi_part) << 32); /* widen before shifting so the high half is not lost */
+}
+#endif /* INLINE_ASM_SUPPORTED */
+
+#ifdef INLINE_ASM_SUPPORTED
+void busy_sse_loop(int cycles)
+{
+#ifdef COMPILER_GCC
+#ifndef __APPLE__
+#	define XALIGN ".balign 16\n"
+#else
+#	define XALIGN ".align 4\n"
+#endif
+	__asm __volatile (
+		"	xorps	%%xmm0,	%%xmm0\n"
+		"	xorps	%%xmm1,	%%xmm1\n"
+		"	xorps	%%xmm2,	%%xmm2\n"
+		"	xorps	%%xmm3,	%%xmm3\n"
+		"	xorps	%%xmm4,	%%xmm4\n"
+		"	xorps	%%xmm5,	%%xmm5\n"
+		"	xorps	%%xmm6,	%%xmm6\n"
+		"	xorps	%%xmm7,	%%xmm7\n"
+		XALIGN
+		/* ".bsLoop:\n" */
+		"1:\n"
+		// 0:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		// 1:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		// 2:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		// 3:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		// 4:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		// 5:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		// 6:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		// 7:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		// 8:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		// 9:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//10:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//11:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//12:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//13:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//14:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//15:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//16:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//17:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//18:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//19:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//20:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//21:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//22:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//23:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//24:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//25:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//26:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//27:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//28:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//29:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//30:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		//31:
+		"	addps	%%xmm1, %%xmm0\n"
+		"	addps	%%xmm2, %%xmm1\n"
+		"	addps	%%xmm3, %%xmm2\n"
+		"	addps	%%xmm4, %%xmm3\n"
+		"	addps	%%xmm5, %%xmm4\n"
+		"	addps	%%xmm6, %%xmm5\n"
+		"	addps	%%xmm7, %%xmm6\n"
+		"	addps	%%xmm0, %%xmm7\n"
+		
+		"	dec	%%eax\n"
+		/* "jnz	.bsLoop\n" */
+		"	jnz	1b\n"
+		::"a"(cycles)
+	);
+#else
+#  ifdef COMPILER_MICROSOFT
+	__asm {
+		mov	eax,	cycles
+		xorps	xmm0,	xmm0
+		xorps	xmm1,	xmm1
+		xorps	xmm2,	xmm2
+		xorps	xmm3,	xmm3
+		xorps	xmm4,	xmm4
+		xorps	xmm5,	xmm5
+		xorps	xmm6,	xmm6
+		xorps	xmm7,	xmm7
+		//--
+		align 16
+bsLoop:
+		// 0:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 1:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 2:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 3:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 4:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 5:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 6:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 7:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 8:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 9:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 10:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 11:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 12:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 13:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 14:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 15:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 16:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 17:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 18:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 19:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 20:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 21:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 22:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 23:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 24:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 25:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 26:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 27:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 28:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 29:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 30:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		// 31:
+		addps	xmm0,	xmm1
+		addps	xmm1,	xmm2
+		addps	xmm2,	xmm3
+		addps	xmm3,	xmm4
+		addps	xmm4,	xmm5
+		addps	xmm5,	xmm6
+		addps	xmm6,	xmm7
+		addps	xmm7,	xmm0
+		//----------------------
+		dec		eax
+		jnz		bsLoop
+	}
+#  else
+#    error "Unsupported compiler"
+#  endif /* COMPILER_MICROSOFT */
+#endif /* COMPILER_GCC */
+}
+#endif /* INLINE_ASSEMBLY_SUPPORTED */
--- a/compat/libcpuid/asm-bits.h
+++ b/compat/libcpuid/asm-bits.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __ASM_BITS_H__
+#define __ASM_BITS_H__
+#include "libcpuid.h"
+
+/* Determine Compiler: */
+#if defined(_MSC_VER)
+#	define COMPILER_MICROSOFT
+#elif defined(__GNUC__)
+#	define COMPILER_GCC
+#endif
+
+/* Determine Platform */
+#if defined(__x86_64__) || defined(_M_AMD64)
+#	define PLATFORM_X64
+#elif defined(__i386__) || defined(_M_IX86)
+#	define PLATFORM_X86
+#endif
+
+/* Under Windows/AMD64 with MSVC, inline assembly isn't supported */
+#if (defined(COMPILER_GCC) && defined(PLATFORM_X64)) || defined(PLATFORM_X86)
+#	define INLINE_ASM_SUPPORTED
+#endif
+
+/* Non-zero when the CPUID instruction can be used; cpuid_present() returns this value as-is. */
+int cpuid_exists_by_eflags(void);
+/*
+ * Execute CPUID on a 4-element register block. Callers in cpuid_main.c put
+ * the leaf number in regs[0] and, for sub-leaf queries, the sub-leaf index in
+ * regs[2]; the same array is then read back as the result.
+ */
+void exec_cpuid(uint32_t *regs);
+/*
+ * Spin through `cycles` iterations of an unrolled SSE `addps` loop.
+ * The loop counter is decremented before it is tested, so pass cycles > 0.
+ */
+void busy_sse_loop(int cycles);
+
+#endif /* __ASM_BITS_H__ */
--- a/compat/libcpuid/cpuid_main.c
+++ b/compat/libcpuid/cpuid_main.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "libcpuid.h"
+#include "libcpuid_internal.h"
+#include "recog_intel.h"
+#include "recog_amd.h"
+#include "asm-bits.h"
+#include "libcpuid_util.h"
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+/* Implementation: */
+
+static int _libcpiud_errno = ERR_OK;
+
+/*
+ * Remember `err` as the library's most recent error code and return the
+ * same value as a plain int, so callers can write `return set_error(...)`.
+ */
+int set_error(cpu_error_t err)
+{
+	const int code = (int) err;
+
+	_libcpiud_errno = code;
+	return code;
+}
+
+/* Zero-fill a raw CPUID data block so that no stale register values remain. */
+static void raw_data_t_constructor(struct cpu_raw_data_t* raw)
+{
+	memset(raw, 0, sizeof(*raw));
+}
+
+/*
+ * Put a cpu_id_t into its initial state: every byte is zeroed, then all
+ * cache sizes, associativities, cache-line sizes and the SSE unit width are
+ * overwritten with -1.
+ */
+static void cpu_id_t_constructor(struct cpu_id_t* id)
+{
+	memset(id, 0, sizeof(struct cpu_id_t));
+	/* -1 instead of 0 for these fields -- presumably means "unknown"; confirm against libcpuid.h */
+	id->l1_data_cache = id->l1_instruction_cache = id->l2_cache = id->l3_cache = id->l4_cache = -1;
+	id->l1_assoc = id->l2_assoc = id->l3_assoc = id->l4_assoc = -1;
+	id->l1_cacheline = id->l2_cacheline = id->l3_cacheline = id->l4_cacheline = -1;
+	id->sse_size = -1;
+}
+
+/*
+ * Try to interpret `token`/`value` as "<expected_token>[<index>]" followed by
+ * four hexadecimal register values, and store them in array[index][0..3].
+ *
+ * expected_token - name prefix this call is responsible for
+ * token          - the key text, e.g. "basic_cpuid[1]"
+ * value          - four whitespace-separated hex numbers (eax ebx ecx edx)
+ * array          - destination table of register quadruples
+ * limit          - number of rows in `array`; the index must be in [0, limit)
+ * recognized     - in/out flag; set to 1 once the prefix matched
+ *
+ * Returns 1 when the token was stored, was already recognized earlier, or
+ * does not belong to `expected_token`. Returns 0 when the prefix matched but
+ * the index or the value could not be parsed.
+ */
+static int parse_token(const char* expected_token, const char *token,
+                        const char *value, uint32_t array[][4], int limit, int *recognized)
+{
+	char format[32];
+	/* %x requires unsigned int destinations */
+	unsigned int veax, vebx, vecx, vedx;
+	int index;
+
+	if (*recognized) return 1; /* already recognized */
+	if (strncmp(token, expected_token, strlen(expected_token))) return 1; /* not what we search for */
+	*recognized = 1;
+	/* the format is the name plus "[%d]" and a terminator; refuse names that would overflow it */
+	if (strlen(expected_token) + 5 > sizeof(format)) return 0;
+	sprintf(format, "%s[%%d]", expected_token);
+	if (1 == sscanf(token, format, &index) && index >=0 && index < limit) {
+		if (4 == sscanf(value, "%x%x%x%x", &veax, &vebx, &vecx, &vedx)) {
+			array[index][0] = veax;
+			array[index][1] = vebx;
+			array[index][2] = vecx;
+			array[index][3] = vedx;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/* get_total_cpus() system specific code: uses OS routines to determine total number of CPUs */
+#ifdef __APPLE__
+#include <unistd.h>
+#include <mach/clock_types.h>
+#include <mach/clock.h>
+#include <mach/mach.h>
+/*
+ * macOS: ask the Mach host for its basic info record and report the number
+ * of available CPUs from it. Falls back to 1 if the query fails.
+ */
+static int get_total_cpus(void)
+{
+	host_basic_info_data_t host_data;
+	mach_msg_type_number_t n_words = HOST_BASIC_INFO_COUNT;
+
+	if (host_info(mach_host_self(), HOST_BASIC_INFO,
+	              (host_info_t) &host_data, &n_words) != KERN_SUCCESS)
+		return 1;
+	return host_data.avail_cpus;
+}
+#define GET_TOTAL_CPUS_DEFINED
+#endif
+
+#ifdef _WIN32
+#include <windows.h>
+/* Windows: report the processor count from GetSystemInfo(). */
+static int get_total_cpus(void)
+{
+	SYSTEM_INFO si;
+
+	GetSystemInfo(&si);
+	return (int) si.dwNumberOfProcessors;
+}
+#define GET_TOTAL_CPUS_DEFINED
+#endif
+
+#if defined linux || defined __linux__ || defined __sun
+#include <sys/sysinfo.h>
+#include <unistd.h>
+
+/* Linux / Solaris: report the value of sysconf(_SC_NPROCESSORS_ONLN). */
+static int get_total_cpus(void)
+{
+	const long online = sysconf(_SC_NPROCESSORS_ONLN);
+
+	return (int) online;
+}
+#define GET_TOTAL_CPUS_DEFINED
+#endif
+
+#if defined __FreeBSD__ || defined __OpenBSD__ || defined __NetBSD__ || defined __bsdi__ || defined __QNX__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+/*
+ * BSD family / QNX: read the hw.ncpu sysctl. Falls back to 1 when the
+ * sysctl call reports an error.
+ */
+static int get_total_cpus(void)
+{
+	int name[2];
+	int count;
+	size_t size = sizeof(count);
+
+	name[0] = CTL_HW;
+	name[1] = HW_NCPU;
+	if (sysctl(name, 2, &count, &size, (void *) 0, 0) == 0)
+		return count;
+	return 1;
+}
+#define GET_TOTAL_CPUS_DEFINED
+#endif
+
+#ifndef GET_TOTAL_CPUS_DEFINED
+/*
+ * Fallback for platforms without a known way to count CPUs: always returns 1
+ * and emits a one-time warning explaining why.
+ */
+static int get_total_cpus(void)
+{
+	static int warning_printed = 0;
+	if (!warning_printed) {
+		warning_printed = 1;
+		warnf("Your system is not supported by libcpuid -- don't know how to detect the\n");
+		warnf("total number of CPUs on your system. It will be reported as 1.\n");
+		/* route the last line through warnf too, so the whole message honours the warning hook */
+		warnf("Please use cpu_id_t.logical_cpus field instead.\n");
+	}
+	return 1;
+}
+#endif /* GET_TOTAL_CPUS_DEFINED */
+
+
+/*
+ * Translate vendor-independent CPUID feature bits into data->flags[] and
+ * guess the SSE execution unit width.
+ *
+ * Each table below maps a bit index of one CPUID output register to a
+ * CPU_FEATURE_* flag; match_features() sets the flag when the bit is set.
+ * A table is only consulted when the corresponding leaf is reported as
+ * available by leaf 0 (basic) or leaf 0x80000000 (extended).
+ *
+ * raw  - previously collected CPUID register dumps
+ * data - receives the feature flags and sse_size; data->vendor, family,
+ *        ext_family and ext_model are read for the SSE width guess
+ */
+static void load_features_common(struct cpu_raw_data_t* raw, struct cpu_id_t* data)
+{
+	/* leaf 1, EDX */
+	const struct feature_map_t matchtable_edx1[] = {
+		{  0, CPU_FEATURE_FPU },
+		{  1, CPU_FEATURE_VME },
+		{  2, CPU_FEATURE_DE },
+		{  3, CPU_FEATURE_PSE },
+		{  4, CPU_FEATURE_TSC },
+		{  5, CPU_FEATURE_MSR },
+		{  6, CPU_FEATURE_PAE },
+		{  7, CPU_FEATURE_MCE },
+		{  8, CPU_FEATURE_CX8 },
+		{  9, CPU_FEATURE_APIC },
+		{ 11, CPU_FEATURE_SEP },
+		{ 12, CPU_FEATURE_MTRR },
+		{ 13, CPU_FEATURE_PGE },
+		{ 14, CPU_FEATURE_MCA },
+		{ 15, CPU_FEATURE_CMOV },
+		{ 16, CPU_FEATURE_PAT },
+		{ 17, CPU_FEATURE_PSE36 },
+		{ 19, CPU_FEATURE_CLFLUSH },
+		{ 23, CPU_FEATURE_MMX },
+		{ 24, CPU_FEATURE_FXSR },
+		{ 25, CPU_FEATURE_SSE },
+		{ 26, CPU_FEATURE_SSE2 },
+		{ 28, CPU_FEATURE_HT },
+	};
+	/* leaf 1, ECX */
+	const struct feature_map_t matchtable_ecx1[] = {
+		{  0, CPU_FEATURE_PNI },
+		{  1, CPU_FEATURE_PCLMUL },
+		{  3, CPU_FEATURE_MONITOR },
+		{  9, CPU_FEATURE_SSSE3 },
+		{ 12, CPU_FEATURE_FMA3 },
+		{ 13, CPU_FEATURE_CX16 },
+		{ 19, CPU_FEATURE_SSE4_1 },
+		{ 20, CPU_FEATURE_SSE4_2 },
+		{ 22, CPU_FEATURE_MOVBE },
+		{ 23, CPU_FEATURE_POPCNT },
+		{ 25, CPU_FEATURE_AES },
+		{ 26, CPU_FEATURE_XSAVE },
+		{ 27, CPU_FEATURE_OSXSAVE },
+		{ 28, CPU_FEATURE_AVX },
+		{ 29, CPU_FEATURE_F16C },
+		{ 30, CPU_FEATURE_RDRAND },
+	};
+	/* leaf 7, EBX */
+	const struct feature_map_t matchtable_ebx7[] = {
+		{  3, CPU_FEATURE_BMI1 },
+		{  5, CPU_FEATURE_AVX2 },
+		{  8, CPU_FEATURE_BMI2 },
+	};
+	/* leaf 0x80000001, EDX */
+	const struct feature_map_t matchtable_edx81[] = {
+		{ 11, CPU_FEATURE_SYSCALL },
+		{ 27, CPU_FEATURE_RDTSCP },
+		{ 29, CPU_FEATURE_LM },
+	};
+	/* leaf 0x80000001, ECX */
+	const struct feature_map_t matchtable_ecx81[] = {
+		{  0, CPU_FEATURE_LAHF_LM },
+	};
+	/* leaf 0x80000007, EDX */
+	const struct feature_map_t matchtable_edx87[] = {
+		{  8, CPU_FEATURE_CONSTANT_TSC },
+	};
+	/* register index within a leaf: 0 = EAX, 1 = EBX, 2 = ECX, 3 = EDX */
+	if (raw->basic_cpuid[0][0] >= 1) {
+		match_features(matchtable_edx1, COUNT_OF(matchtable_edx1), raw->basic_cpuid[1][3], data);
+		match_features(matchtable_ecx1, COUNT_OF(matchtable_ecx1), raw->basic_cpuid[1][2], data);
+	}
+	if (raw->basic_cpuid[0][0] >= 7) {
+		match_features(matchtable_ebx7, COUNT_OF(matchtable_ebx7), raw->basic_cpuid[7][1], data);
+	}
+	if (raw->ext_cpuid[0][0] >= 0x80000001) {
+		match_features(matchtable_edx81, COUNT_OF(matchtable_edx81), raw->ext_cpuid[1][3], data);
+		match_features(matchtable_ecx81, COUNT_OF(matchtable_ecx81), raw->ext_cpuid[1][2], data);
+	}
+	if (raw->ext_cpuid[0][0] >= 0x80000007) {
+		match_features(matchtable_edx87, COUNT_OF(matchtable_edx87), raw->ext_cpuid[7][3], data);
+	}
+	if (data->flags[CPU_FEATURE_SSE]) {
+		/* apply guesswork to check if the SSE unit width is 128 bit */
+		switch (data->vendor) {
+			case VENDOR_AMD:
+				/* 128 for ext_family >= 16 except 17, otherwise 64 */
+				data->sse_size = (data->ext_family >= 16 && data->ext_family != 17) ? 128 : 64;
+				break;
+			case VENDOR_INTEL:
+				/* 128 for family 6 with ext_model >= 15, otherwise 64 */
+				data->sse_size = (data->family == 6 && data->ext_model >= 15) ? 128 : 64;
+				break;
+			default:
+				/* other vendors: sse_size is left untouched */
+				break;
+		}
+		/* leave the CPU_FEATURE_128BIT_SSE_AUTH 0; the advanced per-vendor detection routines
+		 * will set it accordingly if they detect the needed bit */
+	}
+}
+
+/*
+ * Build the 12-character vendor string from the CPUID leaf 0 registers and
+ * look it up in the table of known vendors.
+ *
+ * raw_vendor - the four leaf 0 registers (eax, ebx, ecx, edx)
+ * vendor_str - output buffer, receives 12 characters plus a terminator
+ *
+ * Returns the matching cpu_vendor_t, or VENDOR_UNKNOWN if none matches.
+ */
+static cpu_vendor_t cpuid_vendor_identify(const uint32_t *raw_vendor, char *vendor_str)
+{
+	const struct { cpu_vendor_t vendor; char match[16]; }
+	known_vendors[NUM_CPU_VENDORS] = {
+		/* source: http://www.sandpile.org/ia32/cpuid.htm */
+		{ VENDOR_INTEL		, "GenuineIntel" },
+		{ VENDOR_AMD		, "AuthenticAMD" },
+		{ VENDOR_CYRIX		, "CyrixInstead" },
+		{ VENDOR_NEXGEN		, "NexGenDriven" },
+		{ VENDOR_TRANSMETA	, "GenuineTMx86" },
+		{ VENDOR_UMC		, "UMC UMC UMC " },
+		{ VENDOR_CENTAUR	, "CentaurHauls" },
+		{ VENDOR_RISE		, "RiseRiseRise" },
+		{ VENDOR_SIS		, "SiS SiS SiS " },
+		{ VENDOR_NSC		, "Geode by NSC" },
+	};
+	int idx = 0;
+
+	/* the text is spread over EBX, EDX, ECX -- in that order */
+	memcpy(&vendor_str[0], raw_vendor + 1, 4);
+	memcpy(&vendor_str[4], raw_vendor + 3, 4);
+	memcpy(&vendor_str[8], raw_vendor + 2, 4);
+	vendor_str[12] = '\0';
+
+	/* first table entry with an identical string wins */
+	while (idx < NUM_CPU_VENDORS) {
+		if (strcmp(vendor_str, known_vendors[idx].match) == 0)
+			return known_vendors[idx].vendor;
+		idx++;
+	}
+	return VENDOR_UNKNOWN;
+}
+
+/*
+ * Fill in the vendor-independent part of `data` from the raw CPUID dump:
+ * vendor, family/model/stepping (plus their extended forms), the brand
+ * string, the common feature flags and the OS-reported logical CPU count.
+ *
+ * Returns the value of set_error(): ERR_CPU_UNKN when the vendor string is
+ * not recognised, ERR_OK otherwise.
+ */
+static int cpuid_basic_identify(struct cpu_raw_data_t* raw, struct cpu_id_t* data)
+{
+	int i, j, basic, xmodel, xfamily;
+	char brandstr[64] = {0};
+	data->vendor = cpuid_vendor_identify(raw->basic_cpuid[0], data->vendor_str);
+
+	if (data->vendor == VENDOR_UNKNOWN)
+		return set_error(ERR_CPU_UNKN);
+	basic = raw->basic_cpuid[0][0];
+	if (basic >= 1) {
+		/* leaf 1 EAX: stepping [3:0], model [7:4], family [11:8], ext. model [19:16], ext. family [27:20] */
+		data->family = (raw->basic_cpuid[1][0] >> 8) & 0xf;
+		data->model = (raw->basic_cpuid[1][0] >> 4) & 0xf;
+		data->stepping = raw->basic_cpuid[1][0] & 0xf;
+		xmodel = (raw->basic_cpuid[1][0] >> 16) & 0xf;
+		xfamily = (raw->basic_cpuid[1][0] >> 20) & 0xff;
+		if (data->vendor == VENDOR_AMD && data->family < 0xf)
+			data->ext_family = data->family;
+		else
+			data->ext_family = data->family + xfamily;
+		data->ext_model = data->model + (xmodel << 4);
+	}
+
+	/*
+	 * obtain the brand string, if present: it lives in extended leaves
+	 * 0x80000002..0x80000004, so the highest supported extended leaf must be
+	 * at least 0x80000004. (The previous code subtracted 0x8000000 -- one
+	 * digit short -- which made the test pass for any extended level.)
+	 */
+	if (raw->ext_cpuid[0][0] >= 0x80000004) {
+		for (i = 0; i < 3; i++)
+			for (j = 0; j < 4; j++)
+				memcpy(brandstr + i * 16 + j * 4,
+				       &raw->ext_cpuid[2 + i][j], 4);
+		brandstr[48] = 0;
+		/* drop the leading padding blanks */
+		i = 0;
+		while (brandstr[i] == ' ') i++;
+		strncpy(data->brand_str, brandstr + i, sizeof(data->brand_str));
+		data->brand_str[48] = 0;
+	}
+	load_features_common(raw, data);
+	data->total_logical_cpus = get_total_cpus();
+	return set_error(ERR_OK);
+}
+
+/*
+ * Split a comma-separated string into a freshly allocated cpu_list_t.
+ * Every field (including empty ones) becomes one entry; both the pointer
+ * array and each name are obtained with malloc().
+ */
+static void make_list_from_string(const char* csv, struct cpu_list_t* list)
+{
+	const int len = (int) strlen(csv);
+	int pos, start = 0, slot = 0, fields = 1;
+
+	/* one field more than there are commas */
+	for (pos = 0; pos < len; pos++)
+		if (csv[pos] == ',') fields++;
+	list->num_entries = fields;
+	list->names = (char**) malloc(sizeof(char*) * fields);
+
+	/* walk once more, cutting a field at every comma and at the end of the string */
+	for (pos = 0; pos <= len; pos++) {
+		int width;
+		if (pos != len && csv[pos] != ',') continue;
+		width = pos - start;
+		list->names[slot] = (char*) malloc(width + 1);
+		memcpy(list->names[slot], csv + start, width);
+		list->names[slot][width] = '\0';
+		slot++;
+		start = pos + 1;
+	}
+}
+
+
+/* Interface: */
+
+/* Public wrapper: returns whatever the platform-specific get_total_cpus() reports. */
+int cpuid_get_total_cpus(void)
+{
+	return get_total_cpus();
+}
+
+/* Public wrapper: returns the result of cpuid_exists_by_eflags() unchanged. */
+int cpuid_present(void)
+{
+	return cpuid_exists_by_eflags();
+}
+
+/*
+ * Run CPUID for leaf `eax` with the remaining input registers cleared.
+ * `regs` must have room for four values; it is handed to exec_cpuid().
+ */
+void cpu_exec_cpuid(uint32_t eax, uint32_t* regs)
+{
+	int r;
+
+	for (r = 3; r >= 1; r--)
+		regs[r] = 0;
+	regs[0] = eax;
+	exec_cpuid(regs);
+}
+
+/*
+ * Run CPUID with all four registers supplied by the caller in `regs`
+ * (unlike cpu_exec_cpuid(), nothing is cleared first).
+ */
+void cpu_exec_cpuid_ext(uint32_t* regs)
+{
+	exec_cpuid(regs);
+}
+
+/*
+ * Collect the raw CPUID register dumps of the current CPU into `data`:
+ * the basic leaves, the extended leaves (0x80000000 + i), and the sub-leaves
+ * of leaves 4, 11, 0x12 and 0x14.
+ *
+ * Returns the value of set_error(): ERR_NO_CPUID when cpuid_present()
+ * reports that CPUID is unavailable, ERR_OK otherwise.
+ */
+int cpuid_get_raw_data(struct cpu_raw_data_t* data)
+{
+	unsigned i;
+	if (!cpuid_present())
+		return set_error(ERR_NO_CPUID);
+	/* use the named limits instead of a bare 32, like the sub-leaf loops below */
+	for (i = 0; i < MAX_CPUID_LEVEL; i++)
+		cpu_exec_cpuid(i, data->basic_cpuid[i]);
+	for (i = 0; i < MAX_EXT_CPUID_LEVEL; i++)
+		cpu_exec_cpuid(0x80000000 + i, data->ext_cpuid[i]);
+	/* sub-leaf queries: leaf number goes into [0], sub-leaf index into [2] */
+	for (i = 0; i < MAX_INTELFN4_LEVEL; i++) {
+		memset(data->intel_fn4[i], 0, sizeof(data->intel_fn4[i]));
+		data->intel_fn4[i][0] = 4;
+		data->intel_fn4[i][2] = i;
+		cpu_exec_cpuid_ext(data->intel_fn4[i]);
+	}
+	for (i = 0; i < MAX_INTELFN11_LEVEL; i++) {
+		memset(data->intel_fn11[i], 0, sizeof(data->intel_fn11[i]));
+		data->intel_fn11[i][0] = 11;
+		data->intel_fn11[i][2] = i;
+		cpu_exec_cpuid_ext(data->intel_fn11[i]);
+	}
+	for (i = 0; i < MAX_INTELFN12H_LEVEL; i++) {
+		memset(data->intel_fn12h[i], 0, sizeof(data->intel_fn12h[i]));
+		data->intel_fn12h[i][0] = 0x12;
+		data->intel_fn12h[i][2] = i;
+		cpu_exec_cpuid_ext(data->intel_fn12h[i]);
+	}
+	for (i = 0; i < MAX_INTELFN14H_LEVEL; i++) {
+		memset(data->intel_fn14h[i], 0, sizeof(data->intel_fn14h[i]));
+		data->intel_fn14h[i][0] = 0x14;
+		data->intel_fn14h[i][2] = i;
+		cpu_exec_cpuid_ext(data->intel_fn14h[i]);
+	}
+	return set_error(ERR_OK);
+}
+
+/*
+ * Identify the CPU described by `raw` (or the running CPU when `raw` is
+ * NULL) and fill in `data` and `internal`.
+ *
+ * The generic part is decoded first; Intel and AMD parts are then refined
+ * by their vendor-specific routines. Any negative status aborts early.
+ * The final status is recorded through set_error() and returned.
+ */
+int cpu_ident_internal(struct cpu_raw_data_t* raw, struct cpu_id_t* data, struct internal_id_info_t* internal)
+{
+	struct cpu_raw_data_t local_raw;
+	int status;
+
+	if (!raw) {
+		/* no dump supplied: query the current CPU */
+		status = cpuid_get_raw_data(&local_raw);
+		if (status < 0)
+			return set_error(status);
+		raw = &local_raw;
+	}
+
+	cpu_id_t_constructor(data);
+	status = cpuid_basic_identify(raw, data);
+	if (status < 0)
+		return set_error(status);
+
+	/* other vendors keep the status of the basic identification */
+	if (data->vendor == VENDOR_INTEL)
+		status = cpuid_identify_intel(raw, data, internal);
+	else if (data->vendor == VENDOR_AMD)
+		status = cpuid_identify_amd(raw, data, internal);
+
+	return set_error(status);
+}
+
+/*
+ * Public entry point for CPU identification: same as cpu_ident_internal(),
+ * but the internal detection details are collected into a local and dropped.
+ */
+int cpu_identify(struct cpu_raw_data_t* raw, struct cpu_id_t* data)
+{
+	struct internal_id_info_t discarded;
+	const int status = cpu_ident_internal(raw, data, &discarded);
+
+	return status;
+}
+
+/* Returns the library version, i.e. the build-time VERSION macro. */
+const char* cpuid_lib_version(void)
+{
+	return VERSION;
+}
--- a/compat/libcpuid/intel_code_t.h
+++ b/compat/libcpuid/intel_code_t.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2016  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file contains a list of internal codes we use in detection. It is
+ * of no external use and isn't a complete list of intel products.
+ */
+	CODE2(PENTIUM, 2000),
+	
+	CODE(IRWIN),
+	CODE(POTOMAC),
+	CODE(GAINESTOWN),
+	CODE(WESTMERE),
+	
+	CODE(PENTIUM_M),
+	CODE(NOT_CELERON),	
+	
+	CODE(CORE_SOLO),
+	CODE(MOBILE_CORE_SOLO),
+	CODE(CORE_DUO),
+	CODE(MOBILE_CORE_DUO),
+	
+	CODE(WOLFDALE),
+	CODE(MEROM),
+	CODE(PENRYN),
+	CODE(QUAD_CORE),
+	CODE(DUAL_CORE_HT),
+	CODE(QUAD_CORE_HT),
+	CODE(MORE_THAN_QUADCORE),
+	CODE(PENTIUM_D),
+	
+	CODE(SILVERTHORNE),
+	CODE(DIAMONDVILLE),
+	CODE(PINEVIEW),
+	CODE(CEDARVIEW),
--- a/compat/libcpuid/libcpuid.h
+++ b/compat/libcpuid/libcpuid.h
--- a/compat/libcpuid/libcpuid_constants.h
+++ b/compat/libcpuid/libcpuid_constants.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/**
+ * @File     libcpuid_constants.h
+ * @Author   Veselin Georgiev
+ * @Brief    Some limits and constants for libcpuid
+ */
+
+#ifndef __LIBCPUID_CONSTANTS_H__
+#define __LIBCPUID_CONSTANTS_H__
+
+/* NOTE(review): the *_STR_MAX / *_FLAGS_MAX / CPU_HINTS_MAX values presumably size arrays in libcpuid.h -- confirm there */
+#define VENDOR_STR_MAX		16
+#define BRAND_STR_MAX		64
+#define CPU_FLAGS_MAX		128
+/* cpuid_get_raw_data() reads 32 basic and 32 extended leaves -- presumably these two limits; confirm */
+#define MAX_CPUID_LEVEL		32
+#define MAX_EXT_CPUID_LEVEL	32
+/* number of sub-leaves cpuid_get_raw_data() queries for CPUID leaves 4, 11, 0x12 and 0x14 */
+#define MAX_INTELFN4_LEVEL	8
+#define MAX_INTELFN11_LEVEL	4
+#define MAX_INTELFN12H_LEVEL	4
+#define MAX_INTELFN14H_LEVEL	4
+#define CPU_HINTS_MAX		16
+#define SGX_FLAGS_MAX		14
+
+#endif /* __LIBCPUID_CONSTANTS_H__ */
--- a/compat/libcpuid/libcpuid_internal.h
+++ b/compat/libcpuid/libcpuid_internal.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2016  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __LIBCPUID_INTERNAL_H__
+#define __LIBCPUID_INTERNAL_H__
+/*
+ * This file contains internal undocumented declarations and function prototypes
+ * for the workings of the internal library infrastructure.
+ */
+
+enum _common_codes_t {
+	NA = 0,
+	NC, /* No code */
+};
+
+/*
+ * The *_code_t.h files are plain lists of CODE(name) / CODE2(name, value)
+ * entries. With the two macros below each entry expands to an enumerator
+ * (CODE2 additionally assigns an explicit value), so including a list inside
+ * an enum body defines that enum's members.
+ */
+#define CODE(x) x
+#define CODE2(x, y) x = y
+enum _amd_code_t {
+	#include "amd_code_t.h"
+};
+typedef enum _amd_code_t amd_code_t;
+
+enum _intel_code_t {
+	#include "intel_code_t.h"
+};
+typedef enum _intel_code_t intel_code_t;
+/* the helper macros are only meant for the two includes above */
+#undef CODE
+#undef CODE2
+
+/*
+ * Extra detection details passed as the `internal` argument of
+ * cpu_ident_internal(), which hands it on to the Intel/AMD identification
+ * routines.
+ */
+struct internal_id_info_t {
+	/* vendor-specific internal code; which member is valid presumably follows the detected vendor -- confirm */
+	union {
+		amd_code_t   amd;
+		intel_code_t intel;
+	} code;
+	/* NOTE(review): looks like the bit set that is compared against match_entry_t.model_bits -- confirm in recog_*.c */
+	uint64_t bits;
+	int score; // detection (matchtable) score
+};
+
+int cpu_ident_internal(struct cpu_raw_data_t* raw, struct cpu_id_t* data, 
+		       struct internal_id_info_t* internal);
+
+#endif /* __LIBCPUID_INTERNAL_H__ */
--- a/compat/libcpuid/libcpuid_types.h
+++ b/compat/libcpuid/libcpuid_types.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/**
+ * @File     libcpuid_types.h
+ * @Author   Veselin Georgiev
+ * @Brief    Type specifications for libcpuid.
+ */
+
+#ifndef __LIBCPUID_TYPES_H__
+#define __LIBCPUID_TYPES_H__
+
+#include <stdint.h>
+
+#endif /* __LIBCPUID_TYPES_H__ */
--- a/compat/libcpuid/libcpuid_util.c
+++ b/compat/libcpuid/libcpuid_util.c
@@ -0,0 +1,218 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+#include "libcpuid.h"
+#include "libcpuid_util.h"
+
+int _current_verboselevel;
+
+/*
+ * For every table entry whose bit is set in `reg`, raise the corresponding
+ * feature flag in data->flags[]. Flags are only ever set, never cleared.
+ */
+void match_features(const struct feature_map_t* matchtable, int count, uint32_t reg, struct cpu_id_t* data)
+{
+	int idx = 0;
+
+	while (idx < count) {
+		const struct feature_map_t* entry = &matchtable[idx++];
+		if ((reg >> entry->bit) & 1u)
+			data->flags[entry->feature] = 1;
+	}
+}
+
+/* Default warning sink: writes the message to stderr exactly as given. */
+static void default_warn(const char *msg)
+{
+	fprintf(stderr, "%s", msg);
+}
+
+libcpuid_warn_fn_t _warn_fun = default_warn;
+
+#if defined(_MSC_VER)
+#	define vsnprintf _vsnprintf
+#endif
+/*
+ * printf-style warning: formats the message into a 1 KiB buffer (longer
+ * messages are truncated) and passes it to the installed warning function.
+ * Does nothing when no warning function is installed.
+ */
+void warnf(const char* format, ...)
+{
+	char buff[1024];
+	va_list va;
+	if (!_warn_fun) return;
+	va_start(va, format);
+	vsnprintf(buff, sizeof(buff), format, va);
+	va_end(va);
+	/* MSVC's _vsnprintf leaves the buffer unterminated on truncation */
+	buff[sizeof(buff) - 1] = '\0';
+	_warn_fun(buff);
+}
+
+/*
+ * printf-style debug message, emitted only when `verboselevel` does not
+ * exceed the current verbosity. Formats into a 1 KiB buffer (longer
+ * messages are truncated) and passes it to the installed warning function.
+ * Does nothing when no warning function is installed.
+ */
+void debugf(int verboselevel, const char* format, ...)
+{
+	char buff[1024];
+	va_list va;
+	if (verboselevel > _current_verboselevel) return;
+	/* same guard as warnf(): the sink may have been set to NULL */
+	if (!_warn_fun) return;
+	va_start(va, format);
+	vsnprintf(buff, sizeof(buff), format, va);
+	va_end(va);
+	/* MSVC's _vsnprintf leaves the buffer unterminated on truncation */
+	buff[sizeof(buff) - 1] = '\0';
+	_warn_fun(buff);
+}
+
+/* Count the 1-bits of a 64-bit value by examining it one bit at a time. */
+static int popcount64(uint64_t mask)
+{
+	int ones;
+
+	for (ones = 0; mask != 0; mask >>= 1)
+		ones += (int) (mask & 1u);
+	return ones;
+}
+
+/*
+ * Rate how well a match-table entry fits the detected CPU.
+ * Equal family, model, stepping, ext_family, ext_model, core count,
+ * brand code and model code are worth 2 points each; equal L2 and L3 cache
+ * sizes 1 point each; every bit shared between entry->model_bits and `bits`
+ * adds another 2 points.
+ */
+static int score(const struct match_entry_t* entry, const struct cpu_id_t* data,
+                 int brand_code, uint64_t bits, int model_code)
+{
+	const uint64_t shared_bits = entry->model_bits & bits;
+	int total = 0;
+
+	total += (entry->family     == data->family)     ? 2 : 0;
+	total += (entry->model      == data->model)      ? 2 : 0;
+	total += (entry->stepping   == data->stepping)   ? 2 : 0;
+	total += (entry->ext_family == data->ext_family) ? 2 : 0;
+	total += (entry->ext_model  == data->ext_model)  ? 2 : 0;
+	total += (entry->ncores     == data->num_cores)  ? 2 : 0;
+	total += (entry->l2cache    == data->l2_cache)   ? 1 : 0;
+	total += (entry->l3cache    == data->l3_cache)   ? 1 : 0;
+	total += (entry->brand_code == brand_code)       ? 2 : 0;
+	total += (entry->model_code == model_code)       ? 2 : 0;
+
+	return total + popcount64(shared_bits) * 2;
+}
+
+/*
+ * Pick the table entry with the highest score() for the detected CPU and
+ * copy its name into data->cpu_codename. On equal scores the earlier entry
+ * is kept. Returns the winning score.
+ */
+int match_cpu_codename(const struct match_entry_t* matchtable, int count,
+                       struct cpu_id_t* data, int brand_code, uint64_t bits,
+                       int model_code)
+{
+	int winner = 0;
+	int top = -1;
+	int idx;
+
+	debugf(3, "Matching cpu f:%d, m:%d, s:%d, xf:%d, xm:%d, ncore:%d, l2:%d, bcode:%d, bits:%llu, code:%d\n",
+		data->family, data->model, data->stepping, data->ext_family,
+		data->ext_model, data->num_cores, data->l2_cache, brand_code, (unsigned long long) bits, model_code);
+
+	for (idx = 0; idx < count; idx++) {
+		const struct match_entry_t* cand = &matchtable[idx];
+		const int cur = score(cand, data, brand_code, bits, model_code);
+
+		debugf(3, "Entry %d, `%s', score %d\n", idx, cand->name, cur);
+		if (cur <= top) continue; /* not strictly better than the current best */
+		debugf(2, "Entry `%s' selected - best score so far (%d)\n", cand->name, cur);
+		top = cur;
+		winner = idx;
+	}
+	strcpy(data->cpu_codename, matchtable[winner].name);
+	return top;
+}
+
+/*
+ * Build a list of the distinct CPU names found in a match table.
+ * Entries whose name contains "Unknown" are skipped, and each remaining
+ * name appears only once (first occurrence order). The pointer array and
+ * every name are heap allocated.
+ *
+ * If memory cannot be allocated the list is reported as shorter (possibly
+ * empty) instead of dereferencing a NULL pointer.
+ */
+void generic_get_cpu_list(const struct match_entry_t* matchtable, int count,
+                          struct cpu_list_t* list)
+{
+	int i, j, n, good;
+	char* copy;
+	n = 0;
+	list->num_entries = 0;
+	list->names = (char**) malloc(sizeof(char*) * count);
+	if (!list->names) return; /* nothing to fill in */
+	for (i = 0; i < count; i++) {
+		if (strstr(matchtable[i].name, "Unknown")) continue;
+		/* skip names that were already added */
+		good = 1;
+		for (j = n - 1; j >= 0; j--)
+			if (!strcmp(list->names[j], matchtable[i].name)) {
+				good = 0;
+				break;
+			}
+		if (!good) continue;
+#if defined(_MSC_VER)
+		copy = _strdup(matchtable[i].name);
+#else
+		copy = strdup(matchtable[i].name);
+#endif
+		if (!copy) continue; /* out of memory: leave this name out */
+		list->names[n++] = copy;
+	}
+	list->num_entries = n;
+}
+
+/*
+ * Match one subject character `c` against the pattern element at `p`.
+ *
+ * Pattern elements:
+ *   literal   - matches the same character
+ *   .         - matches any character
+ *   #         - matches a decimal digit
+ *   [abc]     - matches any one of the listed characters
+ *
+ * Returns the number of pattern characters consumed (1, or the full length
+ * of a bracket group), or -1 when `c` is the string terminator, does not
+ * match, or a bracket group is not closed.
+ */
+static int xmatch_entry(char c, const char* p)
+{
+	int i, j;
+	if (c == 0) return -1;
+	if (c == p[0]) return 1;
+	if (p[0] == '.') return 1;
+	/* isdigit() is only defined for unsigned char values (and EOF) */
+	if (p[0] == '#' && isdigit((unsigned char) c)) return 1;
+	if (p[0] == '[') {
+		/* find the closing bracket */
+		j = 1;
+		while (p[j] && p[j] != ']') j++;
+		if (!p[j]) return -1;
+		for (i = 1; i < j; i++)
+			if (p[i] == c) return j + 1;
+	}
+	return -1;
+}
+
+/*
+ * Search `s` for the first position at which the whole pattern `p` matches
+ * (pattern elements as understood by xmatch_entry()).
+ *
+ * Returns the 1-based index of that position, or 0 when there is no match.
+ */
+int match_pattern(const char* s, const char* p)
+{
+	const int slen = (int) strlen(s);
+	const int plen = (int) strlen(p);
+	int start;
+
+	for (start = 0; start < slen; start++) {
+		int consumed = 0; /* pattern characters matched so far */
+		int offset = 0;   /* subject characters matched so far */
+		int step;
+
+		/* cheap rejection: the first element has to match here */
+		if (xmatch_entry(s[start], p) == -1) continue;
+
+		while (consumed < plen) {
+			step = xmatch_entry(s[start + offset], p + consumed);
+			if (step == -1) break;
+			offset++;
+			consumed += step;
+		}
+		if (consumed == plen) return start + 1;
+	}
+	return 0;
+}
+
+/*
+ * Return a pointer to a process-wide cpu_id_t that is filled in by
+ * cpu_identify() on the first call. If identification fails, the structure
+ * is zeroed. Later calls return the same data without re-running detection.
+ */
+struct cpu_id_t* get_cached_cpuid(void)
+{
+	static struct cpu_id_t cached;
+	static int ready = 0;
+
+	if (!ready) {
+		if (cpu_identify(NULL, &cached) != 0)
+			memset(&cached, 0, sizeof(cached));
+		ready = 1;
+	}
+	return &cached;
+}
+
+/* Returns 1 when every bit of `mask` is also set in `bits`, 0 otherwise. */
+int match_all(uint64_t bits, uint64_t mask)
+{
+	const uint64_t missing = mask & ~bits;
+
+	return missing == 0;
+}
+
+/*
+ * Print the set bits of `mask` as "LBIT(a) + LBIT(b) + ..." followed by a
+ * newline, using debugf() at verbosity `debuglevel`.
+ */
+void debug_print_lbits(int debuglevel, uint64_t mask)
+{
+	/* `first` starts out true so that no separator precedes the first item */
+	int i, first = 1;
+	for (i = 0; i < 64; i++) if (mask & (((uint64_t) 1) << i)) {
+		if (first) first = 0;
+		else debugf(debuglevel, " + ");
+		debugf(debuglevel, "LBIT(%d)", i);
+	}
+	debugf(debuglevel, "\n");
+}
--- a/compat/libcpuid/libcpuid_util.h
+++ b/compat/libcpuid/libcpuid_util.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __LIBCPUID_UTIL_H__
+#define __LIBCPUID_UTIL_H__
+
+/* Number of elements of a true array (must not be used on a pointer). */
+#define COUNT_OF(array) (sizeof(array) / sizeof((array)[0]))
+
+/*
+ * A long long with only bit `x' set. The argument is parenthesized so that
+ * expressions such as LBIT(a | b) shift by the whole expression.
+ */
+#define LBIT(x) (((long long) 1) << (x))
+
+/*
+ * One row of a feature lookup table: pairs a bit number with the
+ * cpu_feature_t it stands for. Tables of these rows are handed to
+ * match_features() together with a 32-bit register value.
+ */
+struct feature_map_t {
+	unsigned bit;          /* bit number within the register value */
+	cpu_feature_t feature; /* feature associated with that bit */
+};
+ 
+/*
+ * Applies a feature table of `count' rows to the register value `reg' and
+ * records the outcome in `data'.
+ * NOTE(review): presumably sets data->flags[feature] for each row whose bit
+ * is set in `reg' -- the definition is not visible here, confirm.
+ */
+void match_features(const struct feature_map_t* matchtable, int count,
+                    uint32_t reg, struct cpu_id_t* data);
+
+/*
+ * One row of a CPU codename database. The field order is significant:
+ * the vendor tables initialize rows positionally.
+ */
+struct match_entry_t {
+	int family;
+	int model;
+	int stepping;
+	int ext_family;
+	int ext_model;
+	int ncores;
+	int l2cache;
+	int l3cache;
+	int brand_code;
+	uint64_t model_bits;
+	int model_code;
+	char name[32];
+};
+
+/*
+ * Looks `data' up in `matchtable' (`count' rows), also taking the decoded
+ * brand code, model bits and model code into account.
+ * Returns the match score.
+ */
+int match_cpu_codename(const struct match_entry_t* matchtable, int count,
+                       struct cpu_id_t* data, int brand_code, uint64_t bits,
+                       int model_code);
+
+/* printf-style warning output; the format string is checked by GCC-compatible compilers. */
+void warnf(const char* format, ...)
+#ifdef __GNUC__
+__attribute__((format(printf, 1, 2)))
+#endif
+;
+/*
+ * printf-style debug output tagged with a verbosity level.
+ * NOTE(review): presumably filtered against _current_verboselevel -- confirm.
+ */
+void debugf(int verboselevel, const char* format, ...)
+#ifdef __GNUC__
+__attribute__((format(printf, 2, 3)))
+#endif
+;
+/*
+ * Fills `list' with the distinct CPU names of `matchtable', leaving out
+ * entries whose name contains "Unknown".
+ */
+void generic_get_cpu_list(const struct match_entry_t* matchtable, int count,
+                          struct cpu_list_t* list);
+
+/*
+ * Seek for a pattern in `haystack'.
+ * Pattern may be a fixed string, or contain the special metacharacters
+ * '.' - match any single character
+ * '#' - match any digit
+ * '[<chars>]' - match any of the given chars (regex-like ranges are not
+ *               supported)
+ * Return val: 0 if the pattern is not found. Nonzero if it is found (actually,
+ *             x + 1 where x is the index where the match is found).
+ */
+int match_pattern(const char* haystack, const char* pattern);
+
+/*
+ * Gets an initialized cpu_id_t. It is cached, so that internal libcpuid
+ * machinery doesn't need to issue cpu_identify more than once.
+ */
+struct cpu_id_t* get_cached_cpuid(void);
+
+
+/* returns true if all bits of mask are present in `bits'. */
+int match_all(uint64_t bits, uint64_t mask);
+
+/* print what bits a mask consists of */
+void debug_print_lbits(int debuglevel, uint64_t mask);
+
+/*
+ * Sets the current error code.
+ * NOTE(review): presumably the library's own error state (cpu_error_t), not
+ * the C `errno'; the meaning of the return value is not visible here --
+ * confirm against the definition.
+ */
+int set_error(cpu_error_t err);
+
+/* Defined elsewhere in the library: warning callback and current verbosity level. */
+extern libcpuid_warn_fn_t _warn_fun;
+extern int _current_verboselevel;
+
+#endif /* __LIBCPUID_UTIL_H__ */
--- a/compat/libcpuid/recog_amd.c
+++ b/compat/libcpuid/recog_amd.c
@@ -0,0 +1,549 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include "libcpuid.h"
+#include "libcpuid_util.h"
+#include "libcpuid_internal.h"
+#include "recog_amd.h"
+
+/*
+ * Table pairing each amd_code_t value with its identifier spelled out as a
+ * string (used for debug output). The rows are produced by including
+ * "amd_code_t.h" with CODE()/CODE2() defined to expand to { value, "name" }.
+ * Both helper macros are removed again so they do not leak into the rest of
+ * the file.
+ */
+const struct amd_code_str { amd_code_t code; char *str; } amd_code_str[] = {
+	#define CODE(x) { x, #x }
+	#define CODE2(x, y) CODE(x)
+	#include "amd_code_t.h"
+	#undef CODE2
+	#undef CODE
+};
+
+/* Result of parsing an AMD brand string: a brand code plus a set of flag bits. */
+struct amd_code_and_bits_t {
+	int code;      /* brand code (an amd_code_t value such as NC or PHENOM) */
+	uint64_t bits; /* OR-ed amd_bits_t flags found in the brand string */
+};
+
+/*
+ * Flag bits describing marketing attributes recognized in an AMD brand
+ * string (product line, suffixes, core-count tags). Each flag occupies its
+ * own bit so that several can be OR-ed together into one mask.
+ */
+enum _amd_bits_t {
+	ATHLON_      = LBIT(  0 ),
+	_XP_         = LBIT(  1 ),
+	_M_          = LBIT(  2 ),
+	_MP_         = LBIT(  3 ),
+	MOBILE_      = LBIT(  4 ),
+	DURON_       = LBIT(  5 ),
+	SEMPRON_     = LBIT(  6 ),
+	OPTERON_     = LBIT(  7 ),
+	TURION_      = LBIT(  8 ),
+	_LV_         = LBIT(  9 ),
+	_64_         = LBIT( 10 ),
+	_X2          = LBIT( 11 ),
+	_X3          = LBIT( 12 ),
+	_X4          = LBIT( 13 ),
+	_X6          = LBIT( 14 ),
+	_FX          = LBIT( 15 ),
+};
+typedef enum _amd_bits_t amd_bits_t;
+
+/*
+ * Model codes used to tell apart CPUs that are otherwise identical in the
+ * database. Numbering starts at 1 because 0 is the "no model code" value
+ * used by the database rows and returned when no model number is
+ * recognized; a code of 0 would be indistinguishable from that.
+ */
+enum _amd_model_codes_t {
+	// Only for Ryzen CPUs:
+	_1400 = 1,
+	_1500,
+	_1600,
+};
+
+
+/*
+ * AMD codename database, consumed by match_cpu_codename().
+ *
+ * Columns, in struct match_entry_t order:
+ *   family, model, stepping, ext_family, ext_model, ncores, l2cache,
+ *   l3cache, brand_code, model_bits, model_code, name
+ *
+ * NOTE(review): -1 in a numeric column appears to act as "don't care";
+ * confirm against the scoring code in match_cpu_codename().
+ */
+const struct match_entry_t cpudb_amd[] = {
+	{ -1, -1, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Unknown AMD CPU"               },
+	
+	/* 486 and the likes */
+	{  4, -1, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Unknown AMD 486"               },
+	{  4,  3, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "AMD 486DX2"                    },
+	{  4,  7, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "AMD 486DX2WB"                  },
+	{  4,  8, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "AMD 486DX4"                    },
+	{  4,  9, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "AMD 486DX4WB"                  },
+	
+	/* Pentia clones */
+	{  5, -1, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Unknown AMD 586"               },
+	{  5,  0, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "K5"                            },
+	{  5,  1, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "K5"                            },
+	{  5,  2, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "K5"                            },
+	{  5,  3, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "K5"                            },
+	
+	/* The K6 */
+	{  5,  6, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "K6"                            },
+	{  5,  7, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "K6"                            },
+	
+	{  5,  8, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "K6-2"                          },
+	{  5,  9, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "K6-III"                        },
+	{  5, 10, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Unknown K6"                    },
+	{  5, 11, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Unknown K6"                    },
+	{  5, 12, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Unknown K6"                    },
+	{  5, 13, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "K6-2+"                         },
+	
+	/* Athlon et al. */
+	{  6,  1, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Athlon (Slot-A)"               },
+	{  6,  2, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Athlon (Slot-A)"               },
+	{  6,  3, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Duron (Spitfire)"              },
+	{  6,  4, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Athlon (ThunderBird)"          },
+	
+	{  6,  6, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Unknown Athlon"                },
+	{  6,  6, -1, -1,   -1,   1,    -1,    -1, NC, ATHLON_             ,     0, "Athlon (Palomino)"             },
+	{  6,  6, -1, -1,   -1,   1,    -1,    -1, NC, ATHLON_|_MP_        ,     0, "Athlon MP (Palomino)"          },
+	{  6,  6, -1, -1,   -1,   1,    -1,    -1, NC, DURON_              ,     0, "Duron (Palomino)"              },
+	{  6,  6, -1, -1,   -1,   1,    -1,    -1, NC, ATHLON_|_XP_        ,     0, "Athlon XP"                     },
+	
+	{  6,  7, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Unknown Athlon XP"             },
+	{  6,  7, -1, -1,   -1,   1,    -1,    -1, NC, DURON_              ,     0, "Duron (Morgan)"                },
+	
+	{  6,  8, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Athlon XP"                     },
+	{  6,  8, -1, -1,   -1,   1,    -1,    -1, NC, ATHLON_             ,     0, "Athlon XP (Thoroughbred)"      },
+	{  6,  8, -1, -1,   -1,   1,    -1,    -1, NC, ATHLON_|_XP_        ,     0, "Athlon XP (Thoroughbred)"      },
+	{  6,  8, -1, -1,   -1,   1,    -1,    -1, NC, DURON_              ,     0, "Duron (Applebred)"             },
+	{  6,  8, -1, -1,   -1,   1,    -1,    -1, NC, SEMPRON_            ,     0, "Sempron (Thoroughbred)"        },
+	{  6,  8, -1, -1,   -1,   1,   128,    -1, NC, SEMPRON_            ,     0, "Sempron (Thoroughbred)"        },
+	{  6,  8, -1, -1,   -1,   1,   256,    -1, NC, SEMPRON_            ,     0, "Sempron (Thoroughbred)"        },
+	{  6,  8, -1, -1,   -1,   1,    -1,    -1, NC, ATHLON_|_MP_        ,     0, "Athlon MP (Thoroughbred)"      },
+	{  6,  8, -1, -1,   -1,   1,    -1,    -1, NC, ATHLON_|_XP_|_M_    ,     0, "Mobile Athlon (T-Bred)"        },
+	{  6,  8, -1, -1,   -1,   1,    -1,    -1, NC, ATHLON_|_XP_|_M_|_LV_,    0, "Mobile Athlon (T-Bred)"        },
+	
+	{  6, 10, -1, -1,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Athlon XP (Barton)"            },
+	{  6, 10, -1, -1,   -1,   1,   512,    -1, NC, ATHLON_|_XP_        ,     0, "Athlon XP (Barton)"            },
+	{  6, 10, -1, -1,   -1,   1,   512,    -1, NC, SEMPRON_            ,     0, "Sempron (Barton)"              },
+	{  6, 10, -1, -1,   -1,   1,   256,    -1, NC, SEMPRON_            ,     0, "Sempron (Thorton)"             },
+	{  6, 10, -1, -1,   -1,   1,   256,    -1, NC, ATHLON_|_XP_        ,     0, "Athlon XP (Thorton)"           },
+	{  6, 10, -1, -1,   -1,   1,    -1,    -1, NC, ATHLON_|_MP_        ,     0, "Athlon MP (Barton)"            },
+	{  6, 10, -1, -1,   -1,   1,    -1,    -1, NC, ATHLON_|_XP_|_M_    ,     0, "Mobile Athlon (Barton)"        },
+	{  6, 10, -1, -1,   -1,   1,    -1,    -1, NC, ATHLON_|_XP_|_M_|_LV_,    0, "Mobile Athlon (Barton)"        },
+	
+	/* K8 Architecture */
+	{ 15, -1, -1, 15,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Unknown K8"                    },
+	{ 15, -1, -1, 16,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Unknown K9"                    },
+	
+	{ 15, -1, -1, 15,   -1,   1,    -1,    -1, NC, 0                   ,     0, "Unknown A64"                   },
+	{ 15, -1, -1, 15,   -1,   1,    -1,    -1, NC, OPTERON_            ,     0, "Opteron"                       },
+	{ 15, -1, -1, 15,   -1,   2,    -1,    -1, NC, OPTERON_|_X2        ,     0, "Opteron (Dual Core)"           },
+	{ 15,  3, -1, 15,   -1,   1,    -1,    -1, NC, OPTERON_            ,     0, "Opteron"                       },
+	{ 15,  3, -1, 15,   -1,   2,    -1,    -1, NC, OPTERON_|_X2        ,     0, "Opteron (Dual Core)"           },
+	{ 15, -1, -1, 15,   -1,   1,   512,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (512K)"              },
+	{ 15, -1, -1, 15,   -1,   1,  1024,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (1024K)"             },
+	{ 15, -1, -1, 15,   -1,   1,    -1,    -1, NC, ATHLON_|_FX         ,     0, "Athlon FX"                     },
+	{ 15, -1, -1, 15,   -1,   1,    -1,    -1, NC, ATHLON_|_64_|_FX    ,     0, "Athlon 64 FX"                  },
+	{ 15,  3, -1, 15,   35,   2,    -1,    -1, NC, ATHLON_|_64_|_FX    ,     0, "Athlon 64 FX X2 (Toledo)"      },
+	{ 15, -1, -1, 15,   -1,   2,   512,    -1, NC, ATHLON_|_64_|_X2    ,     0, "Athlon 64 X2 (512K)"           },
+	{ 15, -1, -1, 15,   -1,   2,  1024,    -1, NC, ATHLON_|_64_|_X2    ,     0, "Athlon 64 X2 (1024K)"          },
+	{ 15, -1, -1, 15,   -1,   1,   512,    -1, NC, TURION_|_64_        ,     0, "Turion 64 (512K)"              },
+	{ 15, -1, -1, 15,   -1,   1,  1024,    -1, NC, TURION_|_64_        ,     0, "Turion 64 (1024K)"             },
+	{ 15, -1, -1, 15,   -1,   2,   512,    -1, NC, TURION_|_X2         ,     0, "Turion 64 X2 (512K)"           },
+	{ 15, -1, -1, 15,   -1,   2,  1024,    -1, NC, TURION_|_X2         ,     0, "Turion 64 X2 (1024K)"          },
+	{ 15, -1, -1, 15,   -1,   1,   128,    -1, NC, SEMPRON_            ,     0, "A64 Sempron (128K)"            },
+	{ 15, -1, -1, 15,   -1,   1,   256,    -1, NC, SEMPRON_            ,     0, "A64 Sempron (256K)"            },
+	{ 15, -1, -1, 15,   -1,   1,   512,    -1, NC, SEMPRON_            ,     0, "A64 Sempron (512K)"            },
+	{ 15, -1, -1, 15, 0x4f,   1,   512,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (Orleans/512K)"      },
+	{ 15, -1, -1, 15, 0x5f,   1,   512,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (Orleans/512K)"      },
+	{ 15, -1, -1, 15, 0x2f,   1,   512,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (Venice/512K)"       },
+	{ 15, -1, -1, 15, 0x2c,   1,   512,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (Venice/512K)"       },
+	{ 15, -1, -1, 15, 0x1f,   1,   512,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (Winchester/512K)"   },
+	{ 15, -1, -1, 15, 0x0c,   1,   512,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (Newcastle/512K)"    },
+	{ 15, -1, -1, 15, 0x27,   1,   512,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (San Diego/512K)"    },
+	{ 15, -1, -1, 15, 0x37,   1,   512,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (San Diego/512K)"    },
+	{ 15, -1, -1, 15, 0x04,   1,   512,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (ClawHammer/512K)"   },
+	
+	{ 15, -1, -1, 15, 0x5f,   1,  1024,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (Orleans/1024K)"     },
+	{ 15, -1, -1, 15, 0x27,   1,  1024,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (San Diego/1024K)"   },
+	{ 15, -1, -1, 15, 0x04,   1,  1024,    -1, NC, ATHLON_|_64_        ,     0, "Athlon 64 (ClawHammer/1024K)"  },
+	
+	{ 15, -1, -1, 15, 0x4b,   2,   256,    -1, NC, SEMPRON_            ,     0, "Athlon 64 X2 (Windsor/256K)"   },
+	
+	{ 15, -1, -1, 15, 0x23,   2,   512,    -1, NC, ATHLON_|_64_|_X2    ,     0, "Athlon 64 X2 (Toledo/512K)"    },
+	{ 15, -1, -1, 15, 0x4b,   2,   512,    -1, NC, ATHLON_|_64_|_X2    ,     0, "Athlon 64 X2 (Windsor/512K)"   },
+	{ 15, -1, -1, 15, 0x43,   2,   512,    -1, NC, ATHLON_|_64_|_X2    ,     0, "Athlon 64 X2 (Windsor/512K)"   },
+	{ 15, -1, -1, 15, 0x6b,   2,   512,    -1, NC, ATHLON_|_64_|_X2    ,     0, "Athlon 64 X2 (Brisbane/512K)"  },
+	{ 15, -1, -1, 15, 0x2b,   2,   512,    -1, NC, ATHLON_|_64_|_X2    ,     0, "Athlon 64 X2 (Manchester/512K)"},
+	
+	{ 15, -1, -1, 15, 0x23,   2,  1024,    -1, NC, ATHLON_|_64_|_X2    ,     0, "Athlon 64 X2 (Toledo/1024K)"   },
+	{ 15, -1, -1, 15, 0x43,   2,  1024,    -1, NC, ATHLON_|_64_|_X2    ,     0, "Athlon 64 X2 (Windsor/1024K)"  },
+	
+	{ 15, -1, -1, 15, 0x08,   1,   128,    -1, NC, MOBILE_|SEMPRON_    ,     0, "Mobile Sempron 64 (Dublin/128K)"},
+	{ 15, -1, -1, 15, 0x08,   1,   256,    -1, NC, MOBILE_|SEMPRON_    ,     0, "Mobile Sempron 64 (Dublin/256K)"},
+	{ 15, -1, -1, 15, 0x0c,   1,   256,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Paris)"            },
+	{ 15, -1, -1, 15, 0x1c,   1,   128,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Palermo/128K)"     },
+	{ 15, -1, -1, 15, 0x1c,   1,   256,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Palermo/256K)"     },
+	{ 15, -1, -1, 15, 0x1c,   1,   128,    -1, NC, MOBILE_| SEMPRON_   ,     0, "Mobile Sempron 64 (Sonora/128K)"},
+	{ 15, -1, -1, 15, 0x1c,   1,   256,    -1, NC, MOBILE_| SEMPRON_   ,     0, "Mobile Sempron 64 (Sonora/256K)"},
+	{ 15, -1, -1, 15, 0x2c,   1,   128,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Palermo/128K)"     },
+	{ 15, -1, -1, 15, 0x2c,   1,   256,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Palermo/256K)"     },
+	{ 15, -1, -1, 15, 0x2c,   1,   128,    -1, NC, MOBILE_| SEMPRON_   ,     0, "Mobile Sempron 64 (Albany/128K)"},
+	{ 15, -1, -1, 15, 0x2c,   1,   256,    -1, NC, MOBILE_| SEMPRON_   ,     0, "Mobile Sempron 64 (Albany/256K)"},
+	{ 15, -1, -1, 15, 0x2f,   1,   128,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Palermo/128K)"     },
+	{ 15, -1, -1, 15, 0x2f,   1,   256,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Palermo/256K)"     },
+	{ 15, -1, -1, 15, 0x4f,   1,   128,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Manila/128K)"      },
+	{ 15, -1, -1, 15, 0x4f,   1,   256,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Manila/256K)"      },
+	{ 15, -1, -1, 15, 0x5f,   1,   128,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Manila/128K)"      },
+	{ 15, -1, -1, 15, 0x5f,   1,   256,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Manila/256K)"      },
+	{ 15, -1, -1, 15, 0x6b,   2,   256,    -1, NC, SEMPRON_            ,     0, "Sempron 64 Dual (Sherman/256K)"},
+	{ 15, -1, -1, 15, 0x6b,   2,   512,    -1, NC, SEMPRON_            ,     0, "Sempron 64 Dual (Sherman/512K)"},
+	{ 15, -1, -1, 15, 0x7f,   1,   256,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Sparta/256K)"      },
+	{ 15, -1, -1, 15, 0x7f,   1,   512,    -1, NC, SEMPRON_            ,     0, "Sempron 64 (Sparta/512K)"      },
+	{ 15, -1, -1, 15, 0x4c,   1,   256,    -1, NC, MOBILE_| SEMPRON_   ,     0, "Mobile Sempron 64 (Keene/256K)"},
+	{ 15, -1, -1, 15, 0x4c,   1,   512,    -1, NC, MOBILE_| SEMPRON_   ,     0, "Mobile Sempron 64 (Keene/512K)"},
+	{ 15, -1, -1, 15,   -1,   2,    -1,    -1, NC, SEMPRON_            ,     0, "Sempron Dual Core"             },
+	
+	{ 15, -1, -1, 15, 0x24,   1,   512,    -1, NC, TURION_|_64_        ,     0, "Turion 64 (Lancaster/512K)"    },
+	{ 15, -1, -1, 15, 0x24,   1,  1024,    -1, NC, TURION_|_64_        ,     0, "Turion 64 (Lancaster/1024K)"   },
+	{ 15, -1, -1, 15, 0x48,   2,   256,    -1, NC, TURION_|_X2         ,     0, "Turion X2 (Taylor)"            },
+	{ 15, -1, -1, 15, 0x48,   2,   512,    -1, NC, TURION_|_X2         ,     0, "Turion X2 (Trinidad)"          },
+	{ 15, -1, -1, 15, 0x4c,   1,   512,    -1, NC, TURION_|_64_        ,     0, "Turion 64 (Richmond)"          },
+	{ 15, -1, -1, 15, 0x68,   2,   256,    -1, NC, TURION_|_X2         ,     0, "Turion X2 (Tyler/256K)"        },
+	{ 15, -1, -1, 15, 0x68,   2,   512,    -1, NC, TURION_|_X2         ,     0, "Turion X2 (Tyler/512K)"        },
+	{ 15, -1, -1, 17,    3,   2,   512,    -1, NC, TURION_|_X2         ,     0, "Turion X2 (Griffin/512K)"      },
+	{ 15, -1, -1, 17,    3,   2,  1024,    -1, NC, TURION_|_X2         ,     0, "Turion X2 (Griffin/1024K)"     },
+
+	/* K10 Architecture (2007) */
+	{ 15, -1, -1, 16,   -1,   1,    -1,    -1, PHENOM, 0               ,     0, "Unknown AMD Phenom"            },
+	{ 15,  2, -1, 16,   -1,   1,    -1,    -1, PHENOM, 0               ,     0, "Phenom"                        },
+	{ 15,  2, -1, 16,   -1,   3,    -1,    -1, PHENOM, 0               ,     0, "Phenom X3 (Toliman)"           },
+	{ 15,  2, -1, 16,   -1,   4,    -1,    -1, PHENOM, 0               ,     0, "Phenom X4 (Agena)"             },
+	/* the L2 size of each row below has to agree with the size in its name */
+	{ 15,  2, -1, 16,   -1,   3,   256,    -1, PHENOM, 0               ,     0, "Phenom X3 (Toliman/256K)"      },
+	{ 15,  2, -1, 16,   -1,   3,   512,    -1, PHENOM, 0               ,     0, "Phenom X3 (Toliman/512K)"      },
+	{ 15,  2, -1, 16,   -1,   4,   128,    -1, PHENOM, 0               ,     0, "Phenom X4 (Agena/128K)"        },
+	{ 15,  2, -1, 16,   -1,   4,   256,    -1, PHENOM, 0               ,     0, "Phenom X4 (Agena/256K)"        },
+	{ 15,  2, -1, 16,   -1,   4,   512,    -1, PHENOM,  0              ,     0, "Phenom X4 (Agena/512K)"        },
+	{ 15,  2, -1, 16,   -1,   2,   512,    -1, NC, ATHLON_|_64_|_X2    ,     0, "Athlon X2 (Kuma)"              },
+	/* Phenom II derivates: */
+	{ 15,  4, -1, 16,   -1,   4,    -1,    -1, NC, 0                   ,     0, "Phenom (Deneb-based)"          },
+	{ 15,  4, -1, 16,   -1,   1,  1024,    -1, NC, SEMPRON_            ,     0, "Sempron (Sargas)"              },
+	{ 15,  4, -1, 16,   -1,   2,   512,    -1, PHENOM2, 0              ,     0, "Phenom II X2 (Callisto)"       },
+	{ 15,  4, -1, 16,   -1,   3,   512,    -1, PHENOM2, 0              ,     0, "Phenom II X3 (Heka)"           },
+	{ 15,  4, -1, 16,   -1,   4,   512,    -1, PHENOM2, 0              ,     0, "Phenom II X4"                  },
+	{ 15,  4, -1, 16,    4,   4,   512,    -1, PHENOM2, 0              ,     0, "Phenom II X4 (Deneb)"          },
+	{ 15,  5, -1, 16,    5,   4,   512,    -1, PHENOM2, 0              ,     0, "Phenom II X4 (Deneb)"          },
+	{ 15,  4, -1, 16,   10,   4,   512,    -1, PHENOM2, 0              ,     0, "Phenom II X4 (Zosma)"          },
+	{ 15,  4, -1, 16,   10,   6,   512,    -1, PHENOM2, 0              ,     0, "Phenom II X6 (Thuban)"         },
+	/* Athlon II derivates: */
+	{ 15,  6, -1, 16,    6,   2,   512,    -1, NC, ATHLON_|_X2         ,     0, "Athlon II (Champlain)"         },
+	{ 15,  6, -1, 16,    6,   2,   512,    -1, NC, ATHLON_|_64_|_X2    ,     0, "Athlon II X2 (Regor)"          },
+	{ 15,  6, -1, 16,    6,   2,  1024,    -1, NC, ATHLON_|_64_|_X2    ,     0, "Athlon II X2 (Regor)"          },
+	{ 15,  5, -1, 16,    5,   3,   512,    -1, NC, ATHLON_|_64_|_X3    ,     0, "Athlon II X3 (Rana)"           },
+	{ 15,  5, -1, 16,    5,   4,   512,    -1, NC, ATHLON_|_64_|_X4    ,     0, "Athlon II X4 (Propus)"         },
+	/* Llano APUs (2011): */
+	{ 15,  1, -1, 18,    1,   2,    -1,    -1, FUSION_EA, 0            ,     0, "Llano X2"                      },
+	{ 15,  1, -1, 18,    1,   3,    -1,    -1, FUSION_EA, 0            ,     0, "Llano X3"                      },
+	{ 15,  1, -1, 18,    1,   4,    -1,    -1, FUSION_EA, 0            ,     0, "Llano X4"                      },
+
+	/* Family 14h: Bobcat Architecture (2011) */
+	{ 15,  2, -1, 20,   -1,   1,    -1,    -1, FUSION_C, 0             ,     0, "Brazos Ontario"                },
+	{ 15,  2, -1, 20,   -1,   2,    -1,    -1, FUSION_C, 0             ,     0, "Brazos Ontario (Dual-core)"    },
+	{ 15,  1, -1, 20,   -1,   1,    -1,    -1, FUSION_E, 0             ,     0, "Brazos Zacate"                 },
+	{ 15,  1, -1, 20,   -1,   2,    -1,    -1, FUSION_E, 0             ,     0, "Brazos Zacate (Dual-core)"     },
+	{ 15,  2, -1, 20,   -1,   2,    -1,    -1, FUSION_Z, 0             ,     0, "Brazos Desna (Dual-core)"      },
+
+	/* Family 15h: Bulldozer Architecture (2011) */
+	{ 15, -1, -1, 21,    0,   4,    -1,    -1, NC, 0                   ,     0, "Bulldozer X2"                  },
+	{ 15, -1, -1, 21,    1,   4,    -1,    -1, NC, 0                   ,     0, "Bulldozer X2"                  },
+	{ 15, -1, -1, 21,    1,   6,    -1,    -1, NC, 0                   ,     0, "Bulldozer X3"                  },
+	{ 15, -1, -1, 21,    1,   8,    -1,    -1, NC, 0                   ,     0, "Bulldozer X4"                  },
+	/* 2nd-gen, Piledriver core (2012): */
+	{ 15, -1, -1, 21,    2,   4,    -1,    -1, NC, 0                   ,     0, "Vishera X2"                    },
+	{ 15, -1, -1, 21,    2,   6,    -1,    -1, NC, 0                   ,     0, "Vishera X3"                    },
+	{ 15, -1, -1, 21,    2,   8,    -1,    -1, NC, 0                   ,     0, "Vishera X4"                    },
+	{ 15,  0, -1, 21,   16,   2,    -1,    -1, FUSION_A, 0             ,     0, "Trinity X2"                    },
+	{ 15,  0, -1, 21,   16,   4,    -1,    -1, FUSION_A, 0             ,     0, "Trinity X4"                    },
+	{ 15,  3, -1, 21,   19,   2,    -1,    -1, FUSION_A, 0             ,     0, "Richland X2"                   },
+	{ 15,  3, -1, 21,   19,   4,    -1,    -1, FUSION_A, 0             ,     0, "Richland X4"                   },
+	/* 3rd-gen, Steamroller core (2014): */
+	{ 15,  0, -1, 21,   48,   2,    -1,    -1, FUSION_A, 0             ,     0, "Kaveri X2"                     },
+	{ 15,  0, -1, 21,   48,   4,    -1,    -1, FUSION_A, 0             ,     0, "Kaveri X4"                     },
+	{ 15,  8, -1, 21,   56,   4,    -1,    -1, FUSION_A, 0             ,     0, "Godavari X4"                   },
+	/* 4th-gen, Excavator core (2015): */
+	{ 15,  1, -1, 21,   96,   2,    -1,    -1, FUSION_A, 0             ,     0, "Carrizo X2"                    },
+	{ 15,  1, -1, 21,   96,   4,    -1,    -1, FUSION_A, 0             ,     0, "Carrizo X4"                    },
+	{ 15,  5, -1, 21,  101,   2,    -1,    -1, FUSION_A, 0             ,     0, "Bristol Ridge X2"              },
+	{ 15,  5, -1, 21,  101,   4,    -1,    -1, FUSION_A, 0             ,     0, "Bristol Ridge X4"              },
+	{ 15,  0, -1, 21,  112,   2,    -1,    -1, FUSION_A, 0             ,     0, "Stoney Ridge X2"               },
+	{ 15,  0, -1, 21,  112,   2,    -1,    -1, FUSION_E, 0             ,     0, "Stoney Ridge X2"               },
+
+	/* Family 16h: Jaguar Architecture (2013) */
+	{ 15,  0, -1, 22,    0,   2,    -1,    -1, FUSION_A, 0             ,     0, "Kabini X2"                     },
+	{ 15,  0, -1, 22,    0,   4,    -1,    -1, FUSION_A, 0             ,     0, "Kabini X4"                     },
+	/* 2nd-gen, Puma core (2013): */
+	{ 15,  0, -1, 22,   48,   2,    -1,    -1, FUSION_E, 0             ,     0, "Mullins X2"                    },
+	{ 15,  0, -1, 22,   48,   4,    -1,    -1, FUSION_A, 0             ,     0, "Mullins X4"                    },
+
+	/* Family 17h: Zen Architecture (2017) */
+	{ 15, -1, -1, 23,    1,   8,    -1,    -1, NC, 0                   ,     0, "Ryzen 7"                       },
+	{ 15, -1, -1, 23,    1,   6,    -1,    -1, NC, 0                   , _1600, "Ryzen 5"                       },
+	{ 15, -1, -1, 23,    1,   4,    -1,    -1, NC, 0                   , _1500, "Ryzen 5"                       },
+	{ 15, -1, -1, 23,    1,   4,    -1,    -1, NC, 0                   , _1400, "Ryzen 5"                       },
+	{ 15, -1, -1, 23,    1,   4,    -1,    -1, NC, 0                   ,     0, "Ryzen 3"                       },
+	//{ 15, -1, -1, 23,    1,   4,    -1,    -1, NC, 0                   ,     0, "Raven Ridge"                   }, //TBA
+
+	/* Newer Opterons: */
+	/* NOTE(review): ext_family 22 is the value the Jaguar rows above use;
+	 * Magny-Cours is usually described as a family 10h part, which would be
+	 * 16 like the K10 rows -- confirm before relying on this row. */
+	{ 15,  9, -1, 22,    9,   8,    -1,    -1, NC, OPTERON_            ,     0, "Magny-Cours Opteron"           },
+};
+
+
+/*
+ * Decodes the AMD-specific feature bits from the extended CPUID leaves in
+ * `raw' and records them in `data' through match_features(). A leaf is only
+ * consulted when the maximum extended leaf (ext_cpuid[0][0]) covers it.
+ * When leaf 0x8000001a is available, the SSE unit width is stored as well
+ * and flagged as authoritative.
+ */
+static void load_amd_features(struct cpu_raw_data_t* raw, struct cpu_id_t* data)
+{
+	/* leaf 0x80000001, register index 3 (EDX going by the table name) */
+	const struct feature_map_t matchtable_edx81[] = {
+		{ 20, CPU_FEATURE_NX },
+		{ 22, CPU_FEATURE_MMXEXT },
+		{ 25, CPU_FEATURE_FXSR_OPT },
+		{ 30, CPU_FEATURE_3DNOWEXT },
+		{ 31, CPU_FEATURE_3DNOW },
+	};
+	/* leaf 0x80000001, register index 2 (ECX going by the table name) */
+	const struct feature_map_t matchtable_ecx81[] = {
+		{  1, CPU_FEATURE_CMP_LEGACY },
+		{  2, CPU_FEATURE_SVM },
+		{  5, CPU_FEATURE_ABM },
+		{  6, CPU_FEATURE_SSE4A },
+		{  7, CPU_FEATURE_MISALIGNSSE },
+		{  8, CPU_FEATURE_3DNOWPREFETCH },
+		{  9, CPU_FEATURE_OSVW },
+		{ 10, CPU_FEATURE_IBS },
+		{ 11, CPU_FEATURE_XOP },
+		{ 12, CPU_FEATURE_SKINIT },
+		{ 13, CPU_FEATURE_WDT },
+		{ 16, CPU_FEATURE_FMA4 },
+		{ 21, CPU_FEATURE_TBM },
+	};
+	/* leaf 0x80000007, register index 3 (EDX going by the table name) */
+	const struct feature_map_t matchtable_edx87[] = {
+		{  0, CPU_FEATURE_TS },
+		{  1, CPU_FEATURE_FID },
+		{  2, CPU_FEATURE_VID },
+		{  3, CPU_FEATURE_TTP },
+		{  4, CPU_FEATURE_TM_AMD },
+		{  5, CPU_FEATURE_STC },
+		{  6, CPU_FEATURE_100MHZSTEPS },
+		{  7, CPU_FEATURE_HWPSTATE },
+		/* id 8 is handled in common */
+		{  9, CPU_FEATURE_CPB },
+		{ 10, CPU_FEATURE_APERFMPERF },
+		{ 11, CPU_FEATURE_PFI },
+		{ 12, CPU_FEATURE_PA },
+	};
+	const uint32_t max_ext_leaf = raw->ext_cpuid[0][0];
+
+	if (max_ext_leaf >= 0x80000001) {
+		match_features(matchtable_edx81, COUNT_OF(matchtable_edx81),
+		               raw->ext_cpuid[1][3], data);
+		match_features(matchtable_ecx81, COUNT_OF(matchtable_ecx81),
+		               raw->ext_cpuid[1][2], data);
+	}
+	if (max_ext_leaf >= 0x80000007) {
+		match_features(matchtable_edx87, COUNT_OF(matchtable_edx87),
+		               raw->ext_cpuid[7][3], data);
+	}
+	if (max_ext_leaf >= 0x8000001a) {
+		/* the extended info about the SSE unit size is available */
+		data->detection_hints[CPU_HINT_SSE_SIZE_AUTH] = 1;
+		if (raw->ext_cpuid[0x1a][0] & 1)
+			data->sse_size = 128;
+		else
+			data->sse_size = 64;
+	}
+}
+
+/*
+ * Fills the L1/L2/L3 cache fields of `data' from extended CPUID leaves
+ * 0x80000005 and 0x80000006, when the maximum extended leaf covers them.
+ * The 4-bit associativity codes of leaf 0x80000006 are translated through
+ * a lookup table; a zero L3 size field is reported as l3_cache == 0.
+ */
+static void decode_amd_cache_info(struct cpu_raw_data_t* raw, struct cpu_id_t* data)
+{
+	/* associativity code -> number of ways, indexed by the 4-bit field */
+	const int assoc_table[16] = {
+		0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, 255
+	};
+	const unsigned max_ext_leaf = raw->ext_cpuid[0][0];
+	int l3_units;
+
+	if (max_ext_leaf < 0x80000005)
+		return;
+	data->l1_data_cache = (raw->ext_cpuid[5][2] >> 24) & 0xff;
+	data->l1_assoc = (raw->ext_cpuid[5][2] >> 16) & 0xff;
+	data->l1_cacheline = (raw->ext_cpuid[5][2]) & 0xff;
+	data->l1_instruction_cache = (raw->ext_cpuid[5][3] >> 24) & 0xff;
+
+	if (max_ext_leaf < 0x80000006)
+		return;
+	data->l2_cache = (raw->ext_cpuid[6][2] >> 16) & 0xffff;
+	data->l2_assoc = assoc_table[(raw->ext_cpuid[6][2] >> 12) & 0xf];
+	data->l2_cacheline = (raw->ext_cpuid[6][2]) & 0xff;
+
+	l3_units = (raw->ext_cpuid[6][3] >> 18);
+	if (l3_units <= 0) {
+		data->l3_cache = 0;
+		return;
+	}
+	/* AMD spec says it's a range, but we take the lower bound */
+	data->l3_cache = 512 * l3_units;
+	data->l3_assoc = assoc_table[(raw->ext_cpuid[6][3] >> 12) & 0xf];
+	data->l3_cacheline = (raw->ext_cpuid[6][3]) & 0xff;
+}
+
+/*
+ * Derives data->num_cores and data->num_logical_cpus.
+ *
+ * The logical CPU count is read from bits 23..16 of basic_cpuid[1][1]; the
+ * core count is 1 + the low byte of ext_cpuid[8][2], which is only read
+ * when the maximum extended leaf really reaches 0x80000008. Without the HT
+ * feature flag both counts are reported as 1.
+ */
+static void decode_amd_number_of_cores(struct cpu_raw_data_t* raw, struct cpu_id_t* data)
+{
+	int logical_cpus = -1, num_cores = -1;
+	
+	if (raw->basic_cpuid[0][0] >= 1) {
+		logical_cpus = (raw->basic_cpuid[1][1] >> 16) & 0xff;
+		/* ext_cpuid[0][0] holds the highest extended leaf (0x800000xx), so
+		 * it has to be compared against the full leaf number, as the other
+		 * decoders in this file do */
+		if (raw->ext_cpuid[0][0] >= 0x80000008) {
+			num_cores = 1 + (raw->ext_cpuid[8][2] & 0xff);
+		}
+	}
+	if (data->flags[CPU_FEATURE_HT]) {
+		if (num_cores > 1) {
+			if (data->ext_family >= 23)
+				num_cores /= 2; // e.g., Ryzen 7 reports 16 "real" cores, but they are really just 8.
+			data->num_cores = num_cores;
+			data->num_logical_cpus = logical_cpus;
+		} else {
+			data->num_cores = 1;
+			data->num_logical_cpus = (logical_cpus >= 2 ? logical_cpus : 2);
+		}
+	} else {
+		data->num_cores = data->num_logical_cpus = 1;
+	}
+}
+
+/*
+ * Returns 1 if the brand string `bs' contains a Turion-style model name:
+ * exactly two uppercase letters, a dash and exactly two digits (e.g.
+ * "TL-60"), where the letter pair is one of the known codes. Returns 0
+ * otherwise. The letter pair must be preceded by at least one character.
+ */
+static int amd_has_turion_modelname(const char *bs)
+{
+	/* We search for something like TL-60. Ahh, I miss regexes...*/
+	int i, l, k;
+	char code[3] = {0};
+	const char* codes[] = { "ML", "MT", "MK", "TK", "TL", "RM", "ZM", "" };
+	l = (int) strlen(bs);
+	/* i is the position of the dash; bs[i+3] is at worst the terminator */
+	for (i = 3; i < l - 2; i++) {
+		/* <ctype.h> functions need a value representable as unsigned char */
+		if (bs[i] == '-' &&
+		    isupper((unsigned char) bs[i-1]) && isupper((unsigned char) bs[i-2]) &&
+		    !isupper((unsigned char) bs[i-3]) &&
+		    isdigit((unsigned char) bs[i+1]) && isdigit((unsigned char) bs[i+2]) &&
+		    !isdigit((unsigned char) bs[i+3]))
+		{
+			code[0] = bs[i-2];
+			code[1] = bs[i-1];
+			for (k = 0; codes[k][0]; k++)
+				if (!strcmp(codes[k], code)) return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Parses the AMD brand string `bs' into a brand code and a set of
+ * amd_bits_t flags.
+ *
+ * Flags come from plain substring hits, one pattern match and the Turion
+ * model-name check; they are only ever OR-ed, so their order is
+ * irrelevant. The brand code defaults to NC and is overwritten by the
+ * Phenom check and then by the Fusion patterns, the last hit winning.
+ */
+static struct amd_code_and_bits_t decode_amd_codename_part1(const char *bs)
+{
+	/* substring -> flags added when the substring occurs in the brand string */
+	static const struct {
+		const char* needle;
+		uint64_t flags;
+	} substr_flags[] = {
+		{ "Dual Core",       _X2 },
+		{ "Dual-Core",       _X2 },
+		{ " X2 ",            _X2 },
+		{ " X4 ",            _X4 },
+		{ " X3 ",            _X3 },
+		{ "Opteron",         OPTERON_ },
+		{ "Athlon(tm)",      ATHLON_ },
+		{ "Sempron(tm)",     SEMPRON_ },
+		{ "Duron",           DURON_ },
+		{ " 64 ",            _64_ },
+		{ " FX",             _FX },
+		{ " MP",             _MP_ },
+		{ "Athlon(tm) 64",   ATHLON_ | _64_ },
+		{ "Athlon(tm) II X", ATHLON_ | _64_ },
+		{ "Turion",          TURION_ },
+		{ "mobile",          MOBILE_ },
+		{ "Mobile",          MOBILE_ },
+		{ "XP",              _XP_ },
+		{ "XP-M",            _M_ },
+		{ "(LV)",            _LV_ },
+	};
+	struct amd_code_and_bits_t result;
+	size_t idx;
+
+	result.code = NC;
+	result.bits = 0;
+
+	for (idx = 0; idx < COUNT_OF(substr_flags); idx++) {
+		if (strstr(bs, substr_flags[idx].needle))
+			result.bits |= substr_flags[idx].flags;
+	}
+	if (match_pattern(bs, "Athlon(tm) X#"))
+		result.bits |= ATHLON_ | _64_;
+	if (amd_has_turion_modelname(bs))
+		result.bits |= TURION_;
+
+	if (strstr(bs, "Phenom"))
+		result.code = strstr(bs, "II") ? PHENOM2 : PHENOM;
+
+	/* Fusion model numbers; a later match overrides an earlier one */
+	if (match_pattern(bs, "C-##"))
+		result.code = FUSION_C;
+	if (match_pattern(bs, "E-###"))
+		result.code = FUSION_E;
+	if (match_pattern(bs, "Z-##"))
+		result.code = FUSION_Z;
+	if (match_pattern(bs, "E#-####") || match_pattern(bs, "A#-####"))
+		result.code = FUSION_EA;
+
+	return result;
+}
+
+/*
+ * Maps a Ryzen brand string to a model code by looking for the model
+ * number as a substring. "1600" is tried first, then "1500", then "1400";
+ * 0 is returned when none of them occurs.
+ */
+static int decode_amd_ryzen_model_code(const char* bs)
+{
+	if (strstr(bs, "1600"))
+		return _1600;
+	if (strstr(bs, "1500"))
+		return _1500;
+	if (strstr(bs, "1400"))
+		return _1400;
+	/* no known model number in the brand string */
+	return 0;
+}
+
+/*
+ * Determines the AMD codename for `data' and stores the decoded brand
+ * code, flag bits and match score in `internal'.
+ *
+ * The brand string is parsed first. An "Athlon 64 X2" flag set combined
+ * with an L2 cache below 512 is reclassified as a Sempron. For ext_family
+ * 23 a Ryzen model code is derived as well. The outcome is then looked up
+ * in cpudb_amd. `raw' is not used.
+ */
+static void decode_amd_codename(struct cpu_raw_data_t* raw, struct cpu_id_t* data, struct internal_id_info_t* internal)
+{
+	struct amd_code_and_bits_t parsed = decode_amd_codename_part1(data->brand_str);
+	const int n_codes = (int) COUNT_OF(amd_code_str);
+	char* brand_name = NULL;
+	int ryzen_model = 0;
+	int idx = 0;
+
+	/* find the printable name of the brand code, if it has one */
+	while (idx < n_codes && amd_code_str[idx].code != parsed.code)
+		idx++;
+	if (idx < n_codes)
+		brand_name = amd_code_str[idx].str;
+
+	/* an Athlon 64 X2 with less than 512 of L2 is treated as a Sempron */
+	if (match_all(parsed.bits, ATHLON_|_64_|_X2) && data->l2_cache < 512) {
+		parsed.bits &= ~(ATHLON_ | _64_);
+		parsed.bits |= SEMPRON_;
+	}
+
+	if (brand_name != NULL)
+		debugf(2, "Detected AMD brand code: %d (%s)\n", parsed.code, brand_name);
+	else
+		debugf(2, "Detected AMD brand code: %d\n", parsed.code);
+
+	if (parsed.bits != 0) {
+		debugf(2, "Detected AMD bits: ");
+		debug_print_lbits(2, parsed.bits);
+	}
+
+	/* Ryzen: the four-core 1400/1500 (Ryzen 5) must be told apart from the
+	 * four-core Ryzen 3, which needs the model code */
+	if (data->ext_family == 23)
+		ryzen_model = decode_amd_ryzen_model_code(data->brand_str);
+
+	internal->code.amd = parsed.code;
+	internal->bits = parsed.bits;
+	internal->score = match_cpu_codename(cpudb_amd, COUNT_OF(cpudb_amd), data,
+	                                     parsed.code, parsed.bits, ryzen_model);
+}
+
+/*
+ * AMD-specific identification entry point: decodes features, cache sizes,
+ * core counts and finally the codename from `raw' into `data'/`internal'.
+ * Always returns 0.
+ */
+int cpuid_identify_amd(struct cpu_raw_data_t* raw, struct cpu_id_t* data, struct internal_id_info_t* internal)
+{
+	load_amd_features(raw, data);
+	/* must precede decode_amd_codename(), which reads data->l2_cache */
+	decode_amd_cache_info(raw, data);
+	decode_amd_number_of_cores(raw, data);
+	decode_amd_codename(raw, data, internal);
+	return 0;
+}
+
+/* Fills `list' with the CPU names of the AMD database (cpudb_amd). */
+void cpuid_get_list_amd(struct cpu_list_t* list)
+{
+	generic_get_cpu_list(cpudb_amd, COUNT_OF(cpudb_amd), list);
+}
--- a/compat/libcpuid/recog_amd.h
+++ b/compat/libcpuid/recog_amd.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __RECOG_AMD_H__
+#define __RECOG_AMD_H__
+/* AMD-specific entry points: CPU identification, and listing of the AMD CPU table. */
+int cpuid_identify_amd(struct cpu_raw_data_t* raw, struct cpu_id_t* data, struct internal_id_info_t* internal);
+void cpuid_get_list_amd(struct cpu_list_t* list);
+
+#endif /* __RECOG_AMD_H__ */
--- a/compat/libcpuid/recog_intel.c
+++ b/compat/libcpuid/recog_intel.c
@@ -0,0 +1,935 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <string.h>
+#include <ctype.h>
+#include "libcpuid.h"
+#include "libcpuid_util.h"
+#include "libcpuid_internal.h"
+#include "recog_intel.h"
+
+const struct intel_bcode_str { intel_code_t code; char *str; } intel_bcode_str[] = { /* maps each Intel brand code to its identifier spelled as text */
+	#define CODE(x) { x, #x } /* one row: the enumerator itself and its stringified name */
+	#define CODE2(x, y) CODE(x) /* two-argument form: only the first argument produces a row */
+	#include "intel_code_t.h" /* presumably a list of CODE()/CODE2() invocations - confirm */
+	#undef CODE /* NOTE(review): CODE2 is not undefined here and stays defined for the rest of the file */
+};
+
+typedef struct { /* a brand code together with a 64-bit mask of model bits */
+	int code; /* presumably an intel_code_t value - confirm */
+	uint64_t bits; /* presumably OR-ed intel_bits_t flags - confirm */
+} intel_code_and_bits_t;
+
+enum _intel_model_t { /* model codes; several appear in the second-to-last column of the Xeon rows of cpudb_intel */
+	UNKNOWN = -1,
+	_3000 = 100, /* explicit start value; the enumerators below count up from it */
+	_3100,
+	_3200,
+	X3200,
+	_3300,
+	X3300,
+	_5100,
+	_5200,
+	_5300,
+	_5400,
+	_2xxx, /* Core i[357] 2xxx */
+	_3xxx, /* Core i[357] 3xxx */
+};
+typedef enum _intel_model_t intel_model_t;
+
+enum _intel_bits_t { /* one LBIT() flag per brand attribute; OR-ed combinations such as CORE_|_I_|_7 fill the model-bits column of cpudb_intel */
+	PENTIUM_                = LBIT(  0 ),
+	CELERON_                = LBIT(  1 ),
+	MOBILE_                 = LBIT(  2 ),
+	CORE_                   = LBIT(  3 ),
+	_I_                     = LBIT(  4 ),
+	_M_                     = LBIT(  5 ),
+	_3                      = LBIT(  6 ),
+	_5                      = LBIT(  7 ),
+	_7                      = LBIT(  8 ),
+	XEON_                   = LBIT(  9 ),
+	_MP                     = LBIT( 10 ),
+	ATOM_                   = LBIT( 11 ),
+	
+};
+typedef enum _intel_bits_t intel_bits_t;
+
+const struct match_entry_t cpudb_intel[] = { /* table of known Intel CPUs: each row pairs match criteria with a display name */
+	{ -1, -1, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Unknown Intel CPU"       },
+	/* Columns, as far as the rows suggest: family, model, stepping, ext. family, ext. model, core count, L2 size, L3 size, brand code, model bits, model code, name - TODO confirm against struct match_entry_t. */
+	/* i486 */
+	{  4, -1, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Unknown i486"            },
+	{  4,  0, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "i486 DX-25/33"           },
+	{  4,  1, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "i486 DX-50"              },
+	{  4,  2, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "i486 SX"                 },
+	{  4,  3, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "i486 DX2"                },
+	{  4,  4, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "i486 SL"                 },
+	{  4,  5, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "i486 SX2"                },
+	{  4,  7, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "i486 DX2 WriteBack"      },
+	{  4,  8, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "i486 DX4"                },
+	{  4,  9, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "i486 DX4 WriteBack"      },
+	
+	/* All Pentia:
+	   Pentium 1 */
+	{  5, -1, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Unknown Pentium"         },
+	{  5,  0, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium A-Step"          },
+	{  5,  1, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium 1 (0.8u)"        },
+	{  5,  2, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium 1 (0.35u)"       },
+	{  5,  3, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium OverDrive"       },
+	{  5,  4, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium 1 (0.35u)"       },
+	{  5,  7, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium 1 (0.35u)"       },
+	{  5,  8, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium MMX (0.25u)"     },
+	
+	/* Pentium 2 / 3 / M / Conroe / whatsnext - all P6 based. */
+	{  6, -1, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Unknown P6"              },
+	{  6,  0, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium Pro"             },
+	{  6,  1, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium Pro"             },
+	{  6,  3, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium II (Klamath)"    },
+	{  6,  5, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium II (Deschutes)"  },
+	{  6,  5, -1, -1, -1,   1,    -1,    -1, NC, MOBILE_|PENTIUM_,   0, "Mobile Pentium II (Tonga)"},
+	{  6,  6, -1, -1, -1,   1,    -1,    -1, NC,0              ,     0, "Pentium II (Dixon)"      },
+	
+	{  6,  3, -1, -1, -1,   1,    -1,    -1, NC, XEON_         ,     0, "P-II Xeon (Klamath)"     },
+	{  6,  5, -1, -1, -1,   1,    -1,    -1, NC, XEON_         ,     0, "P-II Xeon (Drake)"       },
+	{  6,  6, -1, -1, -1,   1,    -1,    -1, NC, XEON_         ,     0, "P-II Xeon (Dixon)"       },
+		
+	{  6,  5, -1, -1, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "P-II Celeron (Covington)" },
+	{  6,  6, -1, -1, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "P-II Celeron (Mendocino)" },
+	
+	/* -------------------------------------------------- */
+	
+	{  6,  7, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium III (Katmai)"    },
+	{  6,  8, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium III (Coppermine)"},
+	{  6, 10, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium III (Coppermine)"},
+	{  6, 11, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium III (Tualatin)"  },
+	
+	{  6,  7, -1, -1, -1,   1,    -1,    -1, NC, XEON_         ,     0, "P-III Xeon (Tanner)"     },
+	{  6,  8, -1, -1, -1,   1,    -1,    -1, NC, XEON_         ,     0, "P-III Xeon (Cascades)"   },
+	{  6, 10, -1, -1, -1,   1,    -1,    -1, NC, XEON_         ,     0, "P-III Xeon (Cascades)"   },
+	{  6, 11, -1, -1, -1,   1,    -1,    -1, NC, XEON_         ,     0, "P-III Xeon (Tualatin)"   },
+	
+	{  6,  7, -1, -1, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "P-III Celeron (Katmai)"     },
+	{  6,  8, -1, -1, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "P-III Celeron (Coppermine)" },
+	{  6, 10, -1, -1, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "P-III Celeron (Coppermine)" },
+	{  6, 11, -1, -1, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "P-III Celeron (Tualatin)"   },
+	
+	/* Netburst based (Pentium 4 and later)
+	   classic P4s */
+	{ 15, -1, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Unknown Pentium 4"       },
+	{ 15, -1, -1, 15, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "Unknown P-4 Celeron"     },
+	{ 15, -1, -1, 15, -1,   1,    -1,    -1, NC, XEON_         ,     0, "Unknown Xeon"            },
+	
+	{ 15,  0, -1, 15, -1,   1,    -1,    -1, NC, PENTIUM_      ,     0, "Pentium 4 (Willamette)"  },
+	{ 15,  1, -1, 15, -1,   1,    -1,    -1, NC, PENTIUM_      ,     0, "Pentium 4 (Willamette)"  },
+	{ 15,  2, -1, 15, -1,   1,    -1,    -1, NC, PENTIUM_      ,     0, "Pentium 4 (Northwood)"   },
+	{ 15,  3, -1, 15, -1,   1,    -1,    -1, NC, PENTIUM_      ,     0, "Pentium 4 (Prescott)"    },
+	{ 15,  4, -1, 15, -1,   1,    -1,    -1, NC, PENTIUM_      ,     0, "Pentium 4 (Prescott)"    },
+	{ 15,  6, -1, 15, -1,   1,    -1,    -1, NC, PENTIUM_      ,     0, "Pentium 4 (Cedar Mill)"  },
+	{ 15,  0, -1, 15, -1,   1,    -1,    -1, NC, MOBILE_|PENTIUM_,   0, "Mobile P-4 (Willamette)" },
+	{ 15,  1, -1, 15, -1,   1,    -1,    -1, NC, MOBILE_|PENTIUM_,   0, "Mobile P-4 (Willamette)" },
+	{ 15,  2, -1, 15, -1,   1,    -1,    -1, NC, MOBILE_|PENTIUM_,   0, "Mobile P-4 (Northwood)"  },
+	{ 15,  3, -1, 15, -1,   1,    -1,    -1, NC, MOBILE_|PENTIUM_,   0, "Mobile P-4 (Prescott)"   },
+	{ 15,  4, -1, 15, -1,   1,    -1,    -1, NC, MOBILE_|PENTIUM_,   0, "Mobile P-4 (Prescott)"   },
+	{ 15,  6, -1, 15, -1,   1,    -1,    -1, NC, MOBILE_|PENTIUM_,   0, "Mobile P-4 (Cedar Mill)" },
+	
+	/* server CPUs */
+	{ 15,  0, -1, 15, -1,   1,    -1,    -1, NC, XEON_         ,     0, "Xeon (Foster)"           },
+	{ 15,  1, -1, 15, -1,   1,    -1,    -1, NC, XEON_         ,     0, "Xeon (Foster)"           },
+	{ 15,  2, -1, 15, -1,   1,    -1,    -1, NC, XEON_         ,     0, "Xeon (Prestonia)"        },
+	{ 15,  2, -1, 15, -1,   1,    -1,    -1, NC, XEON_|_MP     ,     0, "Xeon (Gallatin)"         },
+	{ 15,  3, -1, 15, -1,   1,    -1,    -1, NC, XEON_         ,     0, "Xeon (Nocona)"           },
+	{ 15,  4, -1, 15, -1,   1,    -1,    -1, NC, XEON_         ,     0, "Xeon (Nocona)"           },
+	{ 15,  4, -1, 15, -1,   1,    -1,    -1, IRWIN, XEON_      ,     0, "Xeon (Irwindale)"        },
+	{ 15,  4, -1, 15, -1,   1,    -1,    -1, NC, XEON_|_MP     ,     0, "Xeon (Cranford)"         },
+	{ 15,  4, -1, 15, -1,   1,    -1,    -1, POTOMAC, XEON_    ,     0, "Xeon (Potomac)"          },
+	{ 15,  6, -1, 15, -1,   1,    -1,    -1, NC, XEON_         ,     0, "Xeon (Dempsey)"          },
+	
+	/* Pentium Ds */
+	{ 15,  4,  4, 15, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium D (SmithField)"  },
+	{ 15,  4, -1, 15, -1,   1,    -1,    -1, PENTIUM_D, 0      ,     0, "Pentium D (SmithField)"  },
+	{ 15,  4,  7, 15, -1,   1,    -1,    -1, NC, 0             ,     0, "Pentium D (SmithField)"  },
+	{ 15,  6, -1, 15, -1,   1,    -1,    -1, PENTIUM_D, 0      ,     0, "Pentium D (Presler)"     },
+
+	/* Celeron and Celeron Ds */
+	{ 15,  1, -1, 15, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "P-4 Celeron (Willamette)"   },
+	{ 15,  2, -1, 15, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "P-4 Celeron (Northwood)"    },
+	{ 15,  3, -1, 15, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "P-4 Celeron D (Prescott)"   },
+	{ 15,  4, -1, 15, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "P-4 Celeron D (Prescott)"   },
+	{ 15,  6, -1, 15, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "P-4 Celeron D (Cedar Mill)" },
+	
+	/* -------------------------------------------------- */
+	/* Intel Core microarchitecture - P6-based */
+	
+	{  6,  9, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Unknown Pentium M"          },
+	{  6,  9, -1, -1, -1,   1,    -1,    -1, PENTIUM_M, 0      ,     0, "Unknown Pentium M"          },
+	{  6,  9, -1, -1, -1,   1,    -1,    -1, NC, PENTIUM_      ,     0, "Pentium M (Banias)"         },
+	{  6,  9, -1, -1, -1,   1,    -1,    -1, PENTIUM_M, 0      ,     0, "Pentium M (Banias)"         }, /* NOTE(review): same criteria as the PENTIUM_M row two lines up, different name - confirm which should win */
+	{  6,  9, -1, -1, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "Celeron M"                  },
+	{  6, 13, -1, -1, -1,   1,    -1,    -1, NC, PENTIUM_      ,     0, "Pentium M (Dothan)"         },
+	{  6, 13, -1, -1, -1,   1,    -1,    -1, PENTIUM_M, 0      ,     0, "Pentium M (Dothan)"         },
+	{  6, 13, -1, -1, -1,   1,    -1,    -1, NC, CELERON_      ,     0, "Celeron M"                  },
+	
+	{  6, 12, -1, -1, -1,  -1,    -1,    -1, NC, ATOM_         ,     0, "Unknown Atom"               },
+	{  6, 12, -1, -1, -1,  -1,    -1,    -1, DIAMONDVILLE,ATOM_,     0, "Atom (Diamondville)"        },
+	{  6, 12, -1, -1, -1,  -1,    -1,    -1, SILVERTHORNE,ATOM_,     0, "Atom (Silverthorne)"        },
+	{  6, 12, -1, -1, -1,  -1,    -1,    -1, CEDARVIEW, ATOM_  ,     0, "Atom (Cedarview)"           },
+	{  6,  6, -1, -1, -1,  -1,    -1,    -1, CEDARVIEW, ATOM_  ,     0, "Atom (Cedarview)"           },
+	{  6, 12, -1, -1, -1,  -1,    -1,    -1, PINEVIEW, ATOM_   ,     0, "Atom (Pineview)"            },
+	
+	/* -------------------------------------------------- */
+	
+	{  6, 14, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Unknown Yonah"             },
+	{  6, 14, -1, -1, -1,   1,    -1,    -1, CORE_SOLO, 0      ,     0, "Yonah (Core Solo)"         },
+	{  6, 14, -1, -1, -1,   2,    -1,    -1, CORE_DUO, 0       ,     0, "Yonah (Core Duo)"          },
+	{  6, 14, -1, -1, -1,   1,    -1,    -1, CORE_SOLO, MOBILE_,     0, "Yonah (Core Solo)"         },
+	{  6, 14, -1, -1, -1,   2,    -1,    -1, CORE_DUO , MOBILE_,     0, "Yonah (Core Duo)"          },
+	{  6, 14, -1, -1, -1,   1,    -1,    -1, CORE_SOLO, 0      ,     0, "Yonah (Core Solo)"         }, /* NOTE(review): identical to the CORE_SOLO row four lines up */
+	
+	{  6, 15, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Unknown Core 2"            },
+	{  6, 15, -1, -1, -1,   2,  4096,    -1, CORE_DUO, 0       ,     0, "Conroe (Core 2 Duo)"       },
+	{  6, 15, -1, -1, -1,   2,  1024,    -1, CORE_DUO, 0       ,     0, "Conroe (Core 2 Duo) 1024K" },
+	{  6, 15, -1, -1, -1,   2,   512,    -1, CORE_DUO, 0       ,     0, "Conroe (Core 2 Duo) 512K"  },
+	{  6, 15, -1, -1, -1,   4,    -1,    -1, QUAD_CORE, 0      ,     0, "Kentsfield (Core 2 Quad)"  },
+	{  6, 15, -1, -1, -1,   4,  4096,    -1, QUAD_CORE, 0      ,     0, "Kentsfield (Core 2 Quad)"  },
+	{  6, 15, -1, -1, -1, 400,    -1,    -1, MORE_THAN_QUADCORE, 0,  0, "More than quad-core"       },
+	{  6, 15, -1, -1, -1,   2,  2048,    -1, CORE_DUO, 0       ,     0, "Allendale (Core 2 Duo)"    },
+	{  6, 15, -1, -1, -1,   2,    -1,    -1, MOBILE_CORE_DUO, 0,     0, "Merom (Core 2 Duo)"        },
+	{  6, 15, -1, -1, -1,   2,  2048,    -1, MEROM, 0          ,     0, "Merom (Core 2 Duo) 2048K"  },
+	{  6, 15, -1, -1, -1,   2,  4096,    -1, MEROM, 0          ,     0, "Merom (Core 2 Duo) 4096K"  },
+	
+	{  6, 15, -1, -1, 15,   1,    -1,    -1, NC, CELERON_      ,     0, "Conroe-L (Celeron)"        },
+	{  6,  6, -1, -1, 22,   1,    -1,    -1, NC, CELERON_      ,     0, "Conroe-L (Celeron)"        },
+	{  6, 15, -1, -1, 15,   2,    -1,    -1, NC, CELERON_      ,     0, "Conroe-L (Allendale)"      },
+	{  6,  6, -1, -1, 22,   2,    -1,    -1, NC, CELERON_      ,     0, "Conroe-L (Allendale)"      },
+	
+	
+	{  6,  6, -1, -1, 22,   1,    -1,    -1, NC, 0             ,     0, "Unknown Core ?"           },
+	{  6,  7, -1, -1, 23,   1,    -1,    -1, NC, 0             ,     0, "Unknown Core ?"           },
+	{  6,  6, -1, -1, 22, 400,    -1,    -1, MORE_THAN_QUADCORE, 0,  0, "More than quad-core"      },
+	{  6,  7, -1, -1, 23, 400,    -1,    -1, MORE_THAN_QUADCORE, 0,  0, "More than quad-core"      },
+	
+	{  6,  7, -1, -1, 23,   1,    -1,    -1, CORE_SOLO         , 0,  0, "Unknown Core 45nm"        },
+	{  6,  7, -1, -1, 23,   1,    -1,    -1, CORE_DUO          , 0,  0, "Unknown Core 45nm"        },
+	{  6,  7, -1, -1, 23,   2,  1024,    -1, WOLFDALE          , 0,  0, "Celeron Wolfdale 1M"      },
+	{  6,  7, -1, -1, 23,   2,  2048,    -1, WOLFDALE          , 0,  0, "Wolfdale (Core 2 Duo) 2M" },
+	{  6,  7, -1, -1, 23,   2,  3072,    -1, WOLFDALE          , 0,  0, "Wolfdale (Core 2 Duo) 3M" },
+	{  6,  7, -1, -1, 23,   2,  6144,    -1, WOLFDALE          , 0,  0, "Wolfdale (Core 2 Duo) 6M" },
+	{  6,  7, -1, -1, 23,   1,    -1,    -1, MOBILE_CORE_DUO   , 0,  0, "Penryn (Core 2 Duo)"      },
+	{  6,  7, -1, -1, 23,   2,  1024,    -1, PENRYN            , 0,  0, "Penryn (Core 2 Duo)"      },
+	{  6,  7, -1, -1, 23,   2,  3072,    -1, PENRYN            , 0,  0, "Penryn (Core 2 Duo) 3M"   },
+	{  6,  7, -1, -1, 23,   2,  6144,    -1, PENRYN            , 0,  0, "Penryn (Core 2 Duo) 6M"   },
+	{  6,  7, -1, -1, 23,   4,  2048,    -1, NC                , 0,  0, "Yorkfield (Core 2 Quad) 2M"},
+	{  6,  7, -1, -1, 23,   4,  3072,    -1, NC                , 0,  0, "Yorkfield (Core 2 Quad) 3M"},
+	{  6,  7, -1, -1, 23,   4,  6144,    -1, NC                , 0,  0, "Yorkfield (Core 2 Quad) 6M"},
+	
+	/* Core microarchitecture-based Xeons: */
+	{  6, 14, -1, -1, 14,   1,    -1,    -1, NC, XEON_         ,     0, "Xeon LV"                  },
+	{  6, 15, -1, -1, 15,   2,  4096,    -1, NC, XEON_         , _5100, "Xeon (Woodcrest)"         },
+	{  6, 15, -1, -1, 15,   2,  2048,    -1, NC, XEON_         , _3000, "Xeon (Conroe/2M)"         },
+	{  6, 15, -1, -1, 15,   2,  4096,    -1, NC, XEON_         , _3000, "Xeon (Conroe/4M)"         },
+	{  6, 15, -1, -1, 15,   4,  4096,    -1, NC, XEON_         , X3200, "Xeon (Kentsfield)"        },
+	{  6, 15, -1, -1, 15,   4,  4096,    -1, NC, XEON_         , _5300, "Xeon (Clovertown)"        },
+	{  6,  7, -1, -1, 23,   2,  6144,    -1, NC, XEON_         , _3100, "Xeon (Wolfdale)"          },
+	{  6,  7, -1, -1, 23,   2,  6144,    -1, NC, XEON_         , _5200, "Xeon (Wolfdale DP)"       },
+	{  6,  7, -1, -1, 23,   4,  6144,    -1, NC, XEON_         , _5400, "Xeon (Harpertown)"        },
+	{  6,  7, -1, -1, 23,   4,  3072,    -1, NC, XEON_         , X3300, "Xeon (Yorkfield/3M)"      },
+	{  6,  7, -1, -1, 23,   4,  6144,    -1, NC, XEON_         , X3300, "Xeon (Yorkfield/6M)"      },
+
+	/* Nehalem CPUs (45nm): */
+	{  6, 10, -1, -1, 26,   4,    -1,    -1, GAINESTOWN, XEON_ ,     0, "Gainestown (Xeon)"        },
+	{  6, 10, -1, -1, 26,   4,    -1,  4096, GAINESTOWN, XEON_ ,     0, "Gainestown 4M (Xeon)"     },
+	{  6, 10, -1, -1, 26,   4,    -1,  8192, GAINESTOWN, XEON_ ,     0, "Gainestown 8M (Xeon)"     },
+	{  6, 10, -1, -1, 26,   4,    -1,    -1, NC, XEON_|_7      ,     0, "Bloomfield (Xeon)"        },
+	{  6, 10, -1, -1, 26,   4,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Bloomfield (Core i7)"     },
+	{  6, 10, -1, -1, 30,   4,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Lynnfield (Core i7)"      },
+	{  6,  5, -1, -1, 37,   4,    -1,  8192, NC, CORE_|_I_|_5  ,     0, "Lynnfield (Core i5)"      },
+
+	/* Westmere CPUs (32nm): */
+	{  6,  5, -1, -1, 37,   2,    -1,    -1, NC, 0             ,     0, "Unknown Core i3/i5"       },
+	{  6, 12, -1, -1, 44,  -1,    -1,    -1, WESTMERE, XEON_   ,     0, "Westmere (Xeon)"          },
+	{  6, 12, -1, -1, 44,  -1,    -1, 12288, WESTMERE, XEON_   ,     0, "Gulftown (Xeon)"          },
+	{  6, 12, -1, -1, 44,   4,    -1, 12288, NC, CORE_|_I_|_7  ,     0, "Gulftown (Core i7)"       },
+	{  6,  5, -1, -1, 37,   2,    -1,  4096, NC, CORE_|_I_|_5  ,     0, "Clarkdale (Core i5)"      },
+	{  6,  5, -1, -1, 37,   2,    -1,  4096, NC, CORE_|_I_|_3  ,     0, "Clarkdale (Core i3)"      },
+	{  6,  5, -1, -1, 37,   2,    -1,    -1, NC, PENTIUM_      ,     0, "Arrandale"                },
+	{  6,  5, -1, -1, 37,   2,    -1,  4096, NC, CORE_|_I_|_7  ,     0, "Arrandale (Core i7)"      },
+	{  6,  5, -1, -1, 37,   2,    -1,  3072, NC, CORE_|_I_|_5  ,     0, "Arrandale (Core i5)"      },
+	{  6,  5, -1, -1, 37,   2,    -1,  3072, NC, CORE_|_I_|_3  ,     0, "Arrandale (Core i3)"      },
+
+	/* Sandy Bridge CPUs (32nm): */
+	{  6, 10, -1, -1, 42,  -1,    -1,    -1, NC, 0             ,     0, "Unknown Sandy Bridge"     },
+	{  6, 10, -1, -1, 42,  -1,    -1,    -1, NC, XEON_         ,     0, "Sandy Bridge (Xeon)"      },
+	{  6, 10, -1, -1, 42,  -1,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Sandy Bridge (Core i7)"   },
+	{  6, 10, -1, -1, 42,   4,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Sandy Bridge (Core i7)"   },
+	{  6, 10, -1, -1, 42,   4,    -1,    -1, NC, CORE_|_I_|_5  ,     0, "Sandy Bridge (Core i5)"   },
+	{  6, 10, -1, -1, 42,   2,    -1,    -1, NC, CORE_|_I_|_3  ,     0, "Sandy Bridge (Core i3)"   },
+	{  6, 10, -1, -1, 42,   2,    -1,    -1, NC, PENTIUM_      ,     0, "Sandy Bridge (Pentium)"   },
+	{  6, 10, -1, -1, 42,   1,    -1,    -1, NC, CELERON_      ,     0, "Sandy Bridge (Celeron)"   },
+	{  6, 10, -1, -1, 42,   2,    -1,    -1, NC, CELERON_      ,     0, "Sandy Bridge (Celeron)"   },
+	{  6, 13, -1, -1, 45,  -1,    -1,    -1, NC, CORE_|_I_|_3  ,     0, "Sandy Bridge-E"           },
+	{  6, 13, -1, -1, 45,  -1,    -1,    -1, NC, XEON_         ,     0, "Sandy Bridge-E (Xeon)"    },
+
+	/* Ivy Bridge CPUs (22nm): */
+	{  6, 10, -1, -1, 58,  -1,    -1,    -1, NC, XEON_         ,     0, "Ivy Bridge (Xeon)"        },
+	{  6, 10, -1, -1, 58,   4,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Ivy Bridge (Core i7)"     },
+	{  6, 10, -1, -1, 58,   4,    -1,    -1, NC, CORE_|_I_|_5  ,     0, "Ivy Bridge (Core i5)"     },
+	{  6, 10, -1, -1, 58,   2,    -1,    -1, NC, CORE_|_I_|_3  ,     0, "Ivy Bridge (Core i3)"     },
+	{  6, 10, -1, -1, 58,   2,    -1,    -1, NC, PENTIUM_      ,     0, "Ivy Bridge (Pentium)"     },
+	{  6, 10, -1, -1, 58,   1,    -1,    -1, NC, CELERON_      ,     0, "Ivy Bridge (Celeron)"     },
+	{  6, 10, -1, -1, 58,   2,    -1,    -1, NC, CELERON_      ,     0, "Ivy Bridge (Celeron)"     },
+	{  6, 14, -1, -1, 62,  -1,    -1,    -1, NC, 0             ,     0, "Ivy Bridge-E"             },
+	
+	/* Haswell CPUs (22nm): */
+	{  6, 12, -1, -1, 60,  -1,    -1,    -1, NC, XEON_         ,     0, "Haswell (Xeon)"           },
+	{  6, 12, -1, -1, 60,   4,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Haswell (Core i7)"        },
+	{  6,  5, -1, -1, 69,   4,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Haswell (Core i7)"        },
+	{  6,  6, -1, -1, 70,   4,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Haswell (Core i7)"        },
+	{  6, 12, -1, -1, 60,   4,    -1,    -1, NC, CORE_|_I_|_5  ,     0, "Haswell (Core i5)"        },
+	{  6,  5, -1, -1, 69,   4,    -1,    -1, NC, CORE_|_I_|_5  ,     0, "Haswell (Core i5)"        },
+	{  6, 12, -1, -1, 60,   2,    -1,    -1, NC, CORE_|_I_|_5  ,     0, "Haswell (Core i5)"        },
+	{  6,  5, -1, -1, 69,   2,    -1,    -1, NC, CORE_|_I_|_5  ,     0, "Haswell (Core i5)"        },
+	{  6, 12, -1, -1, 60,   2,    -1,    -1, NC, CORE_|_I_|_3  ,     0, "Haswell (Core i3)"        },
+	{  6,  5, -1, -1, 69,   2,    -1,    -1, NC, CORE_|_I_|_3  ,     0, "Haswell (Core i3)"        },
+	{  6, 12, -1, -1, 60,   2,    -1,    -1, NC, PENTIUM_      ,     0, "Haswell (Pentium)"        },
+	{  6, 12, -1, -1, 60,   2,    -1,    -1, NC, CELERON_      ,     0, "Haswell (Celeron)"        },
+	{  6, 12, -1, -1, 60,   1,    -1,    -1, NC, CELERON_      ,     0, "Haswell (Celeron)"        },
+	{  6, 15, -1, -1, 63,  -1,    -1,    -1, NC, 0             ,     0, "Haswell-E"                },
+
+	/* Broadwell CPUs (14nm): */
+	{  6,  7, -1, -1, 71,   4,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Broadwell (Core i7)"      },
+	{  6,  7, -1, -1, 71,   4,    -1,    -1, NC, CORE_|_I_|_5  ,     0, "Broadwell (Core i5)"      },
+	{  6, 13, -1, -1, 61,   4,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Broadwell-U (Core i7)"    },
+	{  6, 13, -1, -1, 61,   2,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Broadwell-U (Core i7)"    },
+	{  6, 13, -1, -1, 61,   2,    -1,    -1, NC, CORE_|_I_|_5  ,     0, "Broadwell-U (Core i5)"    },
+	{  6, 13, -1, -1, 61,   2,    -1,    -1, NC, CORE_|_I_|_3  ,     0, "Broadwell-U (Core i3)"    },
+	{  6, 13, -1, -1, 61,   2,    -1,    -1, NC, PENTIUM_      ,     0, "Broadwell-U (Pentium)"    },
+	{  6, 13, -1, -1, 61,   2,    -1,    -1, NC, CELERON_      ,     0, "Broadwell-U (Celeron)"    },
+	{  6, 13, -1, -1, 61,   2,    -1,    -1, NA, 0             ,     0, "Broadwell-U (Core M)"     }, /* NOTE(review): the only row here with brand code NA instead of NC - confirm intended */
+	{  6, 15, -1, -1, 79,  -1,    -1,    -1, NC, XEON_         ,     0, "Broadwell-E (Xeon)"       },
+	{  6, 15, -1, -1, 79,   2,    -1,    -1, NC, CORE_|_I_|_3  ,     0, "Broadwell-E (Core i3)"    },
+	{  6, 15, -1, -1, 79,   2,    -1,    -1, NC, CORE_|_I_|_5  ,     0, "Broadwell-E (Core i5)"    },
+	{  6, 15, -1, -1, 79,   4,    -1,    -1, NC, CORE_|_I_|_5  ,     0, "Broadwell-E (Core i5)"    },
+	{  6, 15, -1, -1, 79,   2,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Broadwell-E (Core i7)"    },
+	{  6, 15, -1, -1, 79,   4,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Broadwell-E (Core i7)"    },
+
+	/* Skylake CPUs (14nm): */
+	{  6, 14, -1, -1, 94,  -1,    -1,    -1, NC, XEON_         ,     0, "Skylake (Xeon)"           },
+	{  6, 14, -1, -1, 94,   4,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Skylake (Core i7)"        },
+	{  6, 14, -1, -1, 94,   4,    -1,    -1, NC, CORE_|_I_|_5  ,     0, "Skylake (Core i5)"        },
+	{  6, 14, -1, -1, 94,   2,    -1,    -1, NC, CORE_|_I_|_3  ,     0, "Skylake (Core i3)"        },
+	{  6, 14, -1, -1, 94,   2,    -1,    -1, NC, PENTIUM_      ,     0, "Skylake (Pentium)"        },
+	{  6, 14, -1, -1, 78,   2,    -1,    -1, NC, PENTIUM_      ,     0, "Skylake (Pentium)"        },
+	{  6, 14, -1, -1, 94,   2,    -1,    -1, NC, CELERON_      ,     0, "Skylake (Celeron)"        },
+	{  6, 14, -1, -1, 78,   2,    -1,    -1, NC, CELERON_      ,     0, "Skylake (Celeron)"        },
+	{  6, 14, -1, -1, 78,   2,    -1,    -1, NC, CORE_|_M_|_7  ,     0, "Skylake (Core m7)"        },
+	{  6, 14, -1, -1, 78,   2,    -1,    -1, NC, CORE_|_M_|_5  ,     0, "Skylake (Core m5)"        },
+	{  6, 14, -1, -1, 78,   2,    -1,    -1, NC, CORE_|_M_|_3  ,     0, "Skylake (Core m3)"        },
+
+	/* Kaby Lake CPUs (14nm): */
+	{  6, 14, -1, -1, 158,  4,    -1,    -1, NC, CORE_|_I_|_7  ,     0, "Kaby Lake (Core i7)"      },
+	{  6, 14, -1, -1, 158,  4,    -1,    -1, NC, CORE_|_I_|_5  ,     0, "Kaby Lake (Core i5)"      },
+	{  6, 14, -1, -1, 158,  2,    -1,    -1, NC, CORE_|_I_|_3  ,     0, "Kaby Lake (Core i3)"      },
+	{  6, 14, -1, -1, 158,  2,    -1,    -1, NC, PENTIUM_      ,     0, "Kaby Lake (Pentium)"      },
+	{  6, 14, -1, -1, 158,  2,    -1,    -1, NC, CELERON_      ,     0, "Kaby Lake (Celeron)"      },
+	{  6, 14, -1, -1, 158,  2,    -1,    -1, NC, CORE_|_M_|_3  ,     0, "Kaby Lake (Core m3)"      },
+
+	/* Itaniums */
+	{  7, -1, -1, -1, -1,   1,    -1,    -1, NC, 0             ,     0, "Itanium"                  },
+	{ 15, -1, -1, 16, -1,   1,    -1,    -1, NC, 0             ,     0, "Itanium 2"                },
+};
+
+
+static void load_intel_features(struct cpu_raw_data_t* raw, struct cpu_id_t* data)
+{
+	/* Feed Intel-specific CPUID register values and their lookup tables to match_features(), which records results in data. */
+	const struct feature_map_t leaf1_edx[] = { /* rows are { number, feature }; the number is presumably a bit position - confirm */
+		{ 18, CPU_FEATURE_PN },
+		{ 21, CPU_FEATURE_DTS },
+		{ 22, CPU_FEATURE_ACPI },
+		{ 27, CPU_FEATURE_SS },
+		{ 29, CPU_FEATURE_TM },
+		{ 30, CPU_FEATURE_IA64 },
+		{ 31, CPU_FEATURE_PBE },
+	};
+	const struct feature_map_t leaf1_ecx[] = {
+		{  2, CPU_FEATURE_DTS64 },
+		{  4, CPU_FEATURE_DS_CPL },
+		{  5, CPU_FEATURE_VMX },
+		{  6, CPU_FEATURE_SMX },
+		{  7, CPU_FEATURE_EST },
+		{  8, CPU_FEATURE_TM2 },
+		{ 10, CPU_FEATURE_CID },
+		{ 14, CPU_FEATURE_XTPR },
+		{ 15, CPU_FEATURE_PDCM },
+		{ 18, CPU_FEATURE_DCA },
+		{ 21, CPU_FEATURE_X2APIC },
+	};
+	const struct feature_map_t ext1_edx[] = {
+		{ 20, CPU_FEATURE_XD },
+	};
+	const struct feature_map_t leaf7_ebx[] = {
+		{  2, CPU_FEATURE_SGX },
+		{  4, CPU_FEATURE_HLE },
+		{ 11, CPU_FEATURE_RTM },
+		{ 16, CPU_FEATURE_AVX512F },
+		{ 17, CPU_FEATURE_AVX512DQ },
+		{ 18, CPU_FEATURE_RDSEED },
+		{ 19, CPU_FEATURE_ADX },
+		{ 26, CPU_FEATURE_AVX512PF },
+		{ 27, CPU_FEATURE_AVX512ER },
+		{ 28, CPU_FEATURE_AVX512CD },
+		{ 29, CPU_FEATURE_SHA_NI },
+		{ 30, CPU_FEATURE_AVX512BW },
+		{ 31, CPU_FEATURE_AVX512VL },
+	};
+	/* Each table is applied only when the matching [0][0] entry is large enough for the leaf it reads. */
+	if (raw->basic_cpuid[0][0] >= 1) {
+		match_features(leaf1_edx, COUNT_OF(leaf1_edx), raw->basic_cpuid[1][3], data);
+		match_features(leaf1_ecx, COUNT_OF(leaf1_ecx), raw->basic_cpuid[1][2], data);
+	}
+	if (raw->ext_cpuid[0][0] >= 1)
+		match_features(ext1_edx, COUNT_OF(ext1_edx), raw->ext_cpuid[1][3], data);
+	/* Leaf 7 carries the TSX and AVX512 flags, among others. */
+	if (raw->basic_cpuid[0][0] >= 7)
+		match_features(leaf7_ebx, COUNT_OF(leaf7_ebx), raw->basic_cpuid[7][1], data);
+}
+
+enum _cache_type_t { /* selects which cache fields of struct cpu_id_t check_case() writes */
+	L1I, /* L1 instruction cache */
+	L1D, /* L1 data cache */
+	L2,
+	L3,
+	L4
+};
+typedef enum _cache_type_t cache_type_t;
+
+static void check_case(uint8_t on, cache_type_t cache, int size, int assoc, int linesize, struct cpu_id_t* data)
+{
+	/* Store one cache description into data; a zero `on` flag makes the call a no-op. */
+	if (on == 0)
+		return;
+
+	/* Pick the destination fields by cache level. */
+	if (cache == L1I) {
+		/* Only the size is kept for the L1 instruction cache; assoc and linesize are ignored. */
+		data->l1_instruction_cache = size;
+	} else if (cache == L1D) {
+		/* The l1_assoc and l1_cacheline fields are written from the data-cache description. */
+		data->l1_data_cache = size;
+		data->l1_assoc = assoc;
+		data->l1_cacheline = linesize;
+	} else if (cache == L2) {
+		data->l2_cache = size;
+		data->l2_assoc = assoc;
+		data->l2_cacheline = linesize;
+	} else if (cache == L3) {
+		data->l3_cache = size;
+		data->l3_assoc = assoc;
+		data->l3_cacheline = linesize;
+	} else if (cache == L4) {
+		data->l4_cache = size;
+		data->l4_assoc = assoc;
+		data->l4_cacheline = linesize;
+	}
+
+	/* Any other cache value leaves data untouched. */
+}
+
+static void decode_intel_oldstyle_cache_info(struct cpu_raw_data_t* raw, struct cpu_id_t* data)
+{
+	/* Decode the one-byte cache descriptors found in raw->basic_cpuid[2] into the cache fields of data. */
+	/* Rows are applied from top to bottom through check_case(), so among present rows for the same level the last one wins. */
+	/* Columns: descriptor byte, cache level, size (KB), associativity, line size in bytes. */
+	/* Descriptor bytes without a row here (other than 0x49 and 0x40, handled below) are ignored. */
+	static const struct { uint8_t id; cache_type_t level; int size, assoc, line; } known[] = {
+		{ 0x06, L1I,     8,  4,  32 },
+		{ 0x08, L1I,    16,  4,  32 },
+		{ 0x0A, L1D,     8,  2,  32 },
+		{ 0x0C, L1D,    16,  4,  32 },
+		{ 0x22,  L3,   512,  4,  64 },
+		{ 0x23,  L3,  1024,  8,  64 },
+		{ 0x25,  L3,  2048,  8,  64 },
+		{ 0x29,  L3,  4096,  8,  64 },
+		{ 0x2C, L1D,    32,  8,  64 },
+		{ 0x30, L1I,    32,  8,  64 },
+		{ 0x39,  L2,   128,  4,  64 },
+		{ 0x3A,  L2,   192,  6,  64 },
+		{ 0x3B,  L2,   128,  2,  64 },
+		{ 0x3C,  L2,   256,  4,  64 },
+		{ 0x3D,  L2,   384,  6,  64 },
+		{ 0x3E,  L2,   512,  4,  64 },
+		{ 0x41,  L2,   128,  4,  32 },
+		{ 0x42,  L2,   256,  4,  32 },
+		{ 0x43,  L2,   512,  4,  32 },
+		{ 0x44,  L2,  1024,  4,  32 },
+		{ 0x45,  L2,  2048,  4,  32 },
+		{ 0x46,  L3,  4096,  4,  64 },
+		{ 0x47,  L3,  8192,  8,  64 },
+		{ 0x4A,  L3,  6144, 12,  64 },
+		{ 0x4B,  L3,  8192, 16,  64 },
+		{ 0x4C,  L3, 12288, 12,  64 },
+		{ 0x4D,  L3, 16384, 16,  64 },
+		{ 0x4E,  L2,  6144, 24,  64 },
+		{ 0x60, L1D,    16,  8,  64 },
+		{ 0x66, L1D,     8,  4,  64 },
+		{ 0x67, L1D,    16,  4,  64 },
+		{ 0x68, L1D,    32,  4,  64 },
+		/* The next four rows are trace caches; Intel does not specify a cache-line size for them, hence -1. */
+		{ 0x70, L1I,    12,  8,  -1 },
+		{ 0x71, L1I,    16,  8,  -1 },
+		{ 0x72, L1I,    32,  8,  -1 },
+		{ 0x73, L1I,    64,  8,  -1 },
+
+		{ 0x78,  L2,  1024,  4,  64 },
+		{ 0x79,  L2,   128,  8,  64 },
+		{ 0x7A,  L2,   256,  8,  64 },
+		{ 0x7B,  L2,   512,  8,  64 },
+		{ 0x7C,  L2,  1024,  8,  64 },
+		{ 0x7D,  L2,  2048,  8,  64 },
+		{ 0x7F,  L2,   512,  2,  64 },
+		{ 0x82,  L2,   256,  8,  32 },
+		{ 0x83,  L2,   512,  8,  32 },
+		{ 0x84,  L2,  1024,  8,  32 },
+		{ 0x85,  L2,  2048,  8,  32 },
+		{ 0x86,  L2,   512,  4,  64 },
+		{ 0x87,  L2,  1024,  8,  64 },
+	};
+	uint8_t present[256] = {0};
+	uint32_t word;
+	int reg, shift, k;
+
+	/* Mark every byte value found in the four registers; a register with bit 31 set is skipped as a whole. */
+	for (reg = 0; reg < 4; reg++) {
+		word = raw->basic_cpuid[2][reg];
+		if ((word & 0x80000000) != 0)
+			continue;
+		for (shift = 0; shift < 32; shift += 8)
+			present[(word >> shift) & 0xff] = 1;
+	}
+
+	/* Apply the table in order; check_case() ignores rows whose descriptor was not seen. */
+	for (k = 0; k < COUNT_OF(known); k++)
+		check_case(present[known[k].id], known[k].level, known[k].size, known[k].assoc, known[k].line, data);
+
+	if (present[0x49]) {
+		/* Descriptor 0x49 is overloaded: on Xeon MP (family 0xf, model 0x6) it describes the L3 cache, */
+		/* on all other CPUs (notably Conroe et al) it describes the L2 cache. */
+		/* In both cases it means 4MB, 16-way associative, 64-byte line size. */
+		if (data->family == 0xf && data->model == 0x6)
+			check_case(1, L3, 4096, 16, 64, data);
+		else
+			check_case(1, L2, 4096, 16, 64, data);
+	}
+
+	if (present[0x40]) {
+		/* Another special descriptor: */
+		/* 1) if no L2 was specified (l2_cache is still -1), the CPU has no L2, so it is set to 0; */
+		/* 2) if L2 was specified by other descriptors, the CPU has no L3. */
+		if (data->l2_cache != -1)
+			data->l3_cache = 0;
+		else
+			data->l2_cache = 0;
+	}
+}
+
+/*
+ * Decode the cache descriptions cached in raw->intel_fn4, one entry per
+ * sub-leaf. For each entry the cache level/type is mapped to a cache_type_t,
+ * the geometry is extracted from registers 1 and 2, and the resulting size
+ * (in KB), associativity and line size are stored through check_case().
+ * Decoding stops at the first entry whose type field is 0.
+ */
+static void decode_intel_deterministic_cache_info(struct cpu_raw_data_t* raw,
+                                                  struct cpu_id_t* data)
+{
+	int ecx;
+	int ways, partitions, linesize, sets, size, level, typenumber;
+	cache_type_t type;
+	for (ecx = 0; ecx < MAX_INTELFN4_LEVEL; ecx++) {
+		/* register 0: bits [0, 5) = cache type, bits [5, 8) = cache level */
+		typenumber = raw->intel_fn4[ecx][0] & 0x1f;
+		if (typenumber == 0) break;
+		level = (raw->intel_fn4[ecx][0] >> 5) & 0x7;
+		/* only these level/type pairs are recognized; level 1 distinguishes
+		 * type 1 (stored as L1D) from type 2 (stored as L1I), higher levels
+		 * are accepted with type 3 only */
+		if (level == 1 && typenumber == 1)
+			type = L1D;
+		else if (level == 1 && typenumber == 2)
+			type = L1I;
+		else if (level == 2 && typenumber == 3)
+			type = L2;
+		else if (level == 3 && typenumber == 3)
+			type = L3;
+		else if (level == 4 && typenumber == 3)
+			type = L4;
+		else {
+			warnf("deterministic_cache: unknown level/typenumber combo (%d/%d), cannot\n", level, typenumber);
+			warnf("deterministic_cache: recognize cache type\n");
+			continue;
+		}
+		/* register 1 packs three "value minus one" fields:
+		 * bits [22, 32) ways, bits [12, 22) partitions, bits [0, 12) line size;
+		 * register 2 holds the number of sets minus one */
+		ways = ((raw->intel_fn4[ecx][1] >> 22) & 0x3ff) + 1;
+		partitions = ((raw->intel_fn4[ecx][1] >> 12) & 0x3ff) + 1;
+		linesize = (raw->intel_fn4[ecx][1] & 0xfff) + 1;
+		sets = raw->intel_fn4[ecx][2] + 1;
+		/* total bytes converted to KB; the product is computed in int */
+		size = ways * partitions * linesize * sets / 1024;
+		check_case(1, type, size, ways, linesize, data);
+	}
+}
+
+/*
+ * Derive the logical CPU and core counts from the extended topology data
+ * cached in raw->intel_fn11.
+ *
+ * Every cached entry is inspected: level type 0x01 supplies the SMT count and
+ * level type 0x02 the core-level count, both taken from the low 16 bits of
+ * register 1.
+ *
+ * Returns 1 when data->num_logical_cpus and data->num_cores were filled in,
+ * or 0 when the topology data is missing or unusable, in which case the caller
+ * is expected to fall back to another detection method.
+ */
+static int decode_intel_extended_topology(struct cpu_raw_data_t* raw,
+                                           struct cpu_id_t* data)
+{
+	int i, level_type, num_smt = -1, num_core = -1;
+	for (i = 0; i < MAX_INTELFN11_LEVEL; i++) {
+		level_type = (raw->intel_fn11[i][2] & 0xff00) >> 8;
+		switch (level_type) {
+			case 0x01:
+				num_smt = raw->intel_fn11[i][1] & 0xffff;
+				break;
+			case 0x02:
+				num_core = raw->intel_fn11[i][1] & 0xffff;
+				break;
+			default:
+				break;
+		}
+	}
+	/* num_smt is the divisor below: an SMT level that reports zero threads
+	 * (possible when the CPUID data is nonsensical) would raise a division
+	 * by zero, so it is handled like missing data.
+	 */
+	if (num_smt <= 0 || num_core == -1) return 0;
+	data->num_logical_cpus = num_core;
+	data->num_cores = num_core / num_smt;
+	// make sure num_cores is at least 1. In VMs, the CPUID instruction
+	// is rigged and may give nonsensical results, but we should at least
+	// avoid outputs like data->num_cores == 0.
+	if (data->num_cores <= 0) data->num_cores = 1;
+	return 1;
+}
+
+/*
+ * Fill in data->num_cores and data->num_logical_cpus.
+ *
+ * The extended topology decoder is tried first when the highest basic leaf is
+ * at least 11; if it succeeds nothing else is done. Otherwise the counts are
+ * taken from basic leaves 1 and 4 and combined with the HT feature flag.
+ */
+static void decode_intel_number_of_cores(struct cpu_raw_data_t* raw,
+                                         struct cpu_id_t* data)
+{
+	/* -1 marks "not reported by the CPU" */
+	int logical_cpus = -1, num_cores = -1;
+	
+	if (raw->basic_cpuid[0][0] >= 11) {
+		if (decode_intel_extended_topology(raw, data)) return;
+	}
+	
+	if (raw->basic_cpuid[0][0] >= 1) {
+		/* leaf 1, register 1, bits [16, 24): logical CPU count */
+		logical_cpus = (raw->basic_cpuid[1][1] >> 16) & 0xff;
+		if (raw->basic_cpuid[0][0] >= 4) {
+			/* leaf 4, register 0, bits [26, 32): core count minus one */
+			num_cores = 1 + ((raw->basic_cpuid[4][0] >> 26) & 0x3f);
+		}
+	}
+	if (data->flags[CPU_FEATURE_HT]) {
+		if (num_cores > 1) {
+			data->num_cores = num_cores;
+			data->num_logical_cpus = logical_cpus;
+		} else {
+			data->num_cores = 1;
+			data->num_logical_cpus = (logical_cpus >= 1 ? logical_cpus : 1);
+			/* a single logical CPU contradicts the HT flag: clear it */
+			if (data->num_logical_cpus == 1)
+				data->flags[CPU_FEATURE_HT] = 0;
+		}
+	} else {
+		/* without the HT flag the CPU is reported as one core, one thread */
+		data->num_cores = data->num_logical_cpus = 1;
+	}
+}
+
+/*
+ * Derive the Intel brand code and the brand bit mask for a CPU.
+ *
+ * The primary input is data->brand_str, matched against the pattern tables
+ * below; the result is then refined using cache, family/model and core-count
+ * information already stored in data. Both values are returned together in an
+ * intel_code_and_bits_t.
+ */
+static intel_code_and_bits_t get_brand_code_and_bits(struct cpu_id_t* data)
+{
+	intel_code_t code = (intel_code_t) NC;
+	intel_code_and_bits_t result;
+	uint64_t bits = 0;
+	int i = 0;
+	const char* bs = data->brand_str;
+	const char* s;
+	/* brand string pattern -> brand code; the first matching row wins */
+	const struct { intel_code_t c; const char *search; } matchtable[] = {
+		{ PENTIUM_M, "Pentium(R) M" },
+		{ CORE_SOLO, "Pentium(R) Dual  CPU" },
+		{ CORE_SOLO, "Pentium(R) Dual-Core" },
+		{ PENTIUM_D, "Pentium(R) D" },
+		{ CORE_SOLO, "Genuine Intel(R) CPU" },
+		{ CORE_SOLO, "Intel(R) Core(TM)" },
+		{ DIAMONDVILLE, "CPU [N ][23]## " },
+		{ SILVERTHORNE, "CPU Z" },
+		{ PINEVIEW, "CPU [ND][45]## " },
+		{ CEDARVIEW, "CPU [ND]#### " },
+	};
+	
+	/* brand string pattern -> bit; every matching row contributes its bit */
+	const struct { uint64_t bit; const char* search; } bit_matchtable[] = {
+		{ XEON_, "Xeon" },
+		{ _MP, " MP" },
+		{ ATOM_, "Atom(TM) CPU" },
+		{ MOBILE_, "Mobile" },
+		{ CELERON_, "Celeron" },
+		{ PENTIUM_, "Pentium" },
+	};
+	
+	for (i = 0; i < COUNT_OF(bit_matchtable); i++) {
+		if (match_pattern(bs, bit_matchtable[i].search))
+			bits |= bit_matchtable[i].bit;
+	}
+	
+	/* match_pattern()'s non-zero result is used as a position here
+	 * (presumably 1-based, hence the i-- below - TODO confirm): bs[i + 9] is
+	 * the character following "Core(TM) " and bs[i + 10] the one after it.
+	 */
+	if ((i = match_pattern(bs, "Core(TM) [im][357]")) != 0) {
+		bits |= CORE_;
+		i--;
+		switch (bs[i + 9]) {
+			case 'i': bits |= _I_; break;
+			case 'm': bits |= _M_; break;
+		}
+		switch (bs[i + 10]) {
+			case '3': bits |= _3; break;
+			case '5': bits |= _5; break;
+			case '7': bits |= _7; break;
+		}
+	}
+	for (i = 0; i < COUNT_OF(matchtable); i++)
+		if (match_pattern(bs, matchtable[i].search)) {
+			code = matchtable[i].c;
+			break;
+		}
+	debugf(2, "intel matchtable result is %d\n", code);
+	/* Xeon refinements based on the model number pattern in the brand string */
+	if (bits & XEON_) {
+		if (match_pattern(bs, "W35##") || match_pattern(bs, "[ELXW]75##"))
+			bits |= _7;
+		else if (match_pattern(bs, "[ELXW]55##"))
+			code = GAINESTOWN;
+		else if (match_pattern(bs, "[ELXW]56##"))
+			code = WESTMERE;
+		else if (data->l3_cache > 0 && data->family == 16)
+			/* restrict by family, since later Xeons also have L3 ... */
+			code = IRWIN;
+	}
+	if (match_all(bits, XEON_ + _MP) && data->l3_cache > 0)
+		code = POTOMAC;
+	/* a model name starting with 'T' right after "CPU" sets the mobile bit */
+	if (code == CORE_SOLO) {
+		s = strstr(bs, "CPU");
+		if (s) {
+			s += 3;
+			while (*s == ' ') s++;
+			if (*s == 'T')
+				bits |= MOBILE_;
+		}
+	}
+	/* CORE_SOLO is only a placeholder: refine it by the detected core count */
+	if (code == CORE_SOLO) {
+		switch (data->num_cores) {
+			case 1: break;
+			case 2:
+			{
+				code = CORE_DUO;
+				if (data->num_logical_cpus > 2)
+					code = DUAL_CORE_HT;
+				break;
+			}
+			case 4:
+			{
+				code = QUAD_CORE;
+				if (data->num_logical_cpus > 4)
+					code = QUAD_CORE_HT;
+				break;
+			}
+			default:
+				code = MORE_THAN_QUADCORE; break;
+		}
+	}
+	
+	/* mobile dual cores (other than model 14) are split by extended model */
+	if (code == CORE_DUO && (bits & MOBILE_) && data->model != 14) {
+		if (data->ext_model < 23) {
+			code = MEROM;
+		} else {
+			code = PENRYN;
+		}
+	}
+	if (data->ext_model == 23 &&
+		(code == CORE_DUO || code == PENTIUM_D || (bits & CELERON_))) {
+		code = WOLFDALE;
+	}
+
+	result.code = code;
+	result.bits = bits;
+	return result;
+}
+
+/*
+ * Extract a coarse model-number class from data->brand_str.
+ *
+ * For "Core(TM) i3/i5/i7" brand strings only the generation digit is examined
+ * ('2' -> _2xxx, '3' -> _3xxx). For every other brand string the text after
+ * "CPU" is parsed: upper-case letters in front of the number are remembered as
+ * flags and a four-digit model number is mapped onto the _3000 ... _5400
+ * classes. UNKNOWN is returned whenever the string does not have the expected
+ * shape.
+ */
+static intel_model_t get_model_code(struct cpu_id_t* data)
+{
+	int i = 0;
+	int l = (int) strlen(data->brand_str);
+	const char *bs = data->brand_str;
+	int mod_flags = 0, model_no = 0, ndigs = 0;
+	/* If the CPU is a Core ix, then just return the model number generation: */
+	if ((i = match_pattern(bs, "Core(TM) i[357]")) != 0) {
+		i += 11;
+		if (i + 4 >= l) return UNKNOWN;
+		if (bs[i] == '2') return _2xxx;
+		if (bs[i] == '3') return _3xxx;
+		return UNKNOWN;
+	}
+	
+	/* For Core2-based Xeons: */
+	while (i < l - 3) {
+		if (bs[i] == 'C' && bs[i+1] == 'P' && bs[i+2] == 'U')
+			break;
+		i++;
+	}
+	if (i >= l - 3) return UNKNOWN;
+	i += 3;
+	while (i < l - 4 && bs[i] == ' ') i++;
+	if (i >= l - 4) return UNKNOWN;
+	/* isdigit() is only defined for unsigned char values and EOF, while the
+	 * brand string is plain char and may hold bytes >= 0x80; hence the casts.
+	 */
+	while (i < l - 4 && !isdigit((unsigned char) bs[i])) {
+		/* bit N of mod_flags records that letter 'A' + N precedes the number */
+		if (bs[i] >= 'A' && bs[i] <= 'Z')
+			mod_flags |= (1 << (bs[i] - 'A'));
+		i++;
+	}
+	if (i >= l - 4) return UNKNOWN;
+	/* the terminating NUL is not a digit, so this loop cannot overrun */
+	while (isdigit((unsigned char) bs[i])) {
+		ndigs++;
+		model_no = model_no * 10 + (int) (bs[i] - '0');
+		i++;
+	}
+	if (ndigs != 4) return UNKNOWN;
+#define HAVE(ch, flags) ((flags & (1 << ((int)(ch-'A')))) != 0)
+	switch (model_no / 100) {
+		case 30: return _3000;
+		case 31: return _3100;
+		case 32:
+		{
+			return (HAVE('X', mod_flags)) ? X3200 : _3200;
+		}
+		case 33:
+		{
+			return (HAVE('X', mod_flags)) ? X3300 : _3300;
+		}
+		case 51: return _5100;
+		case 52: return _5200;
+		case 53: return _5300;
+		case 54: return _5400;
+		default:
+			return UNKNOWN;
+	}
+#undef HAVE
+}
+
+/*
+ * Decode the SGX information of basic leaf 0x12 into data->sgx.
+ *
+ * Nothing is written when the highest basic leaf is below 0x12 or when
+ * register 0 of leaf 0x12 is zero. Otherwise the SGX1/SGX2 flags, the
+ * MISCSELECT value and the maximum enclave sizes come from
+ * raw->basic_cpuid[0x12], the SECS attributes from raw->intel_fn12h[1], and
+ * the EPC sections are counted by calling cpuid_get_epc() until it returns a
+ * section of length zero.
+ */
+static void decode_intel_sgx_features(const struct cpu_raw_data_t* raw, struct cpu_id_t* data)
+{
+	struct cpu_epc_t epc;
+	int i;
+	
+	if (raw->basic_cpuid[0][0] < 0x12) return; // no 12h leaf
+	if (raw->basic_cpuid[0x12][0] == 0) return; // no sub-leafs available, probably it's disabled by BIOS
+	
+	// decode sub-leaf 0:
+	if (raw->basic_cpuid[0x12][0] & 1) data->sgx.flags[INTEL_SGX1] = 1;
+	if (raw->basic_cpuid[0x12][0] & 2) data->sgx.flags[INTEL_SGX2] = 1;
+	if (data->sgx.flags[INTEL_SGX1] || data->sgx.flags[INTEL_SGX2])
+		data->sgx.present = 1;
+	data->sgx.misc_select = raw->basic_cpuid[0x12][1];
+	data->sgx.max_enclave_32bit = (raw->basic_cpuid[0x12][3]     ) & 0xff;
+	data->sgx.max_enclave_64bit = (raw->basic_cpuid[0x12][3] >> 8) & 0xff;
+	
+	// decode sub-leaf 1:
+	// each 64-bit value is assembled from two registers: low half | (high half << 32)
+	data->sgx.secs_attributes = raw->intel_fn12h[1][0] | (((uint64_t) raw->intel_fn12h[1][1]) << 32);
+	data->sgx.secs_xfrm       = raw->intel_fn12h[1][2] | (((uint64_t) raw->intel_fn12h[1][3]) << 32);
+	
+	// decode higher-order subleafs, whenever present:
+	// -1 means "no terminating section found yet"; the scan is capped at 1000000 tries
+	data->sgx.num_epc_sections = -1;
+	for (i = 0; i < 1000000; i++) {
+		epc = cpuid_get_epc(i, raw);
+		if (epc.length == 0) {
+			debugf(2, "SGX: epc section request for %d returned null, no more EPC sections.\n", i);
+			data->sgx.num_epc_sections = i;
+			break;
+		}
+	}
+	if (data->sgx.num_epc_sections == -1) {
+		debugf(1, "SGX: warning: seems to be infinitude of EPC sections.\n");
+		data->sgx.num_epc_sections = 1000000;
+	}
+}
+
+/*
+ * Return the EPC section with the given index, i.e. leaf 0x12, sub-leaf
+ * 2 + index. The registers are taken from raw->intel_fn12h when raw is given
+ * and the index is below MAX_INTELFN12H_LEVEL - 2; otherwise CPUID is executed
+ * directly. A zeroed structure is returned unless the low four bits of
+ * register 0 equal 1.
+ */
+struct cpu_epc_t cpuid_get_epc(int index, const struct cpu_raw_data_t* raw)
+{
+	struct cpu_epc_t epc = {0, 0};
+	uint32_t r[4];
+	const int cached = raw && index < MAX_INTELFN12H_LEVEL - 2;
+
+	if (!cached) {
+		// not part of the raw dump: run CPUID for this sub-leaf
+		r[0] = 0x12;
+		r[2] = 2 + index;
+		r[1] = r[3] = 0;
+		cpu_exec_cpuid_ext(r);
+	} else {
+		// already captured in the raw dump
+		memcpy(r, raw->intel_fn12h[2 + index], sizeof(r));
+	}
+
+	// any other sub-leaf type yields the zeroed structure
+	if ((r[0] & 0xf) != 0x1)
+		return epc;
+
+	// registers 0/1 hold the base address, registers 2/3 the size:
+	// bits [12, 32) of the low register stay in place, bits [0, 20) of the
+	// high register become bits [32, 52)
+	epc.start_addr |= (r[0] & 0xfffff000);
+	epc.start_addr |= ((uint64_t) (r[1] & 0x000fffff)) << 32;
+	epc.length     |= (r[2] & 0xfffff000);
+	epc.length     |= ((uint64_t) (r[3] & 0x000fffff)) << 32;
+	return epc;
+}
+
+/*
+ * Intel-specific identification step.
+ *
+ * Loads the Intel feature flags, decodes the cache information (leaf 4 data is
+ * preferred, the descriptor-based leaf 2 data is the fallback), determines
+ * the core counts, computes the brand code/bits and model code, optionally
+ * decodes SGX, and finally stores the codename match score in
+ * internal->score. The brand code and bits are stored in internal as well.
+ *
+ * Always returns 0.
+ */
+int cpuid_identify_intel(struct cpu_raw_data_t* raw, struct cpu_id_t* data, struct internal_id_info_t* internal)
+{
+	intel_code_and_bits_t brand;
+	intel_model_t model_code;
+	int i;
+	char* brand_code_str = NULL;
+
+	load_intel_features(raw, data);
+	if (raw->basic_cpuid[0][0] >= 4) {
+		/* Deterministic way is preferred, being more generic */
+		decode_intel_deterministic_cache_info(raw, data);
+	} else if (raw->basic_cpuid[0][0] >= 2) {
+		decode_intel_oldstyle_cache_info(raw, data);
+	}
+	decode_intel_number_of_cores(raw, data);
+
+	brand = get_brand_code_and_bits(data);
+	model_code = get_model_code(data);
+	/* look up a printable name for the brand code; used for debug output only */
+	for (i = 0; i < COUNT_OF(intel_bcode_str); i++) {
+		if (brand.code == intel_bcode_str[i].code) {
+			brand_code_str = intel_bcode_str[i].str;
+			break;
+		}
+	}
+	if (brand_code_str)
+		debugf(2, "Detected Intel brand code: %d (%s)\n", brand.code, brand_code_str);
+	else
+		debugf(2, "Detected Intel brand code: %d\n", brand.code);
+	if (brand.bits) {
+		debugf(2, "Detected Intel bits: ");
+		debug_print_lbits(2, brand.bits);
+	}
+	debugf(2, "Detected Intel model code: %d\n", model_code);
+	
+	internal->code.intel = brand.code;
+	internal->bits = brand.bits;
+	
+	if (data->flags[CPU_FEATURE_SGX]) {
+		debugf(2, "SGX seems to be present, decoding...\n");
+		// if SGX is indicated by the CPU, verify its presence:
+		decode_intel_sgx_features(raw, data);
+	}
+
+	internal->score = match_cpu_codename(cpudb_intel, COUNT_OF(cpudb_intel), data,
+		brand.code, brand.bits, model_code);
+	return 0;
+}
+
+/*
+ * Fill list from the cpudb_intel table by delegating to
+ * generic_get_cpu_list().
+ */
+void cpuid_get_list_intel(struct cpu_list_t* list)
+{
+	generic_get_cpu_list(cpudb_intel, COUNT_OF(cpudb_intel), list);
+}
--- a/compat/libcpuid/recog_intel.h
+++ b/compat/libcpuid/recog_intel.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __RECOG_INTEL_H__
+#define __RECOG_INTEL_H__
+
+/* Intel-specific identification: fills data and internal from the raw CPUID dump. */
+int cpuid_identify_intel(struct cpu_raw_data_t* raw, struct cpu_id_t* data, struct internal_id_info_t* internal);
+/* Fills list from the Intel CPU database. */
+void cpuid_get_list_intel(struct cpu_list_t* list);
+
+#endif /*__RECOG_INTEL_H__*/
--- a/cpu.c
+++ b/cpu.c
@@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@@ -24,77 +25,87 @@
 #include <cpuid.h>
 #include <string.h>
 #include <stdbool.h>
+#include <math.h>
+
+#ifndef BUILD_TEST
+#   include <libcpuid.h>
+#endif
+
 #include "cpu.h"
+#include "options.h"


-#define VENDOR_ID                  (0)
-#define PROCESSOR_INFO             (1)
-#define CACHE_TLB_DESCRIPTOR       (2)
-#define EXTENDED_FEATURES          (7)
-#define PROCESSOR_BRAND_STRING_1   (0x80000002)
-#define PROCESSOR_BRAND_STRING_2   (0x80000003)
-#define PROCESSOR_BRAND_STRING_3   (0x80000004)
-
-#define EAX_Reg  (0)
-#define EBX_Reg  (1)
-#define ECX_Reg  (2)
-#define EDX_Reg  (3)
-
-
-static inline void cpuid(int level, int output[4]) {
-    int a, b, c, d;
-    __cpuid_count(level, 0, a, b, c, d);
-
-    output[0] = a;
-    output[1] = b;
-    output[2] = c;
-    output[3] = d;
-}
-
-
-static void cpu_brand_string(char* s) {
-    int cpu_info[4] = { 0 };
-    cpuid(VENDOR_ID, cpu_info);
-
-    if (cpu_info[EAX_Reg] >= 4) {
-        for (int i = 0; i < 4; i++) {
-            cpuid(0x80000002 + i, cpu_info);
-            memcpy(s, cpu_info, sizeof(cpu_info));
-            s += 16;
-        }
-    }
-}
-
-
-static bool has_aes_ni()
-{
-    int cpu_info[4] = { 0 };
-    cpuid(PROCESSOR_INFO, cpu_info);
-
-    return cpu_info[ECX_Reg] & bit_AES;
-}
-
-
-static bool has_bmi2() {
-    int cpu_info[4] = { 0 };
-    cpuid(EXTENDED_FEATURES, cpu_info);
-
-    return cpu_info[EBX_Reg] & bit_BMI2;
-}
-
-
+#ifndef BUILD_TEST
 void cpu_init_common() {
-    cpu_brand_string(cpu_info.brand);
+    struct cpu_raw_data_t raw = { 0 };
+    struct cpu_id_t data = { 0 };
+
+    cpuid_get_raw_data(&raw);
+    cpu_identify(&raw, &data);
+
+    strncpy(cpu_info.brand, data.brand_str, sizeof(cpu_info.brand) - 1);
+
+    cpu_info.total_logical_cpus = data.total_logical_cpus;
+    cpu_info.sockets            = data.total_logical_cpus / data.num_logical_cpus;
+    cpu_info.total_cores        = data.num_cores * cpu_info.sockets;
+    cpu_info.l3_cache           = data.l3_cache > 0 ? data.l3_cache * cpu_info.sockets : 0;
+
+    // Workaround for AMD CPUs https://github.com/anrieff/libcpuid/issues/97
+    if (data.vendor == VENDOR_AMD && data.l3_cache <= 0 && data.l2_assoc == 16 && data.ext_family >= 21) {
+        cpu_info.l2_cache = data.l2_cache * (cpu_info.total_cores / 2) * cpu_info.sockets;
+    }
+    else {
+        cpu_info.l2_cache = data.l2_cache > 0 ? data.l2_cache * cpu_info.total_cores * cpu_info.sockets : 0;
+    }
+

 #   ifdef __x86_64__
    cpu_info.flags |= CPU_FLAG_X86_64;
 #   endif

-    if (has_aes_ni()) {
+    if (data.flags[CPU_FEATURE_AES]) {
        cpu_info.flags |= CPU_FLAG_AES;
    }

-    if (has_bmi2()) {
+    if (data.flags[CPU_FEATURE_BMI2]) {
        cpu_info.flags |= CPU_FLAG_BMI2;
    }
+
+#   ifndef XMRIG_NO_ASM
+    if (data.vendor == VENDOR_AMD) {
+        cpu_info.assembly = (data.ext_family >= 23) ? ASM_RYZEN : ASM_BULLDOZER;
+    }
+    else if (data.vendor == VENDOR_INTEL) {
+        cpu_info.assembly = ASM_INTEL;
+    }
+#   endif
+}
+#endif
+
+
+// Suggest a thread count for the given algorithm.
+//   algo          - zero selects a 2048 budget per thread, non-zero 1024
+//                   (presumably KB of cache per hash - TODO confirm)
+//   double_hash   - doubles the per-thread budget
+//   max_cpu_usage - upper bound, in percent of the logical CPUs
+// The result is never below 1.
+int get_optimal_threads_count(int algo, bool double_hash, int max_cpu_usage) {
+    const int logical = cpu_info.total_logical_cpus;
+    if (logical == 1) {
+        return 1;
+    }
+
+    const int per_thread = (algo ? 1024 : 2048) * (double_hash ? 2 : 1);
+    const int cache      = cpu_info.l3_cache ? cpu_info.l3_cache : cpu_info.l2_cache;
+
+    // Size by cache when a cache size is known, otherwise use half of the logical CPUs.
+    int threads = cache ? cache / per_thread : logical / 2;
+
+    if (threads > logical) {
+        threads = logical;
+    }
+
+    // Honour the usage limit.
+    if (((float) threads / logical * 100) > max_cpu_usage) {
+        threads = ceil((float) logical * (max_cpu_usage / 100.0));
+    }
+
+    if (threads < 1) {
+        return 1;
+    }
+
+    return threads;
 }
--- a/cpu.h
+++ b/cpu.h
@@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@@ -21,13 +22,20 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __CPU_H__
-#define __CPU_H__
+#ifndef XMRIG_CPU_H
+#define XMRIG_CPU_H
+
+#include <stdbool.h>

 struct cpu_info {
-    int count;
+    int total_cores;        /* cores per socket multiplied by the socket count */
+    int total_logical_cpus; /* logical CPUs of the whole system */
     int flags;
-    char brand[48];
+    int sockets;            /* socket count (the stub build always stores 1) */
+    int l2_cache;           /* aggregate L2 size; presumably KB - TODO confirm */
+    int l3_cache;           /* aggregate L3 size; presumably KB - TODO confirm */
+    char brand[64];         /* CPU brand string */
+    int assembly;           /* one of the ASM_* values chosen in cpu_init_common() */
 };

 extern struct cpu_info cpu_info;
@@ -40,9 +48,8 @@ enum cpu_flags {
 };


-
 void cpu_init();
-int get_optimal_threads_count();
+int get_optimal_threads_count(int algo, bool double_hash, int max_cpu_usage);
 int affine_to_cpu_mask(int id, unsigned long mask);

-#endif /* __CPU_H__ */
+#endif /* XMRIG_CPU_H */
--- a/cpu_stub.c
+++ b/cpu_stub.c
@@ -0,0 +1,129 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2016-2017 XMRig       <support@xmrig.com>
+ *
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <cpuid.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+
+#include "cpu.h"
+#include "options.h"
+
+
+#define VENDOR_ID                  (0)
+#define PROCESSOR_INFO             (1)
+#define CACHE_TLB_DESCRIPTOR       (2)
+#define EXTENDED_FEATURES          (7)
+#define PROCESSOR_BRAND_STRING_1   (0x80000002)
+#define PROCESSOR_BRAND_STRING_2   (0x80000003)
+#define PROCESSOR_BRAND_STRING_3   (0x80000004)
+
+#define EAX_Reg  (0)
+#define EBX_Reg  (1)
+#define ECX_Reg  (2)
+#define EDX_Reg  (3)
+
+
+// Execute CPUID for the given leaf (sub-leaf 0) and store EAX, EBX, ECX and
+// EDX in output[0..3], in that order.
+static inline void cpuid(int level, int output[4]) {
+    int eax, ebx, ecx, edx;
+
+    __cpuid_count(level, 0, eax, ebx, ecx, edx);
+
+    output[0] = eax;
+    output[1] = ebx;
+    output[2] = ecx;
+    output[3] = edx;
+}
+
+
+// Copy the processor brand string into s. Exactly 48 bytes are written, taken
+// from CPUID leaves 0x80000002..0x80000004; s is left untouched when the CPU
+// does not report those extended leaves.
+static void cpu_brand_string(char* s) {
+    int32_t cpu_info[4] = { 0 };
+
+    // The brand string lives in the extended leaf range, so availability has
+    // to be checked against the highest extended leaf (EAX of 0x80000000),
+    // not against the highest basic leaf.
+    cpuid(0x80000000, cpu_info);
+    if ((uint32_t) cpu_info[EAX_Reg] < PROCESSOR_BRAND_STRING_3) {
+        return;
+    }
+
+    // Only three leaves carry the string; the next leaf holds unrelated data.
+    for (int i = 0; i < 3; i++) {
+        cpuid(PROCESSOR_BRAND_STRING_1 + i, cpu_info);
+        memcpy(s, cpu_info, sizeof(cpu_info));
+        s += 16;
+    }
+}
+
+
+// True when CPUID leaf 1 reports the AES bit in ECX.
+static bool has_aes_ni()
+{
+    int32_t regs[4] = { 0 };
+
+    cpuid(PROCESSOR_INFO, regs);
+
+    const int32_t ecx = regs[ECX_Reg];
+    return ecx & bit_AES;
+}
+
+
+// True when the CPU advertises BMI2 (CPUID leaf 7, sub-leaf 0, EBX).
+static bool has_bmi2() {
+    int32_t cpu_info[4] = { 0 };
+
+    // Leaf 7 must only be read when the CPU supports it: a request above the
+    // highest basic leaf returns unrelated data whose EBX could happen to have
+    // the BMI2 bit set.
+    cpuid(VENDOR_ID, cpu_info);
+    if (cpu_info[EAX_Reg] < EXTENDED_FEATURES) {
+        return false;
+    }
+
+    cpuid(EXTENDED_FEATURES, cpu_info);
+
+    return (cpu_info[EBX_Reg] & bit_BMI2) != 0;
+}
+
+
+// libcpuid-free initialisation of the global cpu_info: one socket, the brand
+// string, and the X86_64/AES/BMI2 flags. The assembly variant is only chosen
+// when AES is available and XMRIG_NO_ASM is not defined.
+void cpu_init_common() {
+    cpu_info.sockets = 1;
+    cpu_brand_string(cpu_info.brand);
+
+#   ifdef __x86_64__
+    cpu_info.flags |= CPU_FLAG_X86_64;
+#   endif
+
+    if (has_aes_ni()) {
+        cpu_info.flags |= CPU_FLAG_AES;
+
+#       ifndef XMRIG_NO_ASM
+        char vendor[13] = { 0 };
+        int32_t data[4] = { 0 };
+
+        cpuid(VENDOR_ID, data);
+
+        // The 12-character vendor id is spread over EBX, EDX, ECX (in that order).
+        memcpy(vendor + 0, &data[EBX_Reg], 4);
+        memcpy(vendor + 4, &data[EDX_Reg], 4);
+        memcpy(vendor + 8, &data[ECX_Reg], 4);
+
+        if (memcmp(vendor, "GenuineIntel", 12) == 0) {
+            cpu_info.assembly = ASM_INTEL;
+        }
+        else if (memcmp(vendor, "AuthenticAMD", 12) == 0) {
+            // Same split as the libcpuid based cpu.c: display family 23 and
+            // above uses the Ryzen code, older AMD CPUs the Bulldozer code.
+            // Display family = base family (EAX bits 8-11) + extended family
+            // (EAX bits 20-27) of CPUID leaf 1.
+            cpuid(PROCESSOR_INFO, data);
+
+            const uint32_t eax    = (uint32_t) data[EAX_Reg];
+            const uint32_t family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+
+            cpu_info.assembly = (family >= 23) ? ASM_RYZEN : ASM_BULLDOZER;
+        }
+#       endif
+    }
+
+    if (has_bmi2()) {
+        cpu_info.flags |= CPU_FLAG_BMI2;
+    }
+}
+
+
+// Stub heuristic: half of the logical CPUs, but at least one thread.
+// All three parameters are ignored.
+int get_optimal_threads_count(int algo, bool double_hash, int max_cpu_usage) {
+    const int half = cpu_info.total_logical_cpus / 2;
+
+    if (half < 1) {
+        return 1;
+    }
+
+    return half;
+}
--- a/crypto/CryptonightR_gen.c
+++ b/crypto/CryptonightR_gen.c
@@ -0,0 +1,146 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <string.h>
+
+#include "algo/cryptonight/cryptonight_monero.h"
+#include "crypto/asm/CryptonightR_template.h"
+#include "persistent_memory.h"
+
+
+// Append the bytes in [p1, p2) to the output cursor *p and advance the cursor.
+// Nothing is copied when p2 does not lie after p1.
+static inline void add_code(uint8_t **p, void (*p1)(), void (*p2)())
+{
+    const uint8_t *first = (const uint8_t*)(p1);
+    const uint8_t *last  = (const uint8_t*)(p2);
+    const ptrdiff_t length = last - first;
+
+    if (length <= 0) {
+        return;
+    }
+
+    memcpy(*p, (const void *) p1, length);
+    *p += length;
+}
+
+
+// Emit the machine code for a random-math program at the output cursor *p.
+//   code             - instruction list; emission stops at the first RET
+//   code_size        - not consulted, the RET terminator ends the loop
+//   instructions     - start addresses of the code snippets; snippet c spans
+//                      [instructions[c], instructions[c + 1])
+//   instructions_mov - snippets emitted in front of ROR/ROL whenever the
+//                      rotation source differs from the previous one
+//   is_64_bit        - selects where the ADD constant is patched and disables
+//                      the Bulldozer IMUL rewrite
+//   ASM              - assembly variant the code is generated for
+static inline void add_random_math(uint8_t **p, const struct V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, enum Assembly ASM)
+{
+    // (uint32_t)(-1) means "no rotation source is currently tracked"
+    uint32_t prev_rot_src = (uint32_t)(-1);
+
+    for (int i = 0;; ++i) {
+        const struct V4_Instruction inst = code[i];
+        if (inst.opcode == RET) {
+            break;
+        }
+
+        uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
+        uint8_t dst_index = inst.dst_index;
+        uint8_t src_index = inst.src_index;
+
+        const uint32_t a = inst.dst_index;
+        const uint32_t b = inst.src_index;
+        // c is the index into the snippet tables: opcode, destination and source packed into one byte
+        const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
+
+        switch (inst.opcode) {
+        case ROR:
+        case ROL:
+            if (b != prev_rot_src) {
+                prev_rot_src = b;
+                add_code(p, instructions_mov[c], instructions_mov[c + 1]);
+            }
+            break;
+        }
+
+        // Writing to the tracked rotation source invalidates it.
+        if (a == prev_rot_src) {
+            prev_rot_src = (uint32_t)(-1);
+        }
+
+        void_func begin = instructions[c];
+
+        // This must be a comparison: an assignment here would overwrite ASM and
+        // apply the Bulldozer-only rewrite on every CPU.
+        if ((ASM == ASM_BULLDOZER) && (inst.opcode == MUL) && !is_64_bit) {
+            // AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
+            // Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
+            uint8_t* prefix = (uint8_t*) begin;
+
+            if (*prefix == 0x49) {
+                **p = 0x41;
+                *p += 1;
+            }
+
+            begin = (void_func)(prefix + 1);
+        }
+
+        add_code(p, begin, instructions[c + 1]);
+
+        if (inst.opcode == ADD) {
+            // Patch the 32-bit constant of the snippet that was just emitted; in
+            // 64-bit mode it sits 3 bytes before the end of the snippet.
+            *(uint32_t*)(*p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C;
+            if (is_64_bit) {
+                prev_rot_src = (uint32_t)(-1);
+            }
+        }
+    }
+}
+
+
+// Assemble the single-hash routine into machine_code: template part 1, the
+// random-math program, template part 2 (whose last 4 bytes are then patched),
+// and template part 3; the instruction cache is flushed for the emitted range.
+void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
+{
+    uint8_t* p0 = machine_code;
+    uint8_t* p  = p0;
+
+    add_code(&p, CryptonightR_template_part1, CryptonightR_template_part2);
+    add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(&p, CryptonightR_template_part2, CryptonightR_template_part3);
+    // Overwrite the 4 bytes just emitted with (mainloop offset inside the template) - (bytes emitted so far);
+    // presumably the displacement of the jump back to the main loop - TODO confirm.
+    *(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_template_mainloop) - ((const uint8_t*)CryptonightR_template_part1)) - (p - p0));
+    add_code(&p, CryptonightR_template_part3, CryptonightR_template_end);
+
+    flush_instruction_cache(machine_code, p - p0);
+}
+
+
+// Assemble the double-hash routine into machine_code. The random-math program
+// is emitted twice, between double template parts 1/2 and 2/3; the last 4
+// bytes of part 3 are patched before part 4 is appended, and the instruction
+// cache is flushed for the emitted range.
+void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
+{
+    uint8_t* p0 = (uint8_t*) machine_code;
+    uint8_t* p = p0;
+
+    add_code(&p, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
+    add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(&p, CryptonightR_template_double_part2, CryptonightR_template_double_part3);
+    add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(&p, CryptonightR_template_double_part3, CryptonightR_template_double_part4);
+    // Overwrite the 4 bytes just emitted with (mainloop offset inside the template) - (bytes emitted so far);
+    // presumably the displacement of the jump back to the main loop - TODO confirm.
+    *(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_template_double_mainloop) - ((const uint8_t*)CryptonightR_template_double_part1)) - (p - p0));
+    add_code(&p, CryptonightR_template_double_part4, CryptonightR_template_double_end);
+
+    flush_instruction_cache(machine_code, p - p0);
+}
+
+
+// Assemble the soft-AES routine into machine_code: soft-AES template part 1,
+// the random-math program, part 2 (whose last 4 bytes are then patched), and
+// part 3; the instruction cache is flushed for the emitted range.
+void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
+{
+    uint8_t* p0 = machine_code;
+    uint8_t* p  = p0;
+
+    add_code(&p, CryptonightR_soft_aes_template_part1, CryptonightR_soft_aes_template_part2);
+    add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(&p, CryptonightR_soft_aes_template_part2, CryptonightR_soft_aes_template_part3);
+    // Overwrite the 4 bytes just emitted with (mainloop offset inside the template) - (bytes emitted so far);
+    // presumably the displacement of the jump back to the main loop - TODO confirm.
+    *(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightR_soft_aes_template_part1)) - (p - p0));
+    add_code(&p, CryptonightR_soft_aes_template_part3, CryptonightR_soft_aes_template_end);
+
+    flush_instruction_cache(machine_code, p - p0);
+}
--- a/crypto/aesb.c
+++ b/crypto/aesb.c
@@ -1,170 +0,0 @@
-/*
---------------------------------------------------------------------------
-Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.
-
-The redistribution and use of this software (with or without changes)
-is allowed without the payment of fees or royalties provided that:
-
-  source code distributions include the above copyright notice, this
-  list of conditions and the following disclaimer;
-
-  binary distributions include the above copyright notice, this list
-  of conditions and the following disclaimer in their documentation.
-
-This software is provided 'as is' with no explicit or implied warranties
-in respect of its operation, including, but not limited to, correctness
-and fitness for purpose.
---------------------------------------------------------------------------
-Issue Date: 20/12/2007
-*/
-
-#include <stdint.h>
-
-#include "aesb.h"
-
-#if defined(__cplusplus)
-extern "C"
-{
-#endif
-
-#define TABLE_ALIGN     32
-#define WPOLY           0x011b
-#define N_COLS          4
-#define AES_BLOCK_SIZE  16
-#define RC_LENGTH       (5 * (AES_BLOCK_SIZE / 4 - 2))
-
-#if defined(_MSC_VER)
-#define ALIGN __declspec(align(TABLE_ALIGN))
-#elif defined(__GNUC__)
-#define ALIGN __attribute__ ((aligned(16)))
-#else
-#define ALIGN
-#endif
-
-#define rf1(r,c) (r)
-#define word_in(x,c) (*((uint32_t*)(x)+(c)))
-#define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = (v))
-
-#define s(x,c) x[c]
-#define si(y,x,c) (s(y,c) = word_in(x, c))
-#define so(y,x,c) word_out(y, c, s(x,c))
-#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3)
-#define state_out(y,x)  so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3)
-#define round(y,x,k) \
-y[0] = (k)[0]  ^ (t_fn[0][x[0] & 0xff] ^ t_fn[1][(x[1] >> 8) & 0xff] ^ t_fn[2][(x[2] >> 16) & 0xff] ^ t_fn[3][x[3] >> 24]); \
-y[1] = (k)[1]  ^ (t_fn[0][x[1] & 0xff] ^ t_fn[1][(x[2] >> 8) & 0xff] ^ t_fn[2][(x[3] >> 16) & 0xff] ^ t_fn[3][x[0] >> 24]); \
-y[2] = (k)[2]  ^ (t_fn[0][x[2] & 0xff] ^ t_fn[1][(x[3] >> 8) & 0xff] ^ t_fn[2][(x[0] >> 16) & 0xff] ^ t_fn[3][x[1] >> 24]); \
-y[3] = (k)[3]  ^ (t_fn[0][x[3] & 0xff] ^ t_fn[1][(x[0] >> 8) & 0xff] ^ t_fn[2][(x[1] >> 16) & 0xff] ^ t_fn[3][x[2] >> 24]);
-#define to_byte(x) ((x) & 0xff)
-#define bval(x,n) to_byte((x) >> (8 * (n)))
-
-#define fwd_var(x,r,c)\
- ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\
- : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\
- : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\
- :          ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2)))
-
-#define fwd_rnd(y,x,k,c)  (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c))
-
-#define sb_data(w) {\
-    w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
-    w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
-    w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
-    w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
-    w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
-    w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
-    w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
-    w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
-    w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
-    w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
-    w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
-    w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
-    w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
-    w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
-    w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
-    w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
-    w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
-    w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
-    w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
-    w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
-    w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
-    w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
-    w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
-    w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
-    w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
-    w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
-    w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
-    w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
-    w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
-    w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
-    w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
-    w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
-
-#define rc_data(w) {\
-    w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\
-    w(0x1b), w(0x36) }
-
-#define bytes2word(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
-    ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
-
-#define h0(x)   (x)
-#define w0(p)   bytes2word(p, 0, 0, 0)
-#define w1(p)   bytes2word(0, p, 0, 0)
-#define w2(p)   bytes2word(0, 0, p, 0)
-#define w3(p)   bytes2word(0, 0, 0, p)
-
-#define u0(p)   bytes2word(f2(p), p, p, f3(p))
-#define u1(p)   bytes2word(f3(p), f2(p), p, p)
-#define u2(p)   bytes2word(p, f3(p), f2(p), p)
-#define u3(p)   bytes2word(p, p, f3(p), f2(p))
-
-#define v0(p)   bytes2word(fe(p), f9(p), fd(p), fb(p))
-#define v1(p)   bytes2word(fb(p), fe(p), f9(p), fd(p))
-#define v2(p)   bytes2word(fd(p), fb(p), fe(p), f9(p))
-#define v3(p)   bytes2word(f9(p), fd(p), fb(p), fe(p))
-
-#define f2(x)   ((x<<1) ^ (((x>>7) & 1) * WPOLY))
-#define f4(x)   ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY))
-#define f8(x)   ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) ^ (((x>>5) & 4) * WPOLY))
-#define f3(x)   (f2(x) ^ x)
-#define f9(x)   (f8(x) ^ x)
-#define fb(x)   (f8(x) ^ f2(x) ^ x)
-#define fd(x)   (f8(x) ^ f4(x) ^ x)
-#define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
-
-#define t_dec(m,n) t_##m##n
-#define t_set(m,n) t_##m##n
-#define t_use(m,n) t_##m##n
-
-#define d_4(t,n,b,e,f,g,h) ALIGN const t n[4][256] = { b(e), b(f), b(g), b(h) }
-
-#define four_tables(x,tab,vf,rf,c) \
-    (tab[0][bval(vf(x,0,c),rf(0,c))] \
-    ^ tab[1][bval(vf(x,1,c),rf(1,c))] \
-    ^ tab[2][bval(vf(x,2,c),rf(2,c))] \
-    ^ tab[3][bval(vf(x,3,c),rf(3,c))])
-
-d_4(uint32_t, t_dec(f,n), sb_data, u0, u1, u2, u3);
-
-inline void aesb_single_round(const uint8_t *restrict in, uint8_t *out, const uint8_t *restrict expandedKey) {
-    round(((uint32_t*) out), ((uint32_t*) in), ((uint32_t*) expandedKey));
-}
-
-inline void aesb_pseudo_round_mut(uint8_t *restrict val, const uint8_t *restrict expandedKey) {
-    uint32_t b1[4];
-    round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey));
-    round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 1 * N_COLS);
-    round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 2 * N_COLS);
-    round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 3 * N_COLS);
-    round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 4 * N_COLS);
-    round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 5 * N_COLS);
-    round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 6 * N_COLS);
-    round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 7 * N_COLS);
-    round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 8 * N_COLS);
-    round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 9 * N_COLS);
-}
-
-
-#if defined(__cplusplus)
-}
-#endif
--- a/crypto/aesb.h
+++ b/crypto/aesb.h
@@ -1,10 +0,0 @@
-#ifndef __AESB_H__
-#define __AESB_H__
-
-void aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
-void aesb_pseudo_round_mut(uint8_t *val, const uint8_t *expandedKey);
-
-#define fast_aesb_single_round     aesb_single_round
-#define fast_aesb_pseudo_round_mut aesb_pseudo_round_mut
-
-#endif /* __AESB_H__ */
--- a/crypto/asm/CryptonightR_soft_aes_template.inc
+++ b/crypto/asm/CryptonightR_soft_aes_template.inc
@@ -0,0 +1,279 @@
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1)
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2)
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3)
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end)
+/* The exported labels delimit fragments that v4_soft_aes_compile_code copies into a runtime buffer. */
+ALIGN(64)
+FN_PREFIX(CryptonightR_soft_aes_template_part1):
+	mov	QWORD PTR [rsp+8], rcx  /* keep the rcx argument in the slot above the return address; part2 reloads it from [rsp+304] */
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 232  /* xmm6-xmm13 saves at [rsp+16..143], four dwords at [rsp+144..159], spills at [rsp+160..191] */
+/* Copy the four dwords at [rcx+96..108] into the stack slots [rsp+144..156]. */
+	mov	eax, [rcx+96]
+	mov	ebx, [rcx+100]
+	mov	esi, [rcx+104]
+	mov	edx, [rcx+108]
+	mov [rsp+144], eax
+	mov [rsp+148], ebx
+	mov [rsp+152], esi
+	mov [rsp+156], edx
+/* XOR 64-bit words of the structure at rcx (copied to r10): r8,r9 = [0..15]^[32..47]; xmm4 = [16..31]^[48..63]; xmm5 = [64..79]^[80..95]; r11 = qword at [rcx+224]. */
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r10, rcx
+	xor	rax, QWORD PTR [rcx+16]
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r9, QWORD PTR [rcx+40]
+	xor	r9, QWORD PTR [rcx+8]
+	movq	xmm4, rax
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	r11, QWORD PTR [rcx+224]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r10+72]
+	mov	rax, QWORD PTR [r10+80]
+	movq	xmm0, rdx
+	xor	rax, QWORD PTR [r10+64]
+/* Save xmm6-xmm13; part3 restores them from the same slots. */
+	movaps	XMMWORD PTR [rsp+16], xmm6
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+48], xmm8
+	movaps	XMMWORD PTR [rsp+64], xmm9
+	movaps	XMMWORD PTR [rsp+80], xmm10
+	movaps	XMMWORD PTR [rsp+96], xmm11
+	movaps	XMMWORD PTR [rsp+112], xmm12
+	movaps	XMMWORD PTR [rsp+128], xmm13
+
+	movq	xmm5, rax
+
+	mov	rax, r8
+	punpcklqdq xmm4, xmm0
+	and	eax, 2097136  /* 0x1FFFF0: 16-byte-aligned offset below 2 MiB */
+	movq	xmm10, QWORD PTR [r10+96]
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r10+104]
+	xorps	xmm9, xmm9
+	mov	QWORD PTR [rsp+328], rax  /* current offset; re-read at the top of each iteration */
+	movq	xmm12, r11  /* xmm12 keeps the base pointer loaded from [rcx+224] */
+	mov	QWORD PTR [rsp+320], r9
+	punpcklqdq xmm5, xmm0
+	movq xmm13, rcx
+	mov r12d, 524288  /* iteration counter (0x80000) */
+
+	ALIGN(64)
+FN_PREFIX(CryptonightR_soft_aes_template_mainloop):  /* first half of one iteration; the code generator inserts generated code after this fragment */
+	movd xmm11, r12d  /* park the iteration counter; r12 is reused below */
+	mov	r12, QWORD PTR [r10+272]  /* pointer at offset 272, indexed below as 32-bit tables (presumably the software AES tables - confirm) */
+	lea	r13, QWORD PTR [rax+r11]  /* r13 = base + current offset */
+	mov	esi, DWORD PTR [r13]
+	movq	xmm0, r9
+	mov	r10d, DWORD PTR [r13+4]
+	movq	xmm7, r8
+	mov	ebp, DWORD PTR [r13+12]
+	mov	r14d, DWORD PTR [r13+8]
+	mov	rdx, QWORD PTR [rsp+328]
+	movzx	ecx, sil
+	shr	esi, 8
+	punpcklqdq xmm7, xmm0
+	mov	r15d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	mov	edi, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	ebx, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	shr	ebp, 8
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	xor	r15d, DWORD PTR [r12+rcx*4+1024]
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	eax, r14d
+	shr	eax, 8
+	xor	edi, DWORD PTR [r12+rcx*4+1024]
+	add	eax, 256
+	movzx	ecx, bpl
+	shr	ebp, 8
+	xor	ebx, DWORD PTR [r12+rcx*4+1024]
+	movzx	ecx, sil
+	shr	esi, 8
+	xor	r9d, DWORD PTR [r12+rcx*4+1024]
+	add	r12, 2048
+	movzx	ecx, r10b
+	shr	r10d, 8
+	add	r10d, 256
+	mov	r11d, DWORD PTR [r12+rax*4]
+	xor	r11d, DWORD PTR [r12+rcx*4]
+	xor	r11d, r9d
+	movzx	ecx, sil
+	mov	r10d, DWORD PTR [r12+r10*4]
+	shr	esi, 8
+	add	esi, 256
+	xor	r10d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	xor	r10d, ebx
+	shr	ebp, 8
+	movd	xmm1, r11d
+	add	ebp, 256
+	movq	r11, xmm12  /* r11 was used as scratch above; reload the base pointer */
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	xor	r9d, DWORD PTR [r12+rsi*4]
+	mov	eax, DWORD PTR [r12+rbp*4]
+	xor	r9d, edi
+	movzx	ecx, r14b
+	movd	xmm0, r10d
+	movd	xmm2, r9d
+	xor	eax, DWORD PTR [r12+rcx*4]
+	mov	rcx, rdx
+	xor	eax, r15d
+	punpckldq xmm2, xmm1
+	xor	rcx, 16
+	movd	xmm6, eax
+	mov	rax, rdx
+	punpckldq xmm6, xmm0
+	xor	rax, 32
+	punpckldq xmm6, xmm2
+	xor	rdx, 48  /* rcx, rax, rdx = current offset ^ 16, ^ 32, ^ 48 */
+	movdqu	xmm2, XMMWORD PTR [rcx+r11]
+	pxor xmm6, xmm2
+	pxor	xmm6, xmm7
+	paddq	xmm2, xmm4
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	movdqu	xmm0, XMMWORD PTR [rdx+r11]
+	pxor xmm6, xmm1
+	pxor xmm6, xmm0
+	paddq	xmm0, xmm5
+	movdqu	XMMWORD PTR [rcx+r11], xmm0
+	movdqu	XMMWORD PTR [rax+r11], xmm2
+	movq rcx, xmm13
+	paddq	xmm1, xmm7
+	movdqu	XMMWORD PTR [rdx+r11], xmm1
+	movq	rdi, xmm6
+	mov	r10, rdi
+	and	r10d, 2097136  /* next offset: low bits of the result masked with 0x1FFFF0 */
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm4
+	movdqu	XMMWORD PTR [r13], xmm0
+/* rbx = ([rsp+144] + [rsp+148]) | (([rsp+152] + [rsp+156]) << 32), then XORed with the qword at base + next offset. */
+	mov ebx, [rsp+144]
+	mov ebp, [rsp+152]
+	add ebx, [rsp+148]
+	add ebp, [rsp+156]
+	shl rbp, 32
+	or rbx, rbp
+
+	xor rbx, QWORD PTR [r10+r11]
+	lea	r14, QWORD PTR [r10+r11]
+	mov	rbp, QWORD PTR [r14+8]
+/* Spill rbx, rdi, rbp, r10 and keep the real stack pointer in r10, because esp is overwritten below. */
+	mov [rsp+160], rbx
+	mov [rsp+168], rdi
+	mov [rsp+176], rbp
+	mov [rsp+184], r10
+	mov r10, rsp
+/* Load the register set left for the inserted code: ebx, esi, edi, ebp = the four dwords at [rsp+144..156]. */
+	mov ebx, [rsp+144]
+	mov esi, [rsp+148]
+	mov edi, [rsp+152]
+	mov ebp, [rsp+156]
+/* esp, r15d = dwords 0 and 2 of xmm7; eax = low dword of xmm4; edx, r9d = dwords 0 and 2 of xmm5. */
+	movd esp, xmm7
+	movaps xmm0, xmm7
+	psrldq xmm0, 8
+	movd r15d, xmm0
+	movd eax, xmm4
+	movd edx, xmm5
+	movaps xmm0, xmm5
+	psrldq xmm0, 8
+	movd r9d, xmm0
+
+FN_PREFIX(CryptonightR_soft_aes_template_part2):  /* second half of one iteration; resumes after the inserted code */
+	mov rsp, r10  /* restore the real stack pointer saved before esp was overwritten */
+	mov [rsp+144], ebx
+	mov [rsp+148], esi
+	mov [rsp+152], edi
+	mov [rsp+156], ebp
+/* The four dwords are stored back above; now r8 ^= (ebp << 32) | edi and [rsp+320] ^= (esi << 32) | ebx. */
+	mov edi, edi
+	shl rbp, 32
+	or rbp, rdi
+	xor r8, rbp
+
+	mov ebx, ebx
+	shl rsi, 32
+	or rsi, rbx
+	xor QWORD PTR [rsp+320], rsi
+/* Reload the values spilled at the end of the mainloop fragment. */
+	mov rbx, [rsp+160]
+	mov rdi, [rsp+168]
+	mov rbp, [rsp+176]
+	mov r10, [rsp+184]
+
+	mov	r9, r10
+	xor	r9, 16
+	mov	rcx, r10
+	xor	rcx, 32
+	xor	r10, 48  /* r9, rcx, r10 = offset ^ 16, ^ 32, ^ 48 */
+	mov	rax, rbx
+	mul	rdi  /* rdx:rax = rbx * rdi (unsigned 128-bit product) */
+	movdqu	xmm2, XMMWORD PTR [r9+r11]
+	movdqu	xmm1, XMMWORD PTR [rcx+r11]
+	pxor xmm6, xmm2
+	pxor xmm6, xmm1
+	paddq	xmm1, xmm7
+	add	r8, rdx
+	movdqu	xmm0, XMMWORD PTR [r10+r11]
+	pxor xmm6, xmm0
+	paddq	xmm0, xmm5
+	paddq	xmm2, xmm4
+	movdqu	XMMWORD PTR [r9+r11], xmm0
+	movdqa	xmm5, xmm4
+	mov	r9, QWORD PTR [rsp+320]
+	movdqa	xmm4, xmm6
+	add	r9, rax
+	movdqu	XMMWORD PTR [rcx+r11], xmm2
+	movdqu	XMMWORD PTR [r10+r11], xmm1
+	mov	r10, QWORD PTR [rsp+304]  /* reload the rcx argument stored by the first instruction of part1 */
+	movd r12d, xmm11  /* recover the iteration counter parked at the top of the loop */
+	mov	QWORD PTR [r14], r8
+	xor	r8, rbx
+	mov	rax, r8
+	mov	QWORD PTR [r14+8], r9
+	and	eax, 2097136
+	xor	r9, rbp
+	mov	QWORD PTR [rsp+320], r9
+	mov	QWORD PTR [rsp+328], rax
+	sub	r12d, 1
+	jne	FN_PREFIX(CryptonightR_soft_aes_template_mainloop)  /* v4_soft_aes_compile_code rewrites this 32-bit displacement after copying */
+
+FN_PREFIX(CryptonightR_soft_aes_template_part3):  /* epilogue: undo the part1 prologue and return */
+	movaps	xmm6, XMMWORD PTR [rsp+16]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+48]
+	movaps	xmm9, XMMWORD PTR [rsp+64]
+	movaps	xmm10, XMMWORD PTR [rsp+80]
+	movaps	xmm11, XMMWORD PTR [rsp+96]
+	movaps	xmm12, XMMWORD PTR [rsp+112]
+	movaps	xmm13, XMMWORD PTR [rsp+128]
+/* Release the 232-byte frame, then pop in the reverse of the push order used in part1. */
+	add	rsp, 232
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	ret
+FN_PREFIX(CryptonightR_soft_aes_template_end):  /* marks the end of part3 for the copy in v4_soft_aes_compile_code */
--- a/crypto/asm/CryptonightR_template.S
+++ b/crypto/asm/CryptonightR_template.S
--- a/crypto/asm/CryptonightR_template.h
+++ b/crypto/asm/CryptonightR_template.h
--- a/crypto/asm/CryptonightR_template.inc
+++ b/crypto/asm/CryptonightR_template.inc
@@ -0,0 +1,531 @@
+PUBLIC FN_PREFIX(CryptonightR_template_part1)
+PUBLIC FN_PREFIX(CryptonightR_template_mainloop)
+PUBLIC FN_PREFIX(CryptonightR_template_part2)
+PUBLIC FN_PREFIX(CryptonightR_template_part3)
+PUBLIC FN_PREFIX(CryptonightR_template_end)
+PUBLIC FN_PREFIX(CryptonightR_template_double_part1)
+PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop)
+PUBLIC FN_PREFIX(CryptonightR_template_double_part2)
+PUBLIC FN_PREFIX(CryptonightR_template_double_part3)
+PUBLIC FN_PREFIX(CryptonightR_template_double_part4)
+PUBLIC FN_PREFIX(CryptonightR_template_double_end)
+/* The exported labels delimit template fragments; the code generator in this diff copies the bytes between consecutive labels. */
+ALIGN(64)
+FN_PREFIX(CryptonightR_template_part1):
+	mov	QWORD PTR [rsp+16], rbx  /* rbx, rbp, rsi go to the slots at entry rsp+16/24/32; part3 reloads them from [rsp+136..152] */
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	r10
+	push	r11
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	push	rdi
+	sub	rsp, 64  /* room for the xmm6-xmm9 saves below */
+	mov	r12, rcx
+	mov	r8, QWORD PTR [r12+32]
+	mov	rdx, r12  /* rdx = the rcx argument, used as the structure pointer from here on */
+	xor	r8, QWORD PTR [r12]
+	mov	r15, QWORD PTR [r12+40]
+	mov	r9, r8
+	xor	r15, QWORD PTR [r12+8]
+	mov	r11, QWORD PTR [r12+224]  /* r11 = base pointer stored at offset 224 */
+	mov	r12, QWORD PTR [r12+56]
+	xor	r12, QWORD PTR [rdx+24]
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movq	xmm0, r12
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	movaps	XMMWORD PTR [rsp], xmm9
+	mov	r12, QWORD PTR [rdx+88]
+	xor	r12, QWORD PTR [rdx+72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm6, xmm0
+	and	r9d, 2097136  /* 0x1FFFF0: 16-byte-aligned offset below 2 MiB */
+	movq	xmm0, r12
+	movq	xmm7, rax
+	punpcklqdq xmm7, xmm0
+	mov r10d, r9d
+	movq	xmm9, rsp  /* stash the stack pointer: rsp is used as a data register until part3 */
+	mov rsp, r8  /* NOTE(review): nothing may touch the stack between here and part3 - confirm this holds on every target */
+	mov	r8d, 524288  /* iteration counter (0x80000) */
+/* ebx, esi, edi, ebp = the four dwords at [rdx+96..108]. */
+	mov	ebx, [rdx+96]
+	mov	esi, [rdx+100]
+	mov	edi, [rdx+104]
+	mov	ebp, [rdx+108]
+
+	ALIGN(64)
+FN_PREFIX(CryptonightR_template_mainloop):  /* first half of one iteration; rsp holds data here, not a stack address */
+	movdqa	xmm5, XMMWORD PTR [r9+r11]
+	movq	xmm0, r15
+	movq	xmm4, rsp
+	punpcklqdq xmm4, xmm0
+	lea	rdx, QWORD PTR [r9+r11]
+/* One AES round on the 16 bytes at base + offset, with (rsp, r15) as the round key. */
+	aesenc	xmm5, xmm4
+/* r13d, eax, r9d = offset ^ 16, ^ 32, ^ 48. */
+	mov	r13d, r9d
+	mov	eax, r9d
+	xor	r9d, 48
+	xor	r13d, 16
+	xor	eax, 32
+	movdqu	xmm0, XMMWORD PTR [r9+r11]
+	movaps xmm3, xmm0
+	movdqu	xmm2, XMMWORD PTR [r13+r11]
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	pxor xmm0, xmm2
+	pxor xmm5, xmm1
+	pxor xmm5, xmm0
+/* r12 = low qword of the result; r10d = its low dword masked to the next 16-byte-aligned offset. */
+	movq	r12, xmm5
+	movd	r10d, xmm5
+	and	r10d, 2097136
+
+	paddq	xmm3, xmm7
+	paddq	xmm2, xmm6
+	paddq	xmm1, xmm4
+	movdqu	XMMWORD PTR [r13+r11], xmm3
+	movdqu	XMMWORD PTR [rax+r11], xmm2
+	movdqu	XMMWORD PTR [r9+r11], xmm1
+
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [rdx], xmm0
+/* r13 = (ebx + esi) | ((edi + ebp) << 32). */
+	lea	r13d, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	r13, rdx
+/* eax = low dword of xmm6; edx, r9d = dwords 0 and 2 of xmm7 (presumably inputs for code inserted before part2 - confirm). */
+	movd eax, xmm6
+	movd edx, xmm7
+	pextrd r9d, xmm7, 2
+
+	xor	r13, QWORD PTR [r10+r11]
+	mov	r14, QWORD PTR [r10+r11+8]
+
+FN_PREFIX(CryptonightR_template_part2):  /* second half of one iteration */
+	lea	rcx, [r10+r11]
+/* rsp ^= (ebp << 32) | edi, then r15 ^= (esi << 32) | ebx. */
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor rsp, rax
+
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r15, rax
+
+	mov	rax, r13
+	mul	r12  /* rdx:rax = r13 * r12 (unsigned 128-bit product) */
+	add	r15, rax
+	add	rsp, rdx
+/* r9d, r12d, r10d = offset ^ 16, ^ 32, ^ 48. */
+	mov	r9d, r10d
+	mov	r12d, r10d
+	xor	r9d, 16
+	xor	r12d, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [r12+r11]
+	movaps xmm3, xmm1
+	movdqa	xmm2, XMMWORD PTR [r9+r11]
+	movdqa	xmm0, XMMWORD PTR [r10+r11]
+	pxor xmm1, xmm2
+	pxor xmm5, xmm0
+	pxor xmm5, xmm1
+	paddq	xmm3, xmm4
+	paddq	xmm2, xmm6
+	paddq	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9+r11], xmm0
+	movdqu	XMMWORD PTR [r12+r11], xmm2
+	movdqu	XMMWORD PTR [r10+r11], xmm3
+
+	movdqa	xmm7, xmm6
+	mov	QWORD PTR [rcx], rsp
+	xor	rsp, r13
+	mov	r9d, esp
+	mov	QWORD PTR [rcx+8], r15
+	and	r9d, 2097136  /* offset for the next iteration */
+	xor	r15, r14
+	movdqa	xmm6, xmm5
+	dec	r8d
+	jnz	FN_PREFIX(CryptonightR_template_mainloop)  /* the code generator rewrites the 4 bytes copied just before part3, i.e. this displacement */
+
+FN_PREFIX(CryptonightR_template_part3):  /* epilogue: undo the part1 prologue and return */
+	movq	rsp, xmm9  /* restore the real stack pointer stashed in part1 */
+/* 136 = 64 (xmm area) + 56 (seven pushes) + 16: the slots written by the first three instructions of part1. */
+	mov	rbx, QWORD PTR [rsp+136]
+	mov	rbp, QWORD PTR [rsp+144]
+	mov	rsi, QWORD PTR [rsp+152]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+16]
+	movaps	xmm9, XMMWORD PTR [rsp]
+	add	rsp, 64
+	pop	rdi
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	r11
+	pop	r10
+	ret	0
+FN_PREFIX(CryptonightR_template_end):  /* marks the end of part3 */
+
+ALIGN(64)
+FN_PREFIX(CryptonightR_template_double_part1):  /* prologue of the two-hash variant: rcx and rdx each point to one input structure */
+	mov	QWORD PTR [rsp+24], rbx  /* rbx goes to the slot at entry rsp+24; part4 reloads it from [rsp+400] */
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 320
+	mov	r14, QWORD PTR [rcx+32]
+	mov	r8, rcx
+	xor	r14, QWORD PTR [rcx]
+	mov	r12, QWORD PTR [rcx+40]
+	mov	ebx, r14d
+	mov	rsi, QWORD PTR [rcx+224]  /* rsi = base pointer of the first structure */
+	and	ebx, 2097136  /* 0x1FFFF0: 16-byte-aligned offset below 2 MiB */
+	xor	r12, QWORD PTR [rcx+8]
+	mov	rcx, QWORD PTR [rcx+56]
+	xor	rcx, QWORD PTR [r8+24]
+	mov	rax, QWORD PTR [r8+48]
+	xor	rax, QWORD PTR [r8+16]
+	mov	r15, QWORD PTR [rdx+32]
+	xor	r15, QWORD PTR [rdx]
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r8+88]
+	xor	rcx, QWORD PTR [r8+72]
+	mov	r13, QWORD PTR [rdx+40]
+	mov	rdi, QWORD PTR [rdx+224]  /* rdi = base pointer of the second structure */
+	xor	r13, QWORD PTR [rdx+8]
+	movaps	XMMWORD PTR [rsp+160], xmm6
+	movaps	XMMWORD PTR [rsp+176], xmm7
+	movaps	XMMWORD PTR [rsp+192], xmm8
+	movaps	XMMWORD PTR [rsp+208], xmm9
+	movaps	XMMWORD PTR [rsp+224], xmm10
+	movaps	XMMWORD PTR [rsp+240], xmm11
+	movaps	XMMWORD PTR [rsp+256], xmm12
+	movaps	XMMWORD PTR [rsp+272], xmm13
+	movaps	XMMWORD PTR [rsp+288], xmm14
+	movaps	XMMWORD PTR [rsp+304], xmm15
+	movq	xmm7, rax
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+/* Copy the 16 bytes at offset 96 of each structure: the rdx structure to [rsp], the rcx structure (now r8) to [rsp+16]. */
+	movaps xmm1, XMMWORD PTR [rdx+96]
+	movaps xmm2, XMMWORD PTR [r8+96]
+	movaps XMMWORD PTR [rsp], xmm1
+	movaps XMMWORD PTR [rsp+16], xmm2
+
+	mov	r8d, r15d
+	punpcklqdq xmm7, xmm0
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+56]
+	xor	rcx, QWORD PTR [rdx+24]
+	movq	xmm9, rax
+	mov	QWORD PTR [rsp+128], rsi
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	punpcklqdq xmm9, xmm0
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+88]
+	xor	rcx, QWORD PTR [rdx+72]
+	movq	xmm8, rax
+	mov	QWORD PTR [rsp+136], rdi
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm8, xmm0
+	and	r8d, 2097136
+	movq	xmm0, rcx
+	mov	r11d, 524288  /* iteration counter (0x80000) */
+	movq	xmm10, rax
+	punpcklqdq xmm10, xmm0
+	
+	movq xmm14, QWORD PTR [rsp+128]  /* xmm14, xmm15 = the two base pointers (rsi, rdi) stored above */
+	movq xmm15, QWORD PTR [rsp+136]
+
+	ALIGN(64)
+FN_PREFIX(CryptonightR_template_double_mainloop):  /* AES round for both hashes, then setup for the first inserted code sequence */
+	movdqu	xmm6, XMMWORD PTR [rbx+rsi]
+	movq	xmm0, r12
+	mov	ecx, ebx
+	movq	xmm3, r14
+	punpcklqdq xmm3, xmm0
+	xor	ebx, 16
+	aesenc	xmm6, xmm3  /* first hash: one AES round, round key = (r14, r12) */
+	movq	xmm4, r15
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm0
+	xor	ebx, 48
+	paddq	xmm0, xmm7
+	movdqu	xmm1, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm1
+	movdqu	XMMWORD PTR [rbx+rsi], xmm0
+	paddq	xmm1, xmm3
+	xor	ebx, 16
+	mov	eax, ebx
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm0
+	movq	rdx, xmm6
+	movdqu	XMMWORD PTR [rbx+rsi], xmm1
+	paddq	xmm0, xmm9
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [rcx+rsi], xmm0
+	mov	esi, edx
+	movdqu	xmm5, XMMWORD PTR [r8+rdi]
+	and	esi, 2097136  /* next offset for the first hash */
+	mov	ecx, r8d
+	movq	xmm0, r13
+	punpcklqdq xmm4, xmm0
+	xor	r8d, 16
+	aesenc	xmm5, xmm4  /* second hash: one AES round, round key = (r15, r13) */
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm0
+	xor	r8d, 48
+	paddq	xmm0, xmm8
+	movdqu	xmm1, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm1
+	movdqu	XMMWORD PTR [r8+rdi], xmm0
+	paddq	xmm1, xmm4
+	xor	r8d, 16
+	mov	eax, r8d
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm0
+	movdqu	XMMWORD PTR [r8+rdi], xmm1
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rdi], xmm0
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm8
+	movdqu	XMMWORD PTR [rcx+rdi], xmm0
+	movq	rdi, xmm5
+	movq	rcx, xmm14  /* rcx = base pointer of the first hash */
+	mov	ebp, edi
+	mov	r8, QWORD PTR [rcx+rsi]
+	mov	r10, QWORD PTR [rcx+rsi+8]
+	lea	r9, QWORD PTR [rcx+rsi]
+	xor	esi, 16
+/* Park live registers in xmm0-xmm2, xmm11-xmm13 and on the stack; xmm0 holds the real rsp while esp carries data. */
+	movq xmm0, rsp
+	movq xmm1, rsi
+	movq xmm2, rdi
+	movq xmm11, rbp
+	movq xmm12, r15
+	movq xmm13, rdx
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+/* ebx, esi, edi, ebp = the four dwords kept at [rsp+16..28] (copied from the rcx structure in part1). */
+	mov ebx, DWORD PTR [rsp+16]
+	mov esi, DWORD PTR [rsp+20]
+	mov edi, DWORD PTR [rsp+24]
+	mov ebp, DWORD PTR [rsp+28]
+/* r8 ^= (ebx + esi) | ((edi + ebp) << 32). */
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+	xor r8, rax
+/* esp, r15d = dwords 0 and 2 of xmm3; eax = low dword of xmm7; edx, r9d = dwords 0 and 2 of xmm9. */
+	movd esp, xmm3
+	pextrd r15d, xmm3, 2
+	movd eax, xmm7
+	movd edx, xmm9
+	pextrd r9d, xmm9, 2
+
+FN_PREFIX(CryptonightR_template_double_part2):  /* finishes the first hash, then sets up the second inserted code sequence */
+/* r14 ^= (ebp << 32) | edi, then r12 ^= (esi << 32) | ebx. */
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor r14, rax
+
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r12, rax
+/* Restore the real stack pointer and store the four dwords back to [rsp+16..28]. */
+	movq rsp, xmm0
+	mov DWORD PTR [rsp+16], ebx
+	mov DWORD PTR [rsp+20], esi
+	mov DWORD PTR [rsp+24], edi
+	mov DWORD PTR [rsp+28], ebp
+/* Restore the registers parked at the end of the mainloop fragment. */
+	movq rsi, xmm1
+	movq rdi, xmm2
+	movq rbp, xmm11
+	movq r15, xmm12
+	movq rdx, xmm13
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rbx, r8
+	mov	rax, r8
+	mul	rdx  /* rdx:rax = r8 * rdx (unsigned 128-bit product) */
+	and	ebp, 2097136  /* next offset for the second hash */
+	mov	r8, rax
+	movdqu	xmm1, XMMWORD PTR [rcx+rsi]
+	pxor	xmm6, xmm1
+	xor	esi, 48
+	paddq	xmm1, xmm7
+	movdqu	xmm2, XMMWORD PTR [rsi+rcx]
+	pxor	xmm6, xmm2
+	paddq	xmm2, xmm3
+	movdqu	XMMWORD PTR [rsi+rcx], xmm1
+	xor	esi, 16
+	mov	eax, esi
+	mov	rsi, rcx
+	movdqu	xmm0, XMMWORD PTR [rax+rcx]
+	pxor	xmm6, xmm0
+	movdqu	XMMWORD PTR [rax+rcx], xmm2
+	paddq	xmm0, xmm9
+	add	r12, r8
+	xor	rax, 32
+	add	r14, rdx
+	movdqa	xmm9, xmm7
+	movdqa	xmm7, xmm6
+	movdqu	XMMWORD PTR [rax+rcx], xmm0
+	mov	QWORD PTR [r9+8], r12
+	xor	r12, r10
+	mov	QWORD PTR [r9], r14
+	movq rcx, xmm15  /* rcx = base pointer of the second hash */
+	xor	r14, rbx
+	mov	r10d, ebp
+	mov	ebx, r14d
+	xor	ebp, 16
+	and	ebx, 2097136
+	mov	r8, QWORD PTR [r10+rcx]
+	mov	r9, QWORD PTR [r10+rcx+8]
+/* Same parking sequence as in the mainloop fragment, now for the second hash. */
+	movq xmm0, rsp
+	movq xmm1, rbx
+	movq xmm2, rsi
+	movq xmm11, rdi
+	movq xmm12, rbp
+	movq xmm13, r15
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+/* ebx, esi, edi, ebp = the four dwords kept at [rsp..12] (copied from the rdx structure in part1). */
+	mov ebx, DWORD PTR [rsp]
+	mov esi, DWORD PTR [rsp+4]
+	mov edi, DWORD PTR [rsp+8]
+	mov ebp, DWORD PTR [rsp+12]
+/* r8 ^= (ebx + esi) | ((edi + ebp) << 32); a copy is kept in xmm3 for part3. */
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+
+	xor r8, rax
+	movq xmm3, r8
+/* esp, r15d = dwords 0 and 2 of xmm4; eax = low dword of xmm8; edx, r9d = dwords 0 and 2 of xmm10. */
+	movd esp, xmm4
+	pextrd r15d, xmm4, 2
+	movd eax, xmm8
+	movd edx, xmm10
+	pextrd r9d, xmm10, 2
+
+FN_PREFIX(CryptonightR_template_double_part3):  /* finishes the second hash and closes the loop */
+/* r15 was overwritten for the inserted code; restore it before folding the dwords in. */
+	movq r15, xmm13
+/* r15 ^= (ebp << 32) | edi, then r13 ^= (esi << 32) | ebx. */
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor r15, rax
+
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r13, rax
+/* Restore the real stack pointer and store the four dwords back to [rsp..12]. */
+	movq rsp, xmm0
+	mov DWORD PTR [rsp], ebx
+	mov DWORD PTR [rsp+4], esi
+	mov DWORD PTR [rsp+8], edi
+	mov DWORD PTR [rsp+12], ebp
+/* Restore the registers parked at the end of part2. */
+	movq rbx, xmm1
+	movq rsi, xmm2
+	movq rdi, xmm11
+	movq rbp, xmm12
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rax, r8
+	mul	rdi  /* rdx:rax = r8 * rdi (unsigned 128-bit product) */
+	mov	rdi, rcx
+	mov	r8, rax
+	movdqu	xmm1, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm1
+	xor	ebp, 48
+	paddq	xmm1, xmm8
+	add	r13, r8
+	movdqu	xmm2, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm2
+	add	r15, rdx
+	movdqu	XMMWORD PTR [rbp+rcx], xmm1
+	paddq	xmm2, xmm4
+	xor	ebp, 16
+	mov	eax, ebp
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm0
+	movdqu	XMMWORD PTR [rbp+rcx], xmm2
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rcx], xmm0
+	movq rax, xmm3  /* the r8 value saved in part2 before the inserted code */
+	movdqa	xmm10, xmm8
+	mov	QWORD PTR [r10+rcx], r15
+	movdqa	xmm8, xmm5
+	xor	r15, rax
+	mov	QWORD PTR [r10+rcx+8], r13
+	mov	r8d, r15d
+	xor	r13, r9
+	and	r8d, 2097136  /* offset of the second hash for the next iteration */
+	dec r11d
+	jnz	FN_PREFIX(CryptonightR_template_double_mainloop)  /* v4_compile_code_double rewrites this 32-bit displacement after copying */
+
+FN_PREFIX(CryptonightR_template_double_part4):  /* epilogue: undo the double_part1 prologue and return */
+/* 400 = 320 (frame) + 56 (seven pushes) + 24: the slot written by the first instruction of double_part1. */
+	mov	rbx, QWORD PTR [rsp+400]
+	movaps	xmm6, XMMWORD PTR [rsp+160]
+	movaps	xmm7, XMMWORD PTR [rsp+176]
+	movaps	xmm8, XMMWORD PTR [rsp+192]
+	movaps	xmm9, XMMWORD PTR [rsp+208]
+	movaps	xmm10, XMMWORD PTR [rsp+224]
+	movaps	xmm11, XMMWORD PTR [rsp+240]
+	movaps	xmm12, XMMWORD PTR [rsp+256]
+	movaps	xmm13, XMMWORD PTR [rsp+272]
+	movaps	xmm14, XMMWORD PTR [rsp+288]
+	movaps	xmm15, XMMWORD PTR [rsp+304]
+	add	rsp, 320
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	ret	0
+FN_PREFIX(CryptonightR_template_double_end):  /* marks the end of part4 */
--- a/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc
+++ b/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc
@@ -0,0 +1,410 @@
+	mov	rax, rsp  /* rax = entry rsp; used below to address the xmm6-xmm8 save slots */
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184
+/* Save MXCSR at [rsp+272], then load 24448 (0x5F80): SSE exceptions masked, rounding control = round up. */
+	stmxcsr DWORD PTR [rsp+272]
+	mov DWORD PTR [rsp+276], 24448
+	ldmxcsr DWORD PTR [rsp+276]
+/* rcx and rdx each point to one input structure (rdx is copied to r9, rcx to r8); r13 and rsi are the two base pointers from offset 224. */
+	mov	r13, QWORD PTR [rcx+224]
+	mov	r9, rdx
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx
+	xor	r10, QWORD PTR [rcx]
+	mov	r14d, 524288  /* iteration counter (0x80000) */
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rsi, QWORD PTR [rdx+224]
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]
+	movq	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6  /* rax-88 = rsp+160; the epilogue reads it back as [r11-24] */
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movq	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136  /* 0x1FFFF0: 16-byte-aligned offset below 2 MiB */
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13  /* xmm13 stays zero; the loop copies it to clear registers before cvtsi2sd */
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movq	xmm5, QWORD PTR [r8+104]
+	movq	xmm7, rax
+/* xmm14 = 1 << 52 in both 64-bit lanes. */
+	mov eax, 1
+	shl rax, 52
+	movq xmm14, rax
+	punpcklqdq xmm14, xmm14
+/* xmm12 = 1023 << 52 in both 64-bit lanes (the bit pattern of the double 1.0). */
+	mov eax, 1023
+	shl rax, 52
+	movq xmm12, rax
+	punpcklqdq xmm12, xmm12
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movq	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp], r13  /* first base pointer; reloaded at the end of every iteration */
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movq	xmm8, rax
+	and	ecx, 2097136
+	punpcklqdq xmm8, xmm0
+	movq	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0  /* xmm4 = qwords at offset 96 of the first and second structure */
+	movq	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]
+	movdqu	xmm11, XMMWORD PTR [r8]
+	punpcklqdq xmm5, xmm0  /* xmm5 = qwords at offset 104 of the first and second structure */
+	lea	r9, QWORD PTR [rdx+r13]
+	movdqu	xmm15, XMMWORD PTR [r9]
+
+	ALIGN(64)
+main_loop_double_sandybridge:  /* one iteration for both hashes; xmm15 / xmm11 hold the 16 bytes preloaded at each current offset */
+	movdqu	xmm9, xmm15
+	mov eax, edx
+	mov ebx, edx
+	xor eax, 16
+	xor ebx, 32
+	xor edx, 48
+/* First hash: one AES round with (r10, r11) as the round key. */
+	movq	xmm0, r11
+	movq	xmm2, r10
+	punpcklqdq xmm2, xmm0
+	aesenc	xmm9, xmm2
+
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0
+
+	movq	r11, xmm9
+	mov	edx, r11d
+	and	edx, 2097136
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0
+
+	lea	rbx, QWORD PTR [rdx+r13]
+	mov	r10, QWORD PTR [rdx+r13]
+/* Second hash: one AES round with (rdi, rbp) as the round key. */
+	movdqu	xmm10, xmm11
+	movq	xmm0, rbp
+	movq	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movq	rcx, xmm10
+	and	ecx, 2097136
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov r12, QWORD PTR [rcx+rsi]
+
+	mov	r9, QWORD PTR [rbx+8]
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx
+/* r10 ^= (low qword of xmm5 << 32) ^ (low qword of xmm4); then rdx:rax = r10 * r11. */
+	movq	rdx, xmm5
+	shl	rdx, 32
+	movq	rax, xmm4
+	xor	rdx, rax
+	xor	r10, rdx
+	mov	rax, r10
+	mul	r11
+	mov r11d, r8d
+	xor r11d, 48
+	movq xmm0, rdx
+	xor rdx, [r11+r13]
+	movq xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1
+
+	pxor xmm0, XMMWORD PTR [r8+r13]
+	xor	r8d, 32
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [r15+r13], xmm0
+/* Update the first hash's running pair kept at [rsp+256] / [rsp+264] and store it to memory at rbx. */
+	mov	r11, QWORD PTR [rsp+256]
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11
+	xor	r11, r10
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11  /* next offset of the first hash; reloaded into rdx at the end of the iteration */
+	lea	r15, QWORD PTR [r11+r13]
+	movdqu xmm15, XMMWORD PTR [r11+r13]
+	lea	r13, QWORD PTR [rsi+rcx]  /* r13 is reused as an address in the second buffer; it is reloaded from [rsp] below */
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movq	r10, xmm0
+	psllq	xmm5, 1
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movq	r11, xmm0
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movq	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10
+	paddq	xmm5, xmm3
+	movq	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax
+	or	edx, -2147483647  /* sets bits 31 and 0 of edx (0x80000001) and clears the upper half of rdx */
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movq	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0
+	divpd	xmm2, xmm1  /* both quotients are estimated in floating point; the integer code below corrects them */
+	paddq	xmm2, xmm14
+	cvttsd2si rax, xmm2
+	psrldq	xmm2, 8
+	mov	rbx, rax
+	imul	rax, rdx
+	sub	r11, rax
+	js	div_fix_1_sandybridge  /* negative remainder: adjust quotient and remainder out of line */
+div_fix_1_ret_sandybridge:
+
+	cvttsd2si rdx, xmm2
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	div_fix_2_sandybridge  /* same correction for the second hash */
+div_fix_2_ret_sandybridge:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2
+	paddq	xmm3, xmm4
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12
+	sqrtpd	xmm1, xmm0
+	movq	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19
+	test	r9, 524287
+	je	sqrt_fix_1_sandybridge  /* taken when the low 19 bits of the square-root result are all zero */
+sqrt_fix_1_ret_sandybridge:
+
+	movq r9, xmm10
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	test	r8, 524287
+	je	sqrt_fix_2_sandybridge  /* same check for the second hash (high lane) */
+sqrt_fix_2_ret_sandybridge:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9  /* rdx:rax = r10 * r9 (second hash) */
+	movq xmm0, rax
+	movq xmm3, rdx
+	punpcklqdq xmm3, xmm0
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm3, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm0
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm3
+
+	add	rdi, rdx
+	mov	QWORD PTR [r13], rdi
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136  /* next offset of the second hash */
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]	
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]  /* reload the first base pointer */
+	movdqa	xmm3, xmm7
+	mov	rdx, QWORD PTR [rsp+8]
+	movdqa	xmm8, xmm6
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d
+	jne	main_loop_double_sandybridge
+
+	ldmxcsr DWORD PTR [rsp+272]  /* restore the MXCSR value saved on entry */
+	movaps	xmm13, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+184]  /* r11 = stack pointer as it was right after the eight pushes */
+	movaps	xmm6, XMMWORD PTR [r11-24]
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp cnv2_double_mainloop_asm_sandybridge_endp  /* skip the out-of-line fix-up code; the target label is the last line of this file */
+
+div_fix_1_sandybridge:  /* reached when r11 - rbx*rdx went negative: lower the quotient by one and add the divisor back */
+	dec	rbx
+	add	r11, rdx
+	jmp	div_fix_1_ret_sandybridge
+
+div_fix_2_sandybridge:  /* same correction for the second hash: quotient in rdx, divisor in r9, remainder in r8 */
+	dec	rdx
+	add	r8, r9
+	jmp	div_fix_2_ret_sandybridge
+
+sqrt_fix_1_sandybridge:  /* recomputes the low lane of xmm5 in integer arithmetic; r8 = low qword of xmm3 (the sqrt input before the shift) */
+	movq	r8, xmm3
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8  /* keep the high lane of xmm5 so it can be re-attached below */
+	dec	r9
+	mov r11d, -1022
+	shl r11, 32  /* r11 = 0xFFFFFC0200000000 */
+	mov	rax, r9
+	shr	r9, 19
+	shr	rax, 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1]
+	add	rax, r11
+	imul	rdx, rax
+	sub	rdx, r8
+	adc	r9, 0  /* add the borrow of the comparison above to the candidate result */
+	movq xmm5, r9
+	punpcklqdq xmm5, xmm0
+	jmp	sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge:  /* same correction for the high lane; r11 = high qword of xmm3, rbx holds the 0xFFFFFC0200000000 constant */
+	psrldq	xmm3, 8
+	movq	r11, xmm3
+	dec	r8
+	mov ebx, -1022
+	shl rbx, 32
+	mov	rax, r8
+	shr	r8, 19
+	shr	rax, 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1]
+	add	rax, rbx
+	imul	rdx, rax
+	sub	rdx, r11
+	adc	r8, 0
+	movq xmm0, r8
+	punpcklqdq xmm5, xmm0  /* replace the high lane of xmm5 with the corrected value */
+	jmp	sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
--- a/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc
+++ b/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc
@@ -0,0 +1,180 @@
+	mov	QWORD PTR [rsp+16], rbx	/* CryptoNight v2 main loop body (bulldozer variant). rcx = context pointer. rbx/rbp/rsi are parked at [rsp+16..39]; the exit path reloads them from the same slots. */
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64	/* locals: MXCSR copies at [rsp] and [rsp+4]; xmm8/xmm7/xmm6 saved at [rsp+16]/[rsp+32]/[rsp+48] */
+	/* Keep the incoming MXCSR at [rsp] and load 24448 (0x5F80): SSE exceptions masked, rounding control = round up. */
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+	/* Build the starting state: every value below is the XOR of two 64-bit context fields. */
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx	/* r9 = second copy of the context pointer (rcx is overwritten further down) */
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288	/* ebp = loop counter */
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]	/* r8 = [ctx+32] ^ [ctx+0] */
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movq	xmm3, rax	/* xmm3.lo = [ctx+48] ^ [ctx+16] */
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]	/* r11 = [ctx+40] ^ [ctx+8] */
+	mov	rbx, QWORD PTR [rcx+224]	/* rbx = base pointer for every [offset+rbx] access below */
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movq	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]	/* rdi = [ctx+104]; at the top of each iteration rdi holds the previous square-root result */
+	and	r10d, 2097136	/* first offset = r8 & 0x1FFFF0 (16-byte aligned, below 2 MiB) */
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movq	xmm4, rax	/* xmm4.lo = [ctx+80] ^ [ctx+64] */
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8	/* xmm8 = 0, copied into xmm1 each iteration */
+	mov ax, 1023
+	shl rax, 52	/* only the low 12 bits of rax survive the shift, so rax = 1023 << 52 whatever its upper bits held */
+	movq xmm7, rax	/* NOTE(review): xmm7 is not read again in this file; the loop rebuilds the constant in rdi */
+	mov	r15, QWORD PTR [r9+96]	/* r15 = [ctx+96]; at the top of each iteration r15 holds the previous division result */
+	punpcklqdq xmm3, xmm0	/* xmm3.hi = [ctx+56] ^ [ctx+24] */
+	movq	xmm0, rcx
+	punpcklqdq xmm4, xmm0	/* xmm4.hi = [ctx+88] ^ [ctx+72] */
+	/* One pass of the loop per count in ebp; r10 = offset of the first chunk to touch. */
+	ALIGN(64)
+cnv2_main_loop_bulldozer:
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]	/* xmm5 = 16-byte chunk at the current offset */
+	movq xmm6, r8
+	pinsrq xmm6, r11, 1	/* xmm6 = {r8, r11}: round key for aesenc and an addend below */
+	lea	rdx, QWORD PTR [r10+rbx]	/* rdx = address of that chunk, rewritten after the AES round */
+	lea	r9, QWORD PTR [rdi+rdi]	/* r9 = 2 * rdi */
+	shl	rdi, 32
+	/* ecx, eax, r10d = offset ^ 16 / 32 / 48: the other three 16-byte chunks of the same 64-byte line. */
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0	/* chunk^16 <- old chunk^48 + xmm4 */
+	movdqa	XMMWORD PTR [rax+rbx], xmm2	/* chunk^32 <- old chunk^16 + xmm3 */
+	movdqa	XMMWORD PTR [r10+rbx], xmm1	/* chunk^48 <- old chunk^32 + xmm6 */
+	/* rsi = r15 ^ (rdi << 32). */
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi
+	/* rdi = 1023 << 52, added to the shifted sum before sqrtsd. */
+	mov edi, 1023
+	shl rdi, 52
+	/* r14 / rax = low / high 64 bits of the AES output. */
+	movq	r14, xmm5
+	pextrq rax, xmm5, 1
+	/* Store AES output ^ xmm3 over the first chunk; the second offset comes from r14. */
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136
+	movdqa	XMMWORD PTR [rdx], xmm0
+	xor	rsi, QWORD PTR [r10+rbx]	/* rsi ^= first qword of the second chunk */
+	lea	r12, QWORD PTR [r10+rbx]	/* r12 = address of the second chunk */
+	mov	r13, QWORD PTR [r10+rbx+8]	/* r13 = its second qword, read before the chunk is overwritten */
+	/* Divide the AES high qword (rdx:rax with rdx = 0) by r9 = (2*sqrt + r14d) | 0x80000001. */
+	add	r9d, r14d
+	or	r9d, -2147483647
+	xor	edx, edx
+	div	r9
+	mov	eax, eax	/* keep the low 32 bits of the quotient */
+	shl	rdx, 32
+	lea	r15, [rax+rdx]	/* r15 = quotient32 + (remainder << 32) */
+	lea	rax, [r14+r15]
+	shr	rax, 12
+	add	rax, rdi
+	movq	xmm0, rax
+	sqrtsd	xmm1, xmm0	/* square root of the double whose bit pattern is ((r14 + r15) >> 12) + (1023 << 52) */
+	movq	rdi, xmm1
+	test	rdi, 524287	/* low 19 bits all zero: take the out-of-line correction */
+	je	sqrt_fixup_bulldozer
+	shr	rdi, 19	/* otherwise rdi = result bits >> 19 */
+
+sqrt_fixup_bulldozer_ret:
+	mov	rax, rsi
+	mul	r14	/* rdx:rax = rsi * r14 (unsigned 128-bit product) */
+	movq xmm1, rax
+	movq xmm0, rdx
+	punpcklqdq xmm0, xmm1	/* xmm0 = {rdx, rax} */
+	/* Second 64-byte line: r9d, ecx, r10d = second offset ^ 16 / 32 / 48. */
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	xor rdx, [rcx+rbx]	/* both product halves are XORed with the old chunk^32 */
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0	/* old chunk^16 ^ {product high, product low} */
+	paddq xmm4, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm4	/* chunk^16 <- old chunk^48 + xmm4 */
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2	/* chunk^32 <- (old chunk^16 ^ product) + xmm3 */
+	movdqa	XMMWORD PTR [r10+rbx], xmm1	/* chunk^48 <- old chunk^32 + xmm6 */
+	/* Fold the product into r8/r11, write them to the second chunk, and derive the next offset. */
+	movdqa	xmm4, xmm3
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136
+	movdqa	xmm3, xmm5	/* history shift: xmm4 <- xmm3 <- this round's AES output */
+	dec	ebp
+	jne	cnv2_main_loop_bulldozer
+	/* Exit: put back MXCSR, xmm6-xmm8, rbx/rbp/rsi and the pushed registers. */
+	ldmxcsr DWORD PTR [rsp]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]	/* r11 = rsp as it was before the 64-byte local area was reserved */
+	mov	rbx, QWORD PTR [r11+56]	/* r11+56/64/72 = entry rsp+16/24/32 (five pushes = 40 bytes) */
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_bulldozer_endp	/* hop over the correction routine to the end label */
+
+sqrt_fixup_bulldozer:	/* reached when the low 19 result bits are zero; leaves the corrected value in rdi */
+	movq r9, xmm5
+	add r9, r15	/* r9 = r14 + r15, the sum that was shifted and passed to sqrtsd */
+	dec	rdi
+	mov edx, -1022
+	shl rdx, 32	/* rdx = 0xFFFFFC02 << 32 */
+	mov	rax, rdi
+	shr	rdi, 19
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	lea	rcx, [rcx+rdx+1]
+	add	rax, rdx
+	imul	rcx, rax
+	sub	rcx, r9	/* CF = 1 when the product is below r9 */
+	adc	rdi, 0	/* rdi = ((bits - 1) >> 19) + CF */
+	jmp	sqrt_fixup_bulldozer_ret
+
+cnv2_main_loop_bulldozer_endp:
--- a/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc
+++ b/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc
@@ -0,0 +1,186 @@
+	mov	 QWORD PTR [rsp+24], rbx	/* CryptoNight v2 main loop body (ivybridge variant). rcx = context pointer. rbx is parked at [rsp+24] and reloaded from [rsp+160] on exit. */
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80	/* locals: MXCSR copies at [rsp]/[rsp+4], a zero qword at [rsp+16], xmm8/xmm7/xmm6 at [rsp+32]/[rsp+48]/[rsp+64] */
+	/* Keep the incoming MXCSR at [rsp] and load 24448 (0x5F80): SSE exceptions masked, rounding control = round up. */
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+	/* Build the starting state: every value below is the XOR of two 64-bit context fields. */
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx	/* r9 = second copy of the context pointer (rcx is overwritten further down) */
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 524288	/* esi = loop counter */
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647	/* r13d = 0x80000001, ORed into every divisor */
+	xor	 r8, QWORD PTR [rcx]	/* r8 = [ctx+32] ^ [ctx+0] */
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movq	 xmm4, rax	/* xmm4.lo = [ctx+48] ^ [ctx+16] */
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]	/* r11 = [ctx+40] ^ [ctx+8] */
+	mov	 rbx, QWORD PTR [rcx+224]	/* rbx = base pointer for every [offset+rbx] access below */
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movq	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movq	 xmm3, QWORD PTR [r9+104]	/* xmm3.lo = [ctx+104]; at the top of each iteration xmm3 holds the previous square-root result */
+	movaps	 XMMWORD PTR [rsp+64], xmm6
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136	/* first offset = r8 & 0x1FFFF0 (16-byte aligned, below 2 MiB) */
+	movq	 xmm5, rax	/* xmm5.lo = [ctx+80] ^ [ctx+64] */
+	/* Zero the qword at [rsp+16]; the loop subtracts it from the sqrtsd result with psubq. */
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+	/* xmm8 = 1023 << 52, added to the shifted sum before sqrtsd. */
+	mov ax, 1023
+	shl rax, 52
+	movq xmm8, rax
+	mov r15, QWORD PTR [r9+96]	/* r15 = [ctx+96]; at the top of each iteration r15 holds the previous division result */
+	punpcklqdq xmm4, xmm0	/* xmm4.hi = [ctx+56] ^ [ctx+24] */
+	movq	 xmm0, rcx
+	punpcklqdq xmm5, xmm0	/* xmm5.hi = [ctx+88] ^ [ctx+72] */
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]	/* preload the first chunk; later iterations preload at the loop bottom */
+	/* One pass of the loop per count in rsi; r10 = offset of the chunk already held in xmm6. */
+	ALIGN(64)
+main_loop_ivybridge:
+	lea	 rdx, QWORD PTR [r10+rbx]	/* rdx = address of the current chunk, rewritten after the AES round */
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15	/* rdi starts as the previous division result */
+	xor	 ecx, 16	/* ecx, eax, r10d = offset ^ 16 / 32 / 48: other chunks of the same 64-byte line */
+	xor	 eax, 32
+	xor	 r10d, 48
+	movq	 xmm0, r11
+	movq	 xmm7, r8
+	punpcklqdq xmm7, xmm0	/* xmm7 = {r8, r11}: round key for aesenc and an addend below */
+	aesenc	 xmm6, xmm7
+	movq	 rbp, xmm6	/* rbp = low 64 bits of the AES output */
+	mov	 r9, rbp
+	and	 r9d, 2097136	/* r9 = second offset = rbp & 0x1FFFF0 */
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm1, xmm7
+	paddq	 xmm0, xmm5
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0	/* chunk^16 <- old chunk^48 + xmm5 */
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2	/* chunk^32 <- old chunk^16 + xmm4 */
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1	/* chunk^48 <- old chunk^32 + xmm7 */
+	mov r10, r9
+	xor r10d, 32	/* r10d = second offset ^ 32 */
+	movq	 rcx, xmm3	/* rcx = previous square-root result */
+	mov	 rax, rcx
+	shl	 rax, 32
+	xor	 rdi, rax	/* rdi = r15 ^ (sqrt << 32) */
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	movdqu	 XMMWORD PTR [rdx], xmm0	/* first chunk <- AES output ^ xmm4 */
+	xor	 rdi, QWORD PTR [r9+rbx]	/* rdi ^= first qword of the second chunk */
+	lea	 r14, QWORD PTR [r9+rbx]	/* r14 = address of the second chunk */
+	mov	 r12, QWORD PTR [r14+8]	/* r12 = its second qword, read before the chunk is overwritten */
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d	/* divisor r9 = (2*sqrt + ebp) | 0x80000001 */
+	movq	 rax, xmm0	/* dividend = high 64 bits of the AES output (rdx = 0) */
+	div	 r9
+	xorps xmm3, xmm3
+	mov	 eax, eax	/* keep the low 32 bits of the quotient */
+	shl	 rdx, 32
+	add	 rdx, rax	/* rdx = quotient32 + (remainder << 32) */
+	lea	 r9, QWORD PTR [rdx+rbp]	/* r9 = that value + rbp; the correction path compares against it */
+	mov r15, rdx
+	mov	 rax, r9
+	shr	 rax, 12
+	movq	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0	/* square root of the double whose bit pattern is (r9 >> 12) + (1023 << 52) */
+	psubq	 xmm3, XMMWORD PTR [rsp+16]	/* reads 16 bytes; only the low qword was zeroed above and only the low lane of xmm3 is consumed later */
+	movq	 rdx, xmm3
+	test	 edx, 524287	/* low 19 bits all zero: take the out-of-line correction */
+	je	 sqrt_fixup_ivybridge
+	psrlq	 xmm3, 19	/* otherwise xmm3.lo = result bits >> 19 */
+sqrt_fixup_ivybridge_ret:
+	/* Multiply, fold the product into r8/r11 and write them to the second chunk. */
+	mov	 ecx, r10d	/* rcx = second offset ^ 32 */
+	mov	 rax, rdi
+	mul	 rbp	/* rdx:rax = rdi * rbp (unsigned 128-bit product) */
+	movq xmm2, rdx
+	xor rdx, [rcx+rbx]	/* product halves are XORed with the old chunk^32 */
+	add	 r8, rdx
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov edi, r8d
+	and edi, 2097136	/* edi = offset for the next iteration */
+	movq xmm0, rax
+	xor rax, [rcx+rbx+8]
+	add	 r11, rax
+	mov	 QWORD PTR [r14+8], r11
+	punpcklqdq xmm2, xmm0	/* xmm2 = {product high, product low} */
+	/* r9d = second offset ^ 16 (32 ^ 48) and r10d = second offset ^ 48 (32 ^ 16). */
+	mov	 r9d, r10d
+	xor	 r9d, 48
+	xor	 r10d, 16
+	pxor	 xmm2, XMMWORD PTR [r9+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm0, xmm5
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm4
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4
+	movdqu	 XMMWORD PTR [r9+rbx], xmm0	/* chunk^16 <- old chunk^48 + previous xmm5 */
+	movdqa	 xmm4, xmm6	/* history shift: xmm5 <- xmm4 <- this round's AES output */
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm2	/* chunk^32 <- (old chunk^16 ^ product) + previous xmm4 */
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1	/* chunk^48 <- old chunk^32 + xmm7 */
+	movdqu xmm6, [rdi+rbx]	/* preload the chunk for the next iteration */
+	mov	 r10d, edi
+	xor	 r11, r12
+	dec rsi
+	jne	 main_loop_ivybridge
+	/* Exit: put back MXCSR, rbx, xmm6-xmm8 and the pushed registers. */
+	ldmxcsr DWORD PTR [rsp]
+	mov	 rbx, QWORD PTR [rsp+160]	/* 80 bytes of locals + 7 pushes (56) + 24 = the slot written on entry */
+	movaps	 xmm6, XMMWORD PTR [rsp+64]
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80
+	pop	 r15
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_main_loop_ivybridge_endp	/* hop over the correction routine to the end label */
+
+sqrt_fixup_ivybridge:	/* reached when the low 19 result bits are zero; leaves the corrected value in xmm3 */
+	dec	 rdx
+	mov r13d, -1022	/* r13 is borrowed as scratch here and reset to 0x80000001 below */
+	shl r13, 32
+	mov	 rax, rdx
+	shr	 rdx, 19
+	shr	 rax, 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	not r13
+	sub	 rcx, r13	/* subtracting ~k adds k + 1, with k = 0xFFFFFC02 << 32 */
+	mov	 r13d, -2147483647
+	imul	 rcx, rax
+	sub	 rcx, r9	/* CF = 1 when the product is below r9 */
+	adc	 rdx, 0	/* rdx = ((bits - 1) >> 19) + CF */
+	movq	 xmm3, rdx
+	jmp	 sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:
--- a/crypto/asm/cn2/cnv2_main_loop_ryzen.inc
+++ b/crypto/asm/cn2/cnv2_main_loop_ryzen.inc
@@ -0,0 +1,179 @@
+	mov	QWORD PTR [rsp+16], rbx	/* CryptoNight v2 main loop body (ryzen variant). rcx = context pointer. rbx/rbp/rsi are parked at [rsp+16..39]; the exit path reloads them from the same slots. */
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64	/* locals: MXCSR copies at [rsp] and [rsp+4]; xmm8/xmm7/xmm6 saved at [rsp+16]/[rsp+32]/[rsp+48] */
+	/* Keep the incoming MXCSR at [rsp] and load 24448 (0x5F80): SSE exceptions masked, rounding control = round up. */
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+	/* Build the starting state: every value below is the XOR of two 64-bit context fields. */
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx	/* r9 = second copy of the context pointer (rcx is overwritten further down) */
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288	/* ebp = loop counter */
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]	/* r8 = [ctx+32] ^ [ctx+0] */
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movq	xmm3, rax	/* xmm3.lo = [ctx+48] ^ [ctx+16] */
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]	/* r11 = [ctx+40] ^ [ctx+8] */
+	mov	rbx, QWORD PTR [rcx+224]	/* rbx = base pointer for every [offset+rbx] access below */
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movq	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]	/* rdi = [ctx+104]; at the top of each iteration rdi holds the previous square-root result */
+	and	r10d, 2097136	/* first offset = r8 & 0x1FFFF0 (16-byte aligned, below 2 MiB) */
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movq	xmm4, rax	/* xmm4.lo = [ctx+80] ^ [ctx+64] */
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8	/* xmm8 = 0, copied into xmm1 each iteration */
+	mov ax, 1023
+	shl rax, 52	/* only the low 12 bits of rax survive the shift, so rax = 1023 << 52 whatever its upper bits held */
+	movq xmm7, rax	/* xmm7 = 1023 << 52, added to the shifted sum before sqrtsd */
+	mov	r15, QWORD PTR [r9+96]	/* r15 = [ctx+96]; at the top of each iteration r15 holds the previous division result */
+	punpcklqdq xmm3, xmm0	/* xmm3.hi = [ctx+56] ^ [ctx+24] */
+	movq	xmm0, rcx
+	punpcklqdq xmm4, xmm0	/* xmm4.hi = [ctx+88] ^ [ctx+72] */
+	/* One pass of the loop per count in ebp; r10 = offset of the first chunk to touch. */
+	ALIGN(64)
+main_loop_ryzen:
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]	/* xmm5 = 16-byte chunk at the current offset */
+	movq	xmm0, r11
+	movq	xmm6, r8
+	punpcklqdq xmm6, xmm0	/* xmm6 = {r8, r11}: round key for aesenc and an addend below */
+	lea	rdx, QWORD PTR [r10+rbx]	/* rdx = address of that chunk, rewritten after the AES round */
+	lea	r9, QWORD PTR [rdi+rdi]	/* r9 = 2 * rdi */
+	shl	rdi, 32
+	/* ecx, eax, r10d = offset ^ 16 / 32 / 48: the other three 16-byte chunks of the same 64-byte line. */
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0	/* chunk^16 <- old chunk^48 + xmm4 */
+	movdqa	XMMWORD PTR [rax+rbx], xmm2	/* chunk^32 <- old chunk^16 + xmm3 */
+	movdqa	XMMWORD PTR [r10+rbx], xmm1	/* chunk^48 <- old chunk^32 + xmm6 */
+	/* rsi = r15 ^ (rdi << 32); then store AES output ^ xmm3 over the first chunk and derive the second offset from r14. */
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi
+	movq	r14, xmm5	/* r14 = low 64 bits of the AES output */
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136
+	movdqa	XMMWORD PTR [rdx], xmm0
+	xor	rsi, QWORD PTR [r10+rbx]	/* rsi ^= first qword of the second chunk */
+	lea	r12, QWORD PTR [r10+rbx]	/* r12 = address of the second chunk */
+	mov	r13, QWORD PTR [r10+rbx+8]	/* r13 = its second qword, read before the chunk is overwritten */
+	/* Divisor r9 = (2*sqrt + r14d) | 0x80000001; dividend rdx:rax = 0 : high 64 bits of the AES output. */
+	add	r9d, r14d
+	or	r9d, -2147483647
+	xor	edx, edx
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movq	rax, xmm0
+	/* The quotient/remainder are recombined in xmm registers rather than general-purpose ones. */
+	div	r9
+	movq xmm0, rax
+	movq xmm1, rdx
+	punpckldq xmm0, xmm1	/* low qword of xmm0 = quotient32 | (remainder32 << 32) */
+	movq r15, xmm0	/* r15 = that combined division result */
+	paddq xmm0, xmm5	/* low lane = r15 + r14 */
+	movdqa xmm2, xmm0	/* xmm2 keeps the unshifted sum for the correction path */
+	psrlq xmm0, 12
+	paddq	xmm0, xmm7
+	sqrtsd	xmm1, xmm0	/* square root of the double whose bit pattern is ((r14 + r15) >> 12) + (1023 << 52) */
+	movq	rdi, xmm1
+	test	rdi, 524287	/* low 19 bits all zero: take the out-of-line correction */
+	je	sqrt_fixup_ryzen
+	shr	rdi, 19	/* otherwise rdi = result bits >> 19 */
+
+sqrt_fixup_ryzen_ret:
+	mov	rax, rsi
+	mul	r14	/* rdx:rax = rsi * r14 (unsigned 128-bit product) */
+	movq xmm1, rax
+	movq xmm0, rdx
+	punpcklqdq xmm0, xmm1	/* xmm0 = {rdx, rax} */
+	/* Second 64-byte line: r9d, ecx, r10d = second offset ^ 16 / 32 / 48. */
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	xor rdx, [rcx+rbx]	/* both product halves are XORed with the old chunk^32 */
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0	/* old chunk^16 ^ {product high, product low} */
+	paddq xmm4, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm4	/* chunk^16 <- old chunk^48 + xmm4 */
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2	/* chunk^32 <- (old chunk^16 ^ product) + xmm3 */
+	movdqa	XMMWORD PTR [r10+rbx], xmm1	/* chunk^48 <- old chunk^32 + xmm6 */
+	/* Fold the product into r8/r11, write them to the second chunk, and derive the next offset. */
+	movdqa	xmm4, xmm3
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136
+	movdqa	xmm3, xmm5	/* history shift: xmm4 <- xmm3 <- this round's AES output */
+	dec	ebp
+	jne	main_loop_ryzen
+	/* Exit: put back MXCSR, xmm6-xmm8, rbx/rbp/rsi and the pushed registers. */
+	ldmxcsr DWORD PTR [rsp]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]	/* r11 = rsp as it was before the 64-byte local area was reserved */
+	mov	rbx, QWORD PTR [r11+56]	/* r11+56/64/72 = entry rsp+16/24/32 (five pushes = 40 bytes) */
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_ryzen_endp	/* hop over the correction routine to the end label */
+
+sqrt_fixup_ryzen:	/* reached when the low 19 result bits are zero; leaves the corrected value in rdi */
+	movq r9, xmm2	/* r9 = r14 + r15, the sum that was shifted and passed to sqrtsd */
+	dec	rdi
+	mov edx, -1022
+	shl rdx, 32	/* rdx = 0xFFFFFC02 << 32 */
+	mov	rax, rdi
+	shr	rdi, 19
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	lea	rcx, [rcx+rdx+1]
+	add	rax, rdx
+	imul	rcx, rax
+	sub	rcx, r9	/* CF = 1 when the product is below r9 */
+	adc	rdi, 0	/* rdi = ((bits - 1) >> 19) + CF */
+	jmp	sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp:
--- a/crypto/asm/cn_main_loop.S
+++ b/crypto/asm/cn_main_loop.S
@@ -0,0 +1,54 @@
+#ifdef __APPLE__ /* ALIGN(x) ignores its argument and always requests 64-byte alignment; only the spelling differs per target */
+#   define ALIGN(x) .align 6 /* Apple's assembler reads the .align operand as a power-of-two exponent: 2^6 = 64 */
+#else
+#   define ALIGN(x) .align 64 /* here the operand is a byte count */
+#endif
+.intel_syntax noprefix /* the included .inc bodies are written in Intel operand order without % register prefixes */
+#ifdef __APPLE__
+#   define FN_PREFIX(fn) _ ## fn /* Apple targets: exported symbol gets a leading underscore */
+.text
+#else
+#   define FN_PREFIX(fn) fn /* other targets: symbol name is used unchanged */
+.section .text
+#endif
+.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) /* the four entry points defined below are exported */
+.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
+.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
+.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
+
+ALIGN(64) /* start the entry point on a 64-byte boundary */
+FN_PREFIX(cnv2_mainloop_ivybridge_asm): /* wrapper: takes the context pointer in rdi and runs the ivybridge loop body */
+	sub rsp, 48 /* the included body stores to [rsp+24] before it pushes anything; reserve 48 bytes so that store lands in this frame */
+	mov rcx, rdi /* the included body reads the context pointer from rcx */
+	#include "cn2/cnv2_main_loop_ivybridge.inc"
+	add rsp, 48 /* the body ends at its _endp label with rsp restored, so this undoes the reservation above */
+	ret 0
+	mov eax, 3735929054 /* 0xDEADC0DE; sits after ret, so it is never executed. NOTE(review): presumably an end-of-code marker - confirm what reads it */
+
+ALIGN(64) /* start the entry point on a 64-byte boundary */
+FN_PREFIX(cnv2_mainloop_ryzen_asm): /* wrapper: takes the context pointer in rdi and runs the ryzen loop body */
+	sub rsp, 48 /* the included body stores to [rsp+16], [rsp+24] and [rsp+32] before it pushes anything; reserve 48 bytes so those stores land in this frame */
+	mov rcx, rdi /* the included body reads the context pointer from rcx */
+	#include "cn2/cnv2_main_loop_ryzen.inc"
+	add rsp, 48 /* the body ends at its _endp label with rsp restored, so this undoes the reservation above */
+	ret 0
+	mov eax, 3735929054 /* 0xDEADC0DE; sits after ret, so it is never executed. NOTE(review): presumably an end-of-code marker - confirm what reads it */
+
+ALIGN(64) /* start the entry point on a 64-byte boundary */
+FN_PREFIX(cnv2_mainloop_bulldozer_asm): /* wrapper: takes the context pointer in rdi and runs the bulldozer loop body */
+	sub rsp, 48 /* the included body stores to [rsp+16], [rsp+24] and [rsp+32] before it pushes anything; reserve 48 bytes so those stores land in this frame */
+	mov rcx, rdi /* the included body reads the context pointer from rcx */
+	#include "cn2/cnv2_main_loop_bulldozer.inc"
+	add rsp, 48 /* the body ends at its _endp label with rsp restored, so this undoes the reservation above */
+	ret 0
+	mov eax, 3735929054 /* 0xDEADC0DE; sits after ret, so it is never executed. NOTE(review): presumably an end-of-code marker - confirm what reads it */
+
+ALIGN(64) /* start the entry point on a 64-byte boundary */
+FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): /* wrapper: takes two arguments in rdi and rsi and runs the double-hash sandybridge loop body */
+	sub rsp, 48 /* same 48-byte reservation as the single-hash wrappers in this file */
+	mov rcx, rdi /* first argument is handed to the body in rcx */
+	mov rdx, rsi /* second argument is handed to the body in rdx */
+	#include "cn2/cnv2_double_main_loop_sandybridge.inc"
+	add rsp, 48 /* the body ends at its _endp label; this undoes the reservation above */
+	ret 0
+	mov eax, 3735929054 /* 0xDEADC0DE; sits after ret, so it is never executed. NOTE(review): presumably an end-of-code marker - confirm what reads it */
--- a/crypto/asm/win64/cn_main_loop.S
+++ b/crypto/asm/win64/cn_main_loop.S
@@ -0,0 +1,31 @@
+#define ALIGN(x) .align 64 /* ALIGN(x) ignores its argument and always requests 64-byte alignment */
+.intel_syntax noprefix /* the included .inc bodies are written in Intel operand order without % register prefixes */
+.section .text
+.global cnv2_mainloop_ivybridge_asm /* the four entry points defined below are exported under their plain names */
+.global cnv2_mainloop_ryzen_asm
+.global cnv2_mainloop_bulldozer_asm
+.global cnv2_double_mainloop_sandybridge_asm
+
+ALIGN(64) /* start the entry point on a 64-byte boundary */
+cnv2_mainloop_ivybridge_asm: /* no wrapper code: the include is the whole body; it reads its context pointer from rcx and spills rbx to [rsp+24] */
+	#include "../cn2/cnv2_main_loop_ivybridge.inc"
+	ret 0
+	mov eax, 3735929054 /* 0xDEADC0DE; sits after ret, so it is never executed. NOTE(review): presumably an end-of-code marker - confirm what reads it */
+
+ALIGN(64) /* start the entry point on a 64-byte boundary */
+cnv2_mainloop_ryzen_asm: /* no wrapper code: the include is the whole body; it reads its context pointer from rcx and spills rbx/rbp/rsi to [rsp+16..39] */
+	#include "../cn2/cnv2_main_loop_ryzen.inc"
+	ret 0
+	mov eax, 3735929054 /* 0xDEADC0DE; sits after ret, so it is never executed. NOTE(review): presumably an end-of-code marker - confirm what reads it */
+
+ALIGN(64) /* start the entry point on a 64-byte boundary */
+cnv2_mainloop_bulldozer_asm: /* no wrapper code: the include is the whole body; it reads its context pointer from rcx and spills rbx/rbp/rsi to [rsp+16..39] */
+	#include "../cn2/cnv2_main_loop_bulldozer.inc"
+	ret 0
+	mov eax, 3735929054 /* 0xDEADC0DE; sits after ret, so it is never executed. NOTE(review): presumably an end-of-code marker - confirm what reads it */
+
+ALIGN(64) /* start the entry point on a 64-byte boundary */
+cnv2_double_mainloop_sandybridge_asm: /* no wrapper code: the include is the whole body of the double-hash loop */
+	#include "../cn2/cnv2_double_main_loop_sandybridge.inc"
+	ret 0
+	mov eax, 3735929054 /* 0xDEADC0DE; sits after ret, so it is never executed. NOTE(review): presumably an end-of-code marker - confirm what reads it */
--- a/crypto/oaes_config.h
+++ b/crypto/oaes_config.h
@@ -1,50 +0,0 @@
-/* 
- * ---------------------------------------------------------------------------
- * OpenAES License
- * ---------------------------------------------------------------------------
- * Copyright (c) 2012, Nabil S. Al Ramli, www.nalramli.com
- * All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * 
- *   - Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *   - Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- * ---------------------------------------------------------------------------
- */
-
-#ifndef _OAES_CONFIG_H
-#define _OAES_CONFIG_H
-
-#ifdef __cplusplus 
-extern "C" {
-#endif
-
-//#ifndef OAES_HAVE_ISAAC
-//#define OAES_HAVE_ISAAC 1
-//#endif // OAES_HAVE_ISAAC
-
-//#ifndef OAES_DEBUG
-//#define OAES_DEBUG 0
-//#endif // OAES_DEBUG
-
-#ifdef __cplusplus 
-}
-#endif
-
-#endif // _OAES_CONFIG_H
--- a/crypto/oaes_lib.c
+++ b/crypto/oaes_lib.c
--- a/crypto/oaes_lib.h
+++ b/crypto/oaes_lib.h
@@ -1,214 +0,0 @@
-/* 
- * ---------------------------------------------------------------------------
- * OpenAES License
- * ---------------------------------------------------------------------------
- * Copyright (c) 2012, Nabil S. Al Ramli, www.nalramli.com
- * All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * 
- *   - Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *   - Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- * ---------------------------------------------------------------------------
- */
-
-#ifndef _OAES_LIB_H
-#define _OAES_LIB_H
-
-#include <stdint.h>
-
-#ifdef __cplusplus 
-extern "C" {
-#endif
-
-#ifdef _WIN32
-#	ifdef OAES_SHARED
-#		ifdef oaes_lib_EXPORTS
-#			define OAES_API __declspec(dllexport)
-#		else
-#			define OAES_API __declspec(dllimport)
-#		endif
-#	else
-#		define OAES_API
-#	endif
-#else
-#	define OAES_API
-#endif // WIN32
-
-#define OAES_VERSION "0.8.1"
-#define OAES_BLOCK_SIZE 16
-
-typedef void OAES_CTX;
-
-typedef enum
-{
-	OAES_RET_FIRST = 0,
-	OAES_RET_SUCCESS = 0,
-	OAES_RET_UNKNOWN,
-	OAES_RET_ARG1,
-	OAES_RET_ARG2,
-	OAES_RET_ARG3,
-	OAES_RET_ARG4,
-	OAES_RET_ARG5,
-	OAES_RET_NOKEY,
-	OAES_RET_MEM,
-	OAES_RET_BUF,
-	OAES_RET_HEADER,
-	OAES_RET_COUNT
-} OAES_RET;
-
-/*
- * oaes_set_option() takes one of these values for its [option] parameter
- * some options accept either an optional or a required [value] parameter
- */
-// no option
-#define OAES_OPTION_NONE 0
-// enable ECB mode, disable CBC mode
-#define OAES_OPTION_ECB 1
-// enable CBC mode, disable ECB mode
-// value is optional, may pass uint8_t iv[OAES_BLOCK_SIZE] to specify
-// the value of the initialization vector, iv
-#define OAES_OPTION_CBC 2
-
-#ifdef OAES_DEBUG
-typedef int ( * oaes_step_cb ) (
-		const uint8_t state[OAES_BLOCK_SIZE],
-		const char * step_name,
-		int step_count,
-		void * user_data );
-// enable state stepping mode
-// value is required, must pass oaes_step_cb to receive the state at each step
-#define OAES_OPTION_STEP_ON 4
-// disable state stepping mode
-#define OAES_OPTION_STEP_OFF 8
-#endif // OAES_DEBUG
-
-typedef uint16_t OAES_OPTION;
-
-typedef struct _oaes_key
-{
-  size_t data_len;
-  uint8_t *data;
-  size_t exp_data_len;
-  uint8_t *exp_data;
-  size_t num_keys;
-  size_t key_base;
-} oaes_key;
-
-typedef struct _oaes_ctx
-{
-#ifdef OAES_HAVE_ISAAC
-  randctx * rctx;
-#endif // OAES_HAVE_ISAAC
-
-#ifdef OAES_DEBUG
-  oaes_step_cb step_cb;
-#endif // OAES_DEBUG
-
-  oaes_key * key;
-  OAES_OPTION options;
-  uint8_t iv[OAES_BLOCK_SIZE];
-} oaes_ctx;
-/*
- * // usage:
- * 
- * OAES_CTX * ctx = oaes_alloc();
- * .
- * .
- * .
- * {
- *   oaes_gen_key_xxx( ctx );
- *   {
- *     oaes_key_export( ctx, _buf, &_buf_len );
- *     // or
- *     oaes_key_export_data( ctx, _buf, &_buf_len );\
- *   }
- * }
- * // or
- * {
- *   oaes_key_import( ctx, _buf, _buf_len );
- *   // or
- *   oaes_key_import_data( ctx, _buf, _buf_len );
- * }
- * .
- * .
- * .
- * oaes_encrypt( ctx, m, m_len, c, &c_len );
- * .
- * .
- * .
- * oaes_decrypt( ctx, c, c_len, m, &m_len );
- * .
- * .
- * .
- * oaes_free( &ctx );
- */
-
-OAES_API OAES_CTX * oaes_alloc(void);
-
-OAES_API OAES_RET oaes_free( OAES_CTX ** ctx );
-
-OAES_API OAES_RET oaes_set_option( OAES_CTX * ctx,
-		OAES_OPTION option, const void * value );
-
-OAES_API OAES_RET oaes_key_gen_128( OAES_CTX * ctx );
-
-OAES_API OAES_RET oaes_key_gen_192( OAES_CTX * ctx );
-
-OAES_API OAES_RET oaes_key_gen_256( OAES_CTX * ctx );
-
-// export key with header information
-// set data == NULL to get the required data_len
-OAES_API OAES_RET oaes_key_export( OAES_CTX * ctx,
-		uint8_t * data, size_t * data_len );
-
-// directly export the data from key
-// set data == NULL to get the required data_len
-OAES_API OAES_RET oaes_key_export_data( OAES_CTX * ctx,
-		uint8_t * data, size_t * data_len );
-
-// import key with header information
-OAES_API OAES_RET oaes_key_import( OAES_CTX * ctx,
-		const uint8_t * data, size_t data_len );
-
-// directly import data into key
-OAES_API OAES_RET oaes_key_import_data( OAES_CTX * ctx,
-		const uint8_t * data, size_t data_len );
-
-// set c == NULL to get the required c_len
-OAES_API OAES_RET oaes_encrypt( OAES_CTX * ctx,
-		const uint8_t * m, size_t m_len, uint8_t * c, size_t * c_len );
-
-// set m == NULL to get the required m_len
-OAES_API OAES_RET oaes_decrypt( OAES_CTX * ctx,
-		const uint8_t * c, size_t c_len, uint8_t * m, size_t * m_len );
-
-// set buf == NULL to get the required buf_len
-OAES_API OAES_RET oaes_sprintf(
-		char * buf, size_t * buf_len, const uint8_t * data, size_t data_len );
-
-OAES_API OAES_RET oaes_encryption_round( const uint8_t * key, uint8_t * c );
-
-OAES_API OAES_RET oaes_pseudo_encrypt_ecb( OAES_CTX * ctx, uint8_t * c );
-
-#ifdef __cplusplus 
-}
-#endif
-
-#endif // _OAES_LIB_H
--- a/crypto/soft_aes.h
+++ b/crypto/soft_aes.h
@@ -0,0 +1,131 @@
+/*
+  * This program is free software: you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation, either version 3 of the License, or
+  * any later version.
+  *
+  * This program is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  * GNU General Public License for more details.
+  *
+  * You should have received a copy of the GNU General Public License
+  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  *
+  * Additional permission under GNU GPL version 3 section 7
+  *
+  * If you modify this Program, or any covered work, by linking or combining
+  * it with OpenSSL (or a modified version of that library), containing parts
+  * covered by the terms of OpenSSL License and SSLeay License, the licensors
+  * of this Program grant you additional permission to convey the resulting work.
+  *
+  */
+
+/*
+ * Parts of this file are originally copyright (c) 2014-2017, The Monero Project
+ */
+#pragma once
+
+
+#if defined(XMRIG_ARM)
+#   include "crypto/SSE2NEON.h"
+#elif defined(__GNUC__)
+#   include <x86intrin.h>
+#else
+#   include <intrin.h>
+#endif
+
+#include <inttypes.h>
+
+
+#define saes_data(w) { /* 256-entry AES S-box in index order; each byte is wrapped in w() so one list can build several tables */ \
+    w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
+    w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
+    w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
+    w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
+    w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
+    w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
+    w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
+    w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
+    w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
+    w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
+    w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
+    w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
+    w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
+    w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
+    w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
+    w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
+    w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
+    w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
+    w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
+    w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
+    w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
+    w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
+    w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
+    w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
+    w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
+    w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
+    w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
+    w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
+    w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
+    w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
+    w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
+    w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
+
+#define SAES_WPOLY           0x011b /* AES reduction polynomial x^8 + x^4 + x^3 + x + 1 */
+/* saes_b2w packs four bytes into one 32-bit word, b0 in the least significant byte. */
+#define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
+    ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
+/* GF(2^8) multiply-by-2 / multiply-by-3 and identity; x is parenthesised so an expression argument expands correctly. */
+#define saes_f2(x)   (((x) << 1) ^ ((((x) >> 7) & 1) * SAES_WPOLY))
+#define saes_f3(x)   (saes_f2(x) ^ (x))
+#define saes_h0(x)   (x)
+/* saes_u0..saes_u3 build one lookup-table word from S-box byte p; each is the previous one rotated by a byte. */
+#define saes_u0(p)   saes_b2w(saes_f2(p),          p,          p, saes_f3(p))
+#define saes_u1(p)   saes_b2w(saes_f3(p), saes_f2(p),          p,          p)
+#define saes_u2(p)   saes_b2w(         p, saes_f3(p), saes_f2(p),          p)
+#define saes_u3(p)   saes_b2w(         p,          p, saes_f3(p), saes_f2(p))
+
+__attribute__((aligned(16))) static const uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) }; /* four 256-word lookup tables built from the S-box; storage class now comes first ("static const") */
+__attribute__((aligned(16))) static const uint8_t  saes_sbox[256] = saes_data(saes_h0); /* the plain S-box bytes */
+
+
+static inline __m128i soft_aesenc(__m128i in, __m128i key)
+{
+    /* Table-driven AES round on `in`, XORed with `key`. First split the state into its four 32-bit lanes. */
+    const uint32_t c0 = _mm_cvtsi128_si32(in);
+    const uint32_t c1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
+    const uint32_t c2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA));
+    const uint32_t c3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF));
+
+    /* Each output lane combines one byte from every input lane: byte 0 of lane i, byte 1 of lane i+1, and so on. */
+    const uint32_t r0 = saes_table[0][c0 & 0xff] ^ saes_table[1][(c1 >> 8) & 0xff] ^ saes_table[2][(c2 >> 16) & 0xff] ^ saes_table[3][c3 >> 24];
+    const uint32_t r1 = saes_table[0][c1 & 0xff] ^ saes_table[1][(c2 >> 8) & 0xff] ^ saes_table[2][(c3 >> 16) & 0xff] ^ saes_table[3][c0 >> 24];
+    const uint32_t r2 = saes_table[0][c2 & 0xff] ^ saes_table[1][(c3 >> 8) & 0xff] ^ saes_table[2][(c0 >> 16) & 0xff] ^ saes_table[3][c1 >> 24];
+    const uint32_t r3 = saes_table[0][c3 & 0xff] ^ saes_table[1][(c0 >> 8) & 0xff] ^ saes_table[2][(c1 >> 16) & 0xff] ^ saes_table[3][c2 >> 24];
+
+    return _mm_xor_si128(_mm_set_epi32(r3, r2, r1, r0), key);
+}
+
+static inline uint32_t sub_word(uint32_t key)
+{
+    /* Replace each byte of key with its S-box value. Bytes are widened to uint32_t before shifting: a uint8_t promotes to int, and an S-box value >= 0x80 shifted left by 24 overflows it. */
+    return ((uint32_t) saes_sbox[key >> 24] << 24) |
+        ((uint32_t) saes_sbox[(key >> 16) & 0xff] << 16) |
+        ((uint32_t) saes_sbox[(key >> 8)  & 0xff] << 8) | (uint32_t) saes_sbox[key & 0xff];
+}
+
+#if (defined(__clang__) || defined(XMRIG_ARM)) && !defined(_rotr) /* fallback 32-bit rotate-right; skipped when a header already supplies _rotr as a macro, since the definition below would then be macro-expanded. NOTE(review): confirm against the clang versions you build with */
+static inline uint32_t _rotr(uint32_t value, uint32_t amount)
+{
+    return (value >> (amount & 31)) | (value << ((32 - amount) & 31)); /* both shift counts are reduced modulo 32, so no count can reach the operand width */
+}
+#endif
+
+
+static inline __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon)
+{   /* Software key-generation assist: result lanes 0..3 = S(k1), rotr(S(k1), 8) ^ rcon, S(k3), rotr(S(k3), 8) ^ rcon, where k1/k3 are 32-bit lanes 1 and 3 of key. */
+    const uint32_t lane1 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)));
+    const uint32_t lane3 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)));
+    return _mm_set_epi32(_rotr(lane3, 8) ^ rcon, lane3, _rotr(lane1, 8) ^ rcon, lane1);
+}
--- a/donate.h
+++ b/donate.h
@@ -24,6 +24,6 @@
 #ifndef __DONATE_H__
 #define __DONATE_H__

-#define DONATE_LEVEL 5
+#define DONATE_LEVEL 0 /* NOTE(review): this patch lowers the value from 5 to 0; presumably a donation percentage - confirm the unit against the code that reads it */

 #endif /* __DONATE_H__ */
--- a/mac/cpu_mac.c
+++ b/mac/cpu_mac.c
@@ -0,0 +1,47 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2016-2017 XMRig       <support@xmrig.com>
+ *
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <unistd.h>
+#include <sched.h>
+#include <pthread.h>
+
+#include "cpu.h"
+
+
+struct cpu_info cpu_info = { 0 };
+void cpu_init_common();
+
+
+void cpu_init() { /* Fill in cpu_info: without libcpuid the logical CPU count comes from sysconf(); cpu_init_common() does the rest. */
+#   ifdef XMRIG_NO_LIBCPUID
+    const long cpus = sysconf(_SC_NPROCESSORS_CONF); /* sysconf() returns -1 when the query fails */
+    cpu_info.total_logical_cpus = cpus > 0 ? cpus : 1; /* fall back to one CPU instead of storing a non-positive count */
+#   endif
+    
+    cpu_init_common();
+}
+
+
+int affine_to_cpu_mask(int id, unsigned long mask) /* Stub in this macOS port: both arguments are ignored and no thread affinity is applied. */
+{
+    return 0; /* always 0. NOTE(review): callers that pass an affinity mask get a silent no-op - confirm that is intended */
+}
--- a/mac/memory_mac.c
+++ b/mac/memory_mac.c
@@ -0,0 +1,95 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <mm_malloc.h>
+#include <sys/mman.h>
+#include <mach/vm_statistics.h>
+
+#include "persistent_memory.h"
+#include "options.h"
+#include "utils/applog.h"
+
+char *persistent_memory;
+int persistent_memory_flags = 0;
+
+
+const char * persistent_memory_allocate() { /* Allocate the shared working buffer into persistent_memory and record how in persistent_memory_flags; returns the buffer (NULL if the heap fallback fails). */
+    const int ratio = (opt_double_hash && opt_algo != ALGO_CRYPTONIGHT_LITE) ? 2 : 1;
+    const int size = MEMORY * (opt_n_threads * ratio + 1);
+    persistent_memory_flags |= MEMORY_HUGEPAGES_AVAILABLE;
+    /* With MAP_ANON, macOS takes VM flags in the fd argument; this one asks for 2 MB superpages. */
+    persistent_memory = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
+    /* Superpage mapping refused: fall back to an ordinary 16-byte-aligned heap block (no madvise/mlock). */
+    if (persistent_memory == MAP_FAILED) {
+        persistent_memory = _mm_malloc(size, 16);
+        return persistent_memory;
+    }
+    
+    persistent_memory_flags |= MEMORY_HUGEPAGES_ENABLED;
+    /* madvise() advice values are an enumeration, not bit flags (RANDOM | WILLNEED equals WILLNEED), so issue one call per hint. */
+    if (madvise(persistent_memory, size, MADV_RANDOM) != 0 || madvise(persistent_memory, size, MADV_WILLNEED) != 0) {
+        applog(LOG_ERR, "madvise failed");
+    }
+    /* Locking is best effort: MEMORY_LOCK is only recorded when mlock() succeeds. */
+    if (mlock(persistent_memory, size) == 0) {
+        persistent_memory_flags |= MEMORY_LOCK;
+    }
+    
+    return persistent_memory;
+}
+
+
+void persistent_memory_free() { /* Release the buffer obtained by persistent_memory_allocate(), using the path recorded in persistent_memory_flags. */
+    const int ratio = (opt_double_hash && opt_algo != ALGO_CRYPTONIGHT_LITE) ? 2 : 1; /* same ratio as the allocator; assumes the options have not changed since allocation */
+    const int size = MEMORY * (opt_n_threads * ratio + 1); /* must equal the allocated size, otherwise munlock/munmap cover only part of the mapping */
+    if (persistent_memory_flags & MEMORY_HUGEPAGES_ENABLED) {
+        if (persistent_memory_flags & MEMORY_LOCK) {
+            munlock(persistent_memory, size);
+        }
+        
+        munmap(persistent_memory, size);
+    }
+    else {
+        _mm_free(persistent_memory); /* heap fallback path of the allocator */
+    }
+}
+
+
+void *allocate_executable_memory(size_t size) /* Map `size` bytes of private anonymous memory that is readable, writable and executable. */
+{
+    return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0); /* mmap()'s result is returned unchanged, so failure is MAP_FAILED rather than NULL */
+}
+
+
+void protect_executable_memory(void *p, size_t size) /* Make [p, p + size) read + execute only, dropping write access. */
+{
+    mprotect(p, size, PROT_READ | PROT_EXEC); /* the mprotect() result is discarded, so a failure goes unnoticed */
+}
+
+
+void flush_instruction_cache(void *p, size_t size) /* Invoke the compiler's cache-clearing builtin over [p, p + size). */
+{
+    __builtin___clear_cache((char*) p, (char*)(p) + size); /* arguments are the first byte and one past the last byte of the range */
+}
--- a/mac/xmrig_mac.c
+++ b/mac/xmrig_mac.c
@@ -0,0 +1,91 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2016-2017 XMRig       <support@xmrig.com>
+ *
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "options.h"
+#include "cpu.h"
+#include "utils/applog.h"
+
+
+/**
+ * Handler installed by os_specific_init().
+ *
+ * SIGHUP is only logged; SIGINT and SIGTERM are logged and then end the
+ * process through proper_exit(0). Any other signal is ignored.
+ */
+static void signal_handler(int sig)
+{
+    if (sig == SIGHUP) {
+        applog(LOG_WARNING, "SIGHUP received");
+        return;
+    }
+
+    if (sig == SIGINT) {
+        applog(LOG_WARNING, "SIGINT received, exiting");
+        proper_exit(0);
+        return;
+    }
+
+    if (sig == SIGTERM) {
+        applog(LOG_WARNING, "SIGTERM received, exiting");
+        proper_exit(0);
+    }
+}
+
+
+/**
+ * Terminate the process; `reason` is passed to exit() as the exit status.
+ */
+void proper_exit(int reason)
+{
+    exit(reason);
+}
+
+
+/**
+ * Platform start-up: apply the CPU affinity mask, optionally detach into the
+ * background, and install the signal handlers.
+ *
+ * With opt_background set the process forks; the parent exits with status 0
+ * (or 1 if fork() failed) and the child starts a new session and changes
+ * directory to "/". Failures of setsid()/chdir() are logged but not fatal.
+ * SIGHUP and SIGTERM handlers are installed only in background mode, while
+ * SIGINT is always handled.
+ */
+void os_specific_init()
+{
+    if (opt_affinity != -1) {
+        affine_to_cpu_mask(-1, opt_affinity);
+    }
+
+    if (opt_background) {
+        const int pid = fork();
+        if (pid < 0) {
+            exit(1);
+        }
+        else if (pid > 0) {
+            /* Parent: the child carries on as the background miner. */
+            exit(0);
+        }
+
+        if (setsid() < 0) {
+            applog(LOG_ERR, "setsid() failed (errno = %d)", errno);
+        }
+
+        if (chdir("/") < 0) {
+            applog(LOG_ERR, "chdir() failed (errno = %d)", errno);
+        }
+
+        signal(SIGHUP, signal_handler);
+        signal(SIGTERM, signal_handler);
+    }
+
+    signal(SIGINT, signal_handler);
+}
--- a/memory.c
+++ b/memory.c
@@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@@ -24,11 +25,15 @@
 #include <string.h>

 #include "persistent_memory.h"
+#include "options.h"
+

 static size_t offset = 0;


 void * persistent_calloc(size_t num, size_t size) {
+    size += size % 16;
+
    void *mem = &persistent_memory[offset];
    offset += (num * size);

@@ -36,3 +41,31 @@ void * persistent_calloc(size_t num, size_t size) {

    return mem;
 }
+
+
+/**
+ * Attach a 0x4000-byte executable buffer to `ctx` for generated code.
+ *
+ * The single-hash entry point is placed at the start of the buffer and the
+ * double-hash entry point 0x2000 bytes in. Both cached heights are set to
+ * (uint64_t)-1 and ctx->height is reset to 0.
+ * The result of allocate_executable_memory() is not checked here.
+ */
+void init_cn_r(struct cryptonight_ctx *ctx)
+{
+    const size_t code_size     = 0x4000;
+    const size_t double_offset = 0x2000;
+
+    uint8_t *code = allocate_executable_memory(code_size);
+
+    ctx->generated_code        = (cn_mainloop_fun_ms_abi) code;
+    ctx->generated_code_double = (cn_mainloop_double_fun_ms_abi)(code + double_offset);
+
+    ctx->generated_code_height = ctx->generated_code_double_height = (uint64_t)(-1);
+    ctx->height                = 0;
+}
+
+
+/**
+ * Build the hashing context(s) for worker thread `thr_id`.
+ *
+ * ctx[0] is always created; ctx[1] is created only when opt_double_hash is
+ * set. Each context comes from persistent_calloc() and gets its `memory`
+ * pointer aimed into persistent_memory: the first at
+ * MEMORY * (thr_id * ratio + 1), the second MEMORY (CryptoNight) or
+ * MEMORY_LITE (otherwise) bytes after the first.
+ *
+ * @param ctx    array receiving one or two context pointers.
+ * @param thr_id index of the worker thread.
+ */
+void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id)
+{
+    const int is_cn = (opt_algo == ALGO_CRYPTONIGHT);
+
+    int ratio = 1;
+    if (opt_double_hash && is_cn) {
+        ratio = 2;
+    }
+
+    struct cryptonight_ctx *first = persistent_calloc(1, sizeof(struct cryptonight_ctx));
+    ctx[0]        = first;
+    first->memory = &persistent_memory[MEMORY * (thr_id * ratio + 1)];
+    init_cn_r(first);
+
+    if (!opt_double_hash) {
+        return;
+    }
+
+    struct cryptonight_ctx *second = persistent_calloc(1, sizeof(struct cryptonight_ctx));
+    ctx[1]         = second;
+    second->memory = first->memory + (is_cn ? MEMORY : MEMORY_LITE);
+    init_cn_r(second);
+}
--- a/options.c
+++ b/options.c
@@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@@ -36,25 +37,61 @@
 #include "algo/cryptonight/cryptonight.h"


-int64_t opt_affinity     = -1L;
-int     opt_n_threads    = 0;
-int     opt_algo_variant = 0;
-int     opt_retries      = 5;
-int     opt_retry_pause  = 5;
-int     opt_donate_level = DONATE_LEVEL;
-bool    opt_colors       = true;
-bool    opt_keepalive    = false;
-bool    opt_background   = false;
-char    *opt_url         = NULL;
-char    *opt_backup_url  = NULL;
-char    *opt_userpass    = NULL;
-char    *opt_user        = NULL;
-char    *opt_pass        = NULL;
+int64_t opt_affinity      = -1L;
+int     opt_n_threads     = 0;
+int     opt_retries       = 5;
+int     opt_retry_pause   = 5;
+int     opt_donate_level  = DONATE_LEVEL;
+int     opt_max_cpu_usage = 75;
+bool    opt_colors        = true;
+bool    opt_keepalive     = false;
+bool    opt_background    = false;
+bool    opt_double_hash   = false;
+bool    opt_safe          = false;
+bool    opt_nicehash      = false;
+char    *opt_url          = NULL;
+char    *opt_backup_url   = NULL;
+char    *opt_userpass     = NULL;
+char    *opt_user         = NULL;
+char    *opt_pass         = NULL;
+
+enum Algo opt_algo         = ALGO_CRYPTONIGHT;
+enum Variant opt_variant   = VARIANT_AUTO;
+enum AlgoVariant opt_av    = AV_AUTO;
+enum Assembly opt_assembly = ASM_AUTO;
+
+
+/* One row of the --algo lookup table: two accepted spellings and what they select. */
+struct AlgoData
+{
+    const char *name;      /* long spelling, e.g. "cryptonight/1" */
+    const char *shortName; /* short spelling, e.g. "cn/1" */
+    enum Algo algo;        /* stored in opt_algo when the row matches */
+    enum Variant variant;  /* stored in opt_variant when the row matches */
+};
+
+
+/*
+ * Names accepted by -a/--algo. parse_arg() compares the argument against both
+ * spellings of each row case-insensitively and stops at the first match.
+ * "cryptonight/4" and "cryptonight/r" both select VARIANT_4. The lite rows are
+ * compiled out when XMRIG_NO_AEON is defined.
+ */
+static struct AlgoData const algorithms[] = {
+    { "cryptonight",           "cn",           ALGO_CRYPTONIGHT,       VARIANT_AUTO },
+    { "cryptonight/0",         "cn/0",         ALGO_CRYPTONIGHT,       VARIANT_0    },
+    { "cryptonight/1",         "cn/1",         ALGO_CRYPTONIGHT,       VARIANT_1    },
+    { "cryptonight/2",         "cn/2",         ALGO_CRYPTONIGHT,       VARIANT_2    },
+    { "cryptonight/4",         "cn/4",         ALGO_CRYPTONIGHT,       VARIANT_4    },
+    { "cryptonight/r",         "cn/r",         ALGO_CRYPTONIGHT,       VARIANT_4    },
+
+#   ifndef XMRIG_NO_AEON
+    { "cryptonight-lite",      "cn-lite",      ALGO_CRYPTONIGHT_LITE,  VARIANT_AUTO },
+    { "cryptonight-light",     "cn-light",     ALGO_CRYPTONIGHT_LITE,  VARIANT_AUTO },
+    { "cryptonight-lite/0",    "cn-lite/0",    ALGO_CRYPTONIGHT_LITE,  VARIANT_0    },
+    { "cryptonight-lite/1",    "cn-lite/1",    ALGO_CRYPTONIGHT_LITE,  VARIANT_1    },
+#   endif
+};


 static char const usage[] = "\
 Usage: " APP_ID " [OPTIONS]\n\
 Options:\n\
+  -a, --algo=ALGO       cryptonight (default) or cryptonight-lite\n\
+      --variant=N       cryptonight variant: 0-4\n\
  -o, --url=URL         URL of mining server\n\
  -b, --backup-url=URL  URL of backup mining server\n\
  -O, --userpass=U:P    username:password pair for mining server\n\
@@ -65,11 +102,14 @@ Options:\n\
  -k, --keepalive       send keepalived for prevent timeout (need pool support)\n\
  -r, --retries=N       number of times to retry before switch to backup server (default: 5)\n\
  -R, --retry-pause=N   time to pause between retries (default: 5)\n\
-      --cpu-affinity    set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\
+      --cpu-affinity    set process affinity to CPU core(s), mask 0x3 for cores 0 and 1\n\
      --no-color        disable colored output\n\
      --donate-level=N  donate level, default 5%% (5 minutes in 100 minutes)\n\
  -B, --background      run the miner in the background\n\
  -c, --config=FILE     load a JSON-format configuration file\n\
+      --max-cpu-usage=N maximum CPU usage for automatic threads mode (default 75)\n\
+      --safe            safe adjust threads and av settings for current CPU\n\
+      --nicehash        enable nicehash support\n\
  -h, --help            display this help and exit\n\
  -V, --version         output version information and exit\n\
 ";
@@ -79,42 +119,90 @@ static char const short_options[] = "a:c:khBp:Px:r:R:s:t:T:o:u:O:v:Vb:";


 static struct option const options[] = {
-    { "algo",         1, NULL, 'a'  },
-    { "av",           1, NULL, 'v'  },
-    { "background",   0, NULL, 'B'  },
-    { "backup-url",   1, NULL, 'b'  },
-    { "config",       1, NULL, 'c'  },
-    { "cpu-affinity", 1, NULL, 1020 },
-    { "donate-level", 1, NULL, 1003 },
-    { "help",         0, NULL, 'h'  },
-    { "keepalive",    0, NULL ,'k'  },
-    { "no-color",     0, NULL, 1002 },
-    { "pass",         1, NULL, 'p'  },
-    { "retries",      1, NULL, 'r'  },
-    { "retry-pause",  1, NULL, 'R'  },
-    { "threads",      1, NULL, 't'  },
-    { "url",          1, NULL, 'o'  },
-    { "user",         1, NULL, 'u'  },
-    { "userpass",     1, NULL, 'O'  },
-    { "version",      0, NULL, 'V'  },
-    { 0, 0, 0, 0 }
+    { "algo",          1, NULL, 'a'  },
+    { "av",            1, NULL, 'v'  },
+    { "background",    0, NULL, 'B'  },
+    { "backup-url",    1, NULL, 'b'  },
+    { "config",        1, NULL, 'c'  },
+    { "cpu-affinity",  1, NULL, 1020 },
+    { "donate-level",  1, NULL, 1003 },
+    { "help",          0, NULL, 'h'  },
+    { "keepalive",     0, NULL, 'k'  },
+    { "max-cpu-usage", 1, NULL, 1004 },
+    { "nicehash",      0, NULL, 1006 },
+    { "no-color",      0, NULL, 1002 },
+    { "pass",          1, NULL, 'p'  },
+    { "retries",       1, NULL, 'r'  },
+    { "retry-pause",   1, NULL, 'R'  },
+    { "safe",          0, NULL, 1005 },
+    { "threads",       1, NULL, 't'  },
+    { "url",           1, NULL, 'o'  },
+    { "user",          1, NULL, 'u'  },
+    { "userpass",      1, NULL, 'O'  },
+    { "version",       0, NULL, 'V'  },
+    { "variant",       1, NULL, 1021 },
+    { "asm",           1, NULL, 1022 },
+    { NULL,            0, NULL, 0    }
 };


-static int get_algo_variant(int variant) {
-   if (variant > XMR_VARIANT_AUTO && variant < XMR_VARIANT_MAX) {
-       return variant;
-   }
+/*
+ * Display names indexed directly by enum Algo (see get_current_algo_name()).
+ * The order must follow the enum; the lite entry is absent when
+ * XMRIG_NO_AEON is defined.
+ */
+static const char *algo_names[] = {
+    "cryptonight",
+#   ifndef XMRIG_NO_AEON
+    "cryptonight-lite"
+#   endif
+};

-   if (cpu_info.flags & CPU_FLAG_AES) {
-       if (cpu_info.flags & CPU_FLAG_BMI2) {
-           return XMR_VARIANT_AESNI_BMI2;
-       }

-       return XMR_VARIANT_AESNI;
-   }
+/*
+ * Display names for enum Variant, indexed by (variant + 1) because
+ * VARIANT_AUTO is -1 (see get_current_variant_name()). The last entry is "4"
+ * although VARIANT_4 has the numeric value 3.
+ */
+static const char *variant_names[] = {
+    "auto",
+    "0",
+    "1",
+    "2",
+    "4"
+};

-   return XMR_VARIANT_LEGACY;
+
+/*
+ * Values accepted by --asm. The index of the matching entry is stored in
+ * opt_assembly, so the order must stay in step with enum Assembly
+ * (ASM_NONE, ASM_AUTO, ASM_INTEL, ASM_RYZEN, ASM_BULLDOZER).
+ */
+static const char *asm_names[] = {
+    "none",
+    "auto",
+    "intel",
+    "ryzen",
+    "bulldozer"
+};
+
+
+#ifndef XMRIG_NO_AEON
+/*
+ * Resolve the --av value for CryptoNight-Lite.
+ *
+ * An out-of-range or automatic value becomes a double-hash mode: AV_DOUBLE
+ * when the CPU reports AES, AV_DOUBLE_SOFT otherwise. In --safe mode on a CPU
+ * without AES, a hardware-AES mode (<= AV_DOUBLE) is shifted by 2 to its
+ * software-AES counterpart. Any other value is returned unchanged.
+ */
+static int get_cryptonight_lite_variant(int variant) {
+    const int has_aes     = (cpu_info.flags & CPU_FLAG_AES) != 0;
+    const int explicit_av = (variant > AV_AUTO && variant < AV_MAX);
+
+    if (!explicit_av) {
+        return has_aes ? AV_DOUBLE : AV_DOUBLE_SOFT;
+    }
+
+    if (opt_safe && !has_aes && variant <= AV_DOUBLE) {
+        return variant + 2;
+    }
+
+    return variant;
+}
+#endif
+
+
+/*
+ * Resolve the --av value for the selected algorithm.
+ *
+ * CryptoNight-Lite is delegated to get_cryptonight_lite_variant() unless
+ * XMRIG_NO_AEON is defined. For CryptoNight an out-of-range or automatic
+ * value becomes AV_SINGLE when the CPU reports AES and AV_SINGLE_SOFT
+ * otherwise. In --safe mode on a CPU without AES, a hardware-AES mode
+ * (<= AV_DOUBLE) is shifted by 2 to its software-AES counterpart.
+ */
+static int get_algo_variant(int algo, int variant) {
+#   ifndef XMRIG_NO_AEON
+    if (algo == ALGO_CRYPTONIGHT_LITE) {
+        return get_cryptonight_lite_variant(variant);
+    }
+#   endif
+
+    const int has_aes     = (cpu_info.flags & CPU_FLAG_AES) != 0;
+    const int explicit_av = (variant > AV_AUTO && variant < AV_MAX);
+
+    if (!explicit_av) {
+        return has_aes ? AV_SINGLE : AV_SINGLE_SOFT;
+    }
+
+    if (opt_safe && !has_aes && variant <= AV_DOUBLE) {
+        return variant + 2;
+    }
+
+    return variant;
 }


@@ -129,7 +217,22 @@ static void parse_arg(int key, char *arg) {

    switch (key)
    {
-    case 'a':
+    case 'a': /* --algo */
+        for (size_t i = 0; i < ARRAY_SIZE(algorithms); i++) {
+            if ((strcasecmp(arg, algorithms[i].name) == 0) || (strcasecmp(arg, algorithms[i].shortName) == 0)) {
+                opt_algo    = algorithms[i].algo;
+                opt_variant = algorithms[i].variant;
+                break;
+            }
+        }
+        break;
+
+    case 1022: /* --asm */
+        for (size_t i = 0; i < ARRAY_SIZE(asm_names); i++) {
+            if (strcasecmp(arg, asm_names[i]) == 0) {
+                opt_assembly = i;
+            }
+        }
        break;

    case 'O': /* --userpass */
@@ -200,7 +303,20 @@ static void parse_arg(int key, char *arg) {
        opt_n_threads = v;
        break;

-    case 'k':
+    case 1004: /* --max-cpu-usage */
+        v = atoi(arg);
+        if (v < 1 || v > 100) {
+            show_usage_and_exit(1);
+        }
+
+        opt_max_cpu_usage = v;
+        break;
+
+    case 1005: /* --safe */
+        opt_safe = true;
+        break;
+
+    case 'k': /* --keepalive */
        opt_keepalive = true;
        break;

@@ -230,24 +346,24 @@ static void parse_arg(int key, char *arg) {
        break;
    }

-    case 'B':
+    case 'B': /* --background */
        opt_background = true;
        opt_colors = false;
        break;

    case 'v': /* --av */
        v = atoi(arg);
-        if (v < 0 || v > XMR_VARIANT_MAX) {
+        if (v <= AV_AUTO || v >= AV_MAX) {
            show_usage_and_exit(1);
        }

-        opt_algo_variant = v;
+        opt_av = v;
        break;

    case 1020: /* --cpu-affinity */
        p  = strstr(arg, "0x");
        ul = p ? strtoul(p, NULL, 16) : atol(arg);
-        if (ul > (1UL << cpu_info.count) -1) {
+        if (ul > (1UL << cpu_info.total_logical_cpus) -1) {
            ul = -1;
        }

@@ -258,13 +374,28 @@ static void parse_arg(int key, char *arg) {
        opt_colors = false;
        break;

-    case 1003:
+    case 1003: /* --donate-level */
+//        v = atoi(arg);
+//        if (v < 1 || v > 99) {
+//            show_usage_and_exit(1);
+//        }
+
+//        opt_donate_level = v;
+        break;
+
+    case 1021: /* --variant */
        v = atoi(arg);
-        if (v < 1 || v > 99) {
-            show_usage_and_exit(1);
+        if (v == 4 || strcasecmp(arg, "r") == 0) {
+            opt_variant = VARIANT_4;
+        }
+        else if (v > VARIANT_AUTO && v < VARIANT_MAX) {
+            opt_variant = v;
        }

-        opt_donate_level = v;
+        break;
+
+    case 1006: /* --nicehash */
+        opt_nicehash = true;
        break;

    default:
@@ -275,7 +406,7 @@ static void parse_arg(int key, char *arg) {

 static void parse_config(json_t *config, char *ref)
 {
-    int i;
+    size_t i;
    char buf[16];
    json_t *val;

@@ -336,7 +467,7 @@ static char *parse_url(const char *arg)
        show_usage_and_exit(1);
    }

-    char *dest = malloc(strlen(arg) + 14);
+    char *dest = malloc(strlen(arg) + 16);
    sprintf(dest, "stratum+tcp://%s", arg);

    return dest;
@@ -367,12 +498,12 @@ void parse_cmdline(int argc, char *argv[]) {
    }

    if (!opt_url) {
-        opt_url = strdup("stratum+tcp://proxy.xmrig.com:443");
-        opt_keepalive = true;
+        applog_notime(LOG_ERR, "No pool URL supplied. Exiting.\n", argv[0]);
+        proper_exit(1);
+    }

-        if (!opt_backup_url) {
-            opt_backup_url = strdup("stratum+tcp://failover.xmrig.com:80");
-        }
+    if (strstr(opt_url, ".nicehash.com:") != NULL) {
+        opt_nicehash = true;
    }

    if (!opt_userpass) {
@@ -384,20 +515,23 @@ void parse_cmdline(int argc, char *argv[]) {
        sprintf(opt_userpass, "%s:%s", opt_user, opt_pass);
    }

+    opt_av = get_algo_variant(opt_algo, opt_av);
+
+    if (!cryptonight_init(opt_av)) {
+        applog(LOG_ERR, "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
+        proper_exit(1);
+    }
+
    if (!opt_n_threads) {
-        opt_n_threads = get_optimal_threads_count();
+        opt_n_threads = get_optimal_threads_count(opt_algo, opt_double_hash, opt_max_cpu_usage);
    }

-    opt_algo_variant = get_algo_variant(opt_algo_variant);
-    if (!opt_algo_variant) {
-        opt_algo_variant = get_algo_variant(0);
+    if (opt_safe) {
+        const int count = get_optimal_threads_count(opt_algo, opt_double_hash, opt_max_cpu_usage);
+        if (opt_n_threads > count) {
+            opt_n_threads = count;
+        }
    }
-
-    if (opt_donate_level < 1 || opt_donate_level > 99) {
-        opt_donate_level = 1;
-    }
-
-    cryptonight_init(opt_algo_variant);
 }


@@ -439,3 +573,14 @@ void show_version_and_exit(void) {
    #endif
    proper_exit(0);
 }
+
+
+/* Return the display name of the selected algorithm (opt_algo). */
+const char *get_current_algo_name(void)
+{
+    const char *name = algo_names[opt_algo];
+
+    return name;
+}
+
+
+/*
+ * Return the display name of the selected variant (opt_variant).
+ * The index is shifted by one because VARIANT_AUTO is -1.
+ */
+const char *get_current_variant_name(void)
+{
+    const int index = opt_variant + 1;
+
+    return variant_names[index];
+}
--- a/options.h
+++ b/options.h
@@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@@ -21,48 +22,84 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __OPTIONS_H__
-#define __OPTIONS_H__
+#ifndef XMRIG_OPTIONS_H
+#define XMRIG_OPTIONS_H

 #include <stdbool.h>
 #include <stdint.h>

+
 #ifndef ARRAY_SIZE
 #   define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 #endif


-enum xmr_algo_variant {
-    XMR_VARIANT_AUTO,
-    XMR_VARIANT_AESNI,
-    XMR_VARIANT_AESNI_WOLF,
-    XMR_VARIANT_AESNI_BMI2,
-    XMR_VARIANT_LEGACY,
-    XMR_VARIANT_EXPERIMENTAL,
-    XMR_VARIANT_MAX
+/* Hash algorithm family; the values are used as indices into algo_names[] in options.c. */
+enum Algo {
+    ALGO_CRYPTONIGHT,      /* CryptoNight (Monero) */
+    ALGO_CRYPTONIGHT_LITE, /* CryptoNight-Lite (AEON) */
+};
+
+
+/*
+ * Algorithm variant selection (opt_variant).
+ * The numeric value is not always the variant number: VARIANT_4 is 3, so code
+ * converting user input must map "4" explicitly rather than cast it.
+ */
+enum Variant {
+    VARIANT_AUTO = -1, /* no explicit variant selected */
+    VARIANT_0    = 0,
+    VARIANT_1    = 1,
+    VARIANT_2    = 2,
+    VARIANT_4    = 3,  /* variant 4, stored as 3 */
+    VARIANT_MAX        /* one past the last valid value; used for range checks */
+};
+
+
+/*
+ * Hashing mode selected with --av. Each software-AES mode is its
+ * hardware-AES counterpart plus 2; the --safe fallback in options.c relies
+ * on that spacing.
+ */
+enum AlgoVariant {
+    AV_AUTO,        // --av=0 Automatic mode.
+    AV_SINGLE,      // --av=1  Single hash mode
+    AV_DOUBLE,      // --av=2  Double hash mode
+    AV_SINGLE_SOFT, // --av=3  Single hash mode (Software AES)
+    AV_DOUBLE_SOFT, // --av=4  Double hash mode (Software AES)
+    AV_MAX          // one past the last valid mode; used for range checks
+};
+
+
+/*
+ * Assembly code path selected with --asm (opt_assembly).
+ * The order must match asm_names[] in options.c, whose index is stored
+ * directly into opt_assembly.
+ */
+enum Assembly {
+    ASM_NONE,
+    ASM_AUTO,
+    ASM_INTEL,
+    ASM_RYZEN,
+    ASM_BULLDOZER,
+    ASM_MAX
 };


 extern bool opt_colors;
 extern bool opt_keepalive;
 extern bool opt_background;
+extern bool opt_double_hash;
+extern bool opt_safe;
+extern bool opt_nicehash;
 extern char *opt_url;
 extern char *opt_backup_url;
 extern char *opt_userpass;
 extern char *opt_user;
 extern char *opt_pass;
 extern int opt_n_threads;
-extern int opt_algo_variant;
 extern int opt_retry_pause;
 extern int opt_retries;
 extern int opt_donate_level;
+extern int opt_max_cpu_usage;
 extern int64_t opt_affinity;

+extern enum Algo opt_algo;
+extern enum Variant opt_variant;
+extern enum AlgoVariant opt_av;
+extern enum Assembly opt_assembly;
+
 void parse_cmdline(int argc, char *argv[]);
 void show_usage_and_exit(int status);
 void show_version_and_exit(void);
+const char *get_current_algo_name(void);
+const char *get_current_variant_name(void);

 extern void proper_exit(int reason);


-#endif /* __OPTIONS_H__ */
+#endif /* XMRIG_OPTIONS_H */
--- a/persistent_memory.h
+++ b/persistent_memory.h
@@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@@ -21,12 +22,16 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __PERSISTENT_MEMORY_H__
-#define __PERSISTENT_MEMORY_H__
+#ifndef XMRIG_PERSISTENT_MEMORY_H
+#define XMRIG_PERSISTENT_MEMORY_H
+

 #include <stddef.h>


+#include "algo/cryptonight/cryptonight.h"
+
+
 enum memory_flags {
    MEMORY_HUGEPAGES_AVAILABLE = 1,
    MEMORY_HUGEPAGES_ENABLED   = 2,
@@ -34,7 +39,7 @@ enum memory_flags {
 };


-#define TWO_MB_PAGE 2097152
+#define MEMORY 2097152


 extern char *persistent_memory;
@@ -43,7 +48,15 @@ extern int persistent_memory_flags;

 const char * persistent_memory_allocate();
 void persistent_memory_free();
-void * persistent_calloc(size_t num, size_t size);
+void *persistent_calloc(size_t num, size_t size);
+void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id);


-#endif /* __PERSISTENT_MEMORY_H__ */
+void *allocate_executable_memory(size_t size);
+void flush_instruction_cache(void *p, size_t size);
+void init_cn_r(struct cryptonight_ctx *ctx);
+void protect_executable_memory(void *p, size_t size);
+
+
+
+#endif /* XMRIG_PERSISTENT_MEMORY_H */
--- a/stratum.c
+++ b/stratum.c
@@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@@ -36,11 +37,16 @@
 #   include <poll.h>
 #endif

-#include "stratum.h"
-#include "version.h"
+#ifdef __APPLE_CC__
+#   include <netinet/in.h>
+#endif
+
+#include "options.h"
 #include "stats.h"
+#include "stratum.h"
 #include "util.h"
 #include "utils/applog.h"
+#include "version.h"


 #ifdef WIN32
@@ -58,6 +64,9 @@
 #define unlikely(expr) (__builtin_expect(!!(expr), 0))


+static struct work work;
+
+
 static bool send_line(curl_socket_t sock, char *s);
 static bool socket_full(curl_socket_t sock, int timeout);
 static void buffer_append(struct stratum_ctx *sctx, const char *s);
@@ -66,7 +75,8 @@ static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, curlsocktype p
 static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, struct curl_sockaddr *addr);
 static int closesocket_cb(void *clientp, curl_socket_t item);
 static bool login_decode(struct stratum_ctx *sctx, const json_t *val);
-static bool job_decode(struct stratum_ctx *sctx, const json_t *job);
+static void extensions_decode(const json_t *val);
+static bool job_decode(const json_t *job);
 static bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen);


@@ -235,13 +245,19 @@ bool stratum_handle_response(char *buf) {
    json_t *id_val = json_object_get(val, "id");

    if (!id_val || json_is_null(id_val) || !res_val) {
-       json_decref(val);
-       return false;
+        const char* message;
+
+        if (json_is_object(err_val) && (message = json_string_value(json_object_get(err_val, "message")))) {
+            applog(LOG_ERR, "error: \"%s\"", message);
+        }
+
+        json_decref(val);
+        return false;
    }

    json_t *status = json_object_get(res_val, "status");

-    if (!strcmp(json_string_value(status), "KEEPALIVED") ) {
+    if (status && !strcmp(json_string_value(status), "KEEPALIVED") ) {
        applog(LOG_DEBUG, "Keepalived receveid");
        json_decref(val);
        return true;
@@ -285,7 +301,6 @@ bool stratum_keepalived(struct stratum_ctx *sctx)
 bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass)
 {
    char *sret;
-    json_error_t err;

    char *req = malloc(128 + strlen(user) + strlen(pass));
    sprintf(req, "{\"method\":\"login\",\"params\":{\"login\":\"%s\",\"pass\":\"%s\",\"agent\":\"%s/%s\"},\"id\":1}", user, pass, APP_NAME, APP_VERSION);
@@ -321,19 +336,24 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p
    json_t *error  = json_object_get(val, "error");

    if (!result || json_is_false(result) || (error && !json_is_null(error)))  {
-        applog(LOG_ERR, "Stratum authentication failed");
+        const char* message;
+
+        if (json_is_object(error) && (message = json_string_value(json_object_get(error, "message")))) {
+            applog(LOG_ERR, "Stratum authentication failed: \"%s\"", message);
+        }
+        else {
+            applog(LOG_ERR, "Stratum authentication failed");
+        }
+
        json_decref(val);
        return false;
    }

-    login_decode(sctx, val);
-    json_t *job = json_object_get(result, "job");
-
-    pthread_mutex_lock(&sctx->work_lock);
-    if (job) {
-        job_decode(sctx, job);
+    if (login_decode(sctx, val) && job(sctx, json_object_get(result, "job"))) {
+        pthread_mutex_lock(&sctx->sock_lock);
+        sctx->ready = true;
+        pthread_mutex_unlock(&sctx->sock_lock);
    }
-    pthread_mutex_unlock(&sctx->work_lock);

    json_decref(val);
    return true;
@@ -492,11 +512,20 @@ static void buffer_append(struct stratum_ctx *sctx, const char *s)
 */
 static bool job(struct stratum_ctx *sctx, json_t *params)
 {
-    bool ret = false;
+    if (!job_decode(params)) {
+        return false;
+    }
+
    pthread_mutex_lock(&sctx->work_lock);
-    ret = job_decode(sctx, params);
+
+    if (sctx->work.target != work.target) {
+        stats_set_target(work.target);
+    }
+
+    memcpy(&sctx->work, &work, sizeof(struct work));
    pthread_mutex_unlock(&sctx->work_lock);
-    return ret;
+
+    return true;
 }


@@ -515,31 +544,36 @@ static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, curlsocktype p
    int tcp_keepintvl = 50;

 #ifndef WIN32
-    if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive,
-        sizeof(keepalive))))
+    if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, sizeof(keepalive)))) {
        return 1;
-#ifdef __linux
-    if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT,
-        &tcp_keepcnt, sizeof(tcp_keepcnt))))
+    }
+    
+#   ifdef __linux
+    if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &tcp_keepcnt, sizeof(tcp_keepcnt)))) {
        return 1;
-    if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE,
-        &tcp_keepidle, sizeof(tcp_keepidle))))
+    }
+    
+    if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &tcp_keepidle, sizeof(tcp_keepidle)))) {
        return 1;
-    if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL,
-        &tcp_keepintvl, sizeof(tcp_keepintvl))))
+    }
+    
+    if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &tcp_keepintvl, sizeof(tcp_keepintvl)))) {
        return 1;
-#endif /* __linux */
-#ifdef __APPLE_CC__
-    if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE,
-        &tcp_keepintvl, sizeof(tcp_keepintvl))))
+    }
+#   endif /* __linux */
+    
+#   ifdef __APPLE_CC__
+    if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, &tcp_keepintvl, sizeof(tcp_keepintvl)))) {
        return 1;
-#endif /* __APPLE_CC__ */
+    }
+#   endif /* __APPLE_CC__ */
 #else /* WIN32 */
    struct tcp_keepalive vals;
    vals.onoff = 1;
    vals.keepalivetime = tcp_keepidle * 1000;
    vals.keepaliveinterval = tcp_keepintvl * 1000;
    DWORD outputBytes;
+    
    if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), NULL, 0, &outputBytes, NULL, NULL))) {
        return 1;
    }
@@ -584,33 +618,23 @@ static bool login_decode(struct stratum_ctx *sctx, const json_t *val) {
        return false;
    }

-    json_t *tmp = json_object_get(res, "id");
-    if (!tmp) {
+    const char *id = json_string_value(json_object_get(res, "id"));
+    if (!id || strlen(id) >= (sizeof(sctx->id))) {
        applog(LOG_ERR, "JSON invalid id");
        return false;
    }

-    const char *id = json_string_value(tmp);
-    if (!id) {
-        applog(LOG_ERR, "JSON id is not a string");
-        return false;
-    }
+    memset(&sctx->id, 0, sizeof(sctx->id));
+    memcpy(&sctx->id, id, strlen(id));

-    memcpy(&sctx->id, id, 64);
-
-    pthread_mutex_lock(&sctx->sock_lock);
-    sctx->ready = true;
-    pthread_mutex_unlock(&sctx->sock_lock);
-
-    tmp = json_object_get(res, "status");
-    if (!tmp) {
-        applog(LOG_ERR, "JSON invalid status");
-        return false;
-    }
-
-    const char *s = json_string_value(tmp);
+    const char *s = json_string_value(json_object_get(res, "status"));
    if (!s) {
-        applog(LOG_ERR, "JSON status is not a string");
+        // Workaround for xmrig-proxy bug https://github.com/xmrig/xmrig-proxy/commit/dfa1960fe3eeb13f80717b7dbfcc7c6e9f222d89
+        s = json_string_value(json_object_get(val, "status"));
+    }
+
+    if (!s) {
+        applog(LOG_ERR, "JSON invalid status");
        return false;
    }

@@ -619,10 +643,31 @@ static bool login_decode(struct stratum_ctx *sctx, const json_t *val) {
        return false;
    }

+    extensions_decode(res);
+
    return true;
 }


+/**
+ * Scan the "extensions" array of a login result and enable the features the
+ * server advertises.
+ *
+ * Only the "nicehash" extension is recognised: when it is listed,
+ * opt_nicehash is set. A missing or empty array leaves the options untouched.
+ *
+ * @param res the "result" object of the login response.
+ */
+static void extensions_decode(const json_t *res)
+{
+    json_t *extensions = json_object_get(res, "extensions");
+    if (!extensions || json_array_size(extensions) == 0) {
+        return;
+    }
+
+    size_t index;
+    json_t *value;
+
+    json_array_foreach(extensions, index, value) {
+        const char *s = json_string_value(value);
+
+        /* strcmp() returns 0 on equality, so the match must be tested with == 0. */
+        if (s && strcmp(s, "nicehash") == 0) {
+            opt_nicehash = true;
+        }
+    }
+}
+
+
 /**
 * @brief job_decode
 * @param sctx
@@ -630,46 +675,42 @@ static bool login_decode(struct stratum_ctx *sctx, const json_t *val) {
 * @param work
 * @return
 */
-static bool job_decode(struct stratum_ctx *sctx, const json_t *job) {
-    json_t *tmp = json_object_get(job, "job_id");
-    if (!tmp) {
+static bool job_decode(const json_t *job) {
+    const char *job_id = json_string_value(json_object_get(job, "job_id"));
+    if (!job_id || strlen(job_id) >= sizeof(work.job_id)) {
        applog(LOG_ERR, "JSON invalid job id");
        return false;
    }

-    const char *job_id = json_string_value(tmp);
-    tmp = json_object_get(job, "blob");
-    if (!tmp) {
+    const char *blob = json_string_value(json_object_get(job, "blob"));
+    if (!blob) {
        applog(LOG_ERR, "JSON invalid blob");
        return false;
    }

-    const char *hexblob = json_string_value(tmp);
-    if (!hexblob || strlen(hexblob) != 152) {
+    work.blob_size = strlen(blob);
+    if (work.blob_size % 2 != 0) {
        applog(LOG_ERR, "JSON invalid blob length");
        return false;
    }

-    if (!hex2bin(sctx->blob, hexblob, 76)) {
-        applog(LOG_ERR, "JSON inval blob");
+    work.blob_size /= 2;
+    if (work.blob_size < 76 || work.blob_size > (sizeof(work.blob))) {
+        applog(LOG_ERR, "JSON invalid blob length");
        return false;
    }

-    uint32_t target;
-    jobj_binary(job, "target", &target, 4);
-
-    if (sctx->target != target) {
-        stats_set_target(target);
-        sctx->target = target;
+    if (!hex2bin((unsigned char *) work.blob, blob, work.blob_size)) {
+        applog(LOG_ERR, "JSON invalid blob");
+        return false;
    }

-    memcpy(sctx->work.data, sctx->blob, 76);
-    memset(sctx->work.target, 0xff, sizeof(sctx->work.target));
+    jobj_binary(job, "target", &work.target, 4);

-    sctx->work.target[7] = sctx->target;
+    memset(work.job_id, 0, sizeof(work.job_id));
+    memcpy(work.job_id, job_id, strlen(job_id));

-    free(sctx->work.job_id);
-    sctx->work.job_id = strdup(job_id);
+    work.height = (uint64_t) json_integer_value(json_object_get(job, "height"));

    return true;
 }
@@ -699,6 +740,7 @@ static bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t bu
        return false;
    }

+
    if (!hex2bin(buf, hexstr, buflen)) {
        return false;
    }
--- a/stratum.h
+++ b/stratum.h
@@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@@ -21,22 +22,28 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __STRATUM_H__
-#define __STRATUM_H__
+#ifndef XMRIG_STRATUM_H
+#define XMRIG_STRATUM_H
+

 #include <stdbool.h>
 #include <inttypes.h>
 #include <curl/curl.h>


+/**
+ * 128tx exploit.
+ *
+ * Max blob size is 84 (75 fixed + 9 variable), aligned to 96.
+ * https://github.com/xmrig/xmrig/issues/1 Thanks fireice-uk.
+ */
 struct work {
-    uint32_t data[19];
-    uint32_t target[8];
-    uint32_t hash[8];
-
-    char *job_id;
-    size_t xnonce2_len;
-    unsigned char *xnonce2;
+    uint32_t blob[21] __attribute__((aligned(16))); /* storage for the hex-decoded job blob: 21 * 4 = 84 bytes */
+    size_t blob_size  __attribute__((aligned(16))); /* number of decoded bytes currently held in blob */
+    uint32_t target   __attribute__((aligned(16))); /* 4 bytes decoded from the job's "target" field */
+    uint32_t hash[8]  __attribute__((aligned(16))); /* NOTE(review): presumably the 32-byte hash result -- not written in the code visible here; confirm */
+    char job_id[64]   __attribute__((aligned(16))); /* NUL-terminated job id, at most 63 characters */
+    uint64_t height;                                /* value of the job's "height" field */
 };


@@ -53,8 +60,6 @@ struct stratum_ctx {
    bool ready;

    char id[64];
-    char blob[76];
-    uint32_t target;

    struct work work;
    struct work g_work;
@@ -73,4 +78,4 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
 bool stratum_handle_response(char *buf);
 bool stratum_keepalived(struct stratum_ctx *sctx);

-#endif /* __STRATUM_H__ */
+#endif /* XMRIG_STRATUM_H */
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -4,4 +4,6 @@ cmake_minimum_required(VERSION 3.0)
 include(CTest)

 add_subdirectory(unity)
-add_subdirectory(cryptonight)
+add_subdirectory(cryptonight)
+add_subdirectory(cryptonight_lite)
+add_subdirectory(autoconf)
--- a/test/autoconf/CMakeLists.txt
+++ b/test/autoconf/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Builds the autoconf unit test from autoconf.c and the project's cpu.c.
+add_executable(autoconf_app
+    autoconf.c
+    ../../cpu.h
+    ../../cpu.c
+   )
+
+# Every setting is attached to the target, so nothing leaks into other
+# targets that may later be added to this directory.
+target_link_libraries(autoconf_app PRIVATE unity)
+target_include_directories(autoconf_app PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+target_compile_definitions(autoconf_app PRIVATE BUILD_TEST)
+target_compile_options(autoconf_app PRIVATE
+    -fno-strict-aliasing
+    $<$<CONFIG:Release>:-O2>
+   )
+
+# NAME/COMMAND form lets CMake resolve the target to the built executable.
+add_test(NAME autoconf_test COMMAND autoconf_app)
--- a/test/autoconf/autoconf.c
+++ b/test/autoconf/autoconf.c
@@ -0,0 +1,152 @@
+#include <unity.h>
+
+#include "cpu.h"
+#include "options.h"
+
+struct cpu_info cpu_info = { 0 };
+
+
+/* Loads the global cpu_info with a synthetic topology for the test that calls it. */
+static void set_cpu_info(int total_logical_cpus, int l2_cache, int l3_cache) {
+    /* Cache sizes as passed in by the caller. */
+    cpu_info.l3_cache           = l3_cache;
+    cpu_info.l2_cache           = l2_cache;
+
+    /* The core count is set to the same value as the logical CPU count. */
+    cpu_info.total_logical_cpus = total_logical_cpus;
+    cpu_info.total_cores        = total_logical_cpus;
+}
+
+
+/*
+ * Thread-count expectations for the synthetic CPU configured on the first line
+ * (8 logical CPUs, L2 1024, L3 8192), ending with a sweep of the last argument
+ * from 75 down to 5 for CryptoNight-Lite.
+ * NOTE(review): the arguments look like (algorithm, double-hash flag, max CPU
+ * usage in percent) -- confirm against get_optimal_threads_count().
+ */
+void test_autoconf_should_GetOptimalThreadsCounti7(void) {
+    set_cpu_info(8, 1024, 8192); // 4C/8T 8 MB (Generic i7 CPU)
+
+    TEST_ASSERT_EQUAL_INT(4, get_optimal_threads_count(ALGO_CRYPTONIGHT, false, 100));
+    TEST_ASSERT_EQUAL_INT(2, get_optimal_threads_count(ALGO_CRYPTONIGHT, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(8, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 100));
+    TEST_ASSERT_EQUAL_INT(4, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(6, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 75));
+    TEST_ASSERT_EQUAL_INT(5, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 60));
+    TEST_ASSERT_EQUAL_INT(4, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 50));
+    TEST_ASSERT_EQUAL_INT(3, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 35));
+    TEST_ASSERT_EQUAL_INT(2, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 20));
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 5));
+}
+
+
+/*
+ * Thread-count expectations for the synthetic CPU configured on the first line
+ * (4 logical CPUs, L2 1024, L3 6144), checked with the last argument at 100 and 75.
+ * NOTE(review): the arguments look like (algorithm, double-hash flag, max CPU
+ * usage in percent) -- confirm against get_optimal_threads_count().
+ */
+void test_autoconf_should_GetOptimalThreadsCounti5(void) {
+    set_cpu_info(4, 1024, 6144); // 2C/4T 6 MB (Generic i5 CPU)
+
+    TEST_ASSERT_EQUAL_INT(3, get_optimal_threads_count(ALGO_CRYPTONIGHT, false, 100));
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(3, get_optimal_threads_count(ALGO_CRYPTONIGHT, false, 75));
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT, true, 75));
+
+    TEST_ASSERT_EQUAL_INT(4, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 100));
+    TEST_ASSERT_EQUAL_INT(3, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(3, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 75));
+    TEST_ASSERT_EQUAL_INT(3, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, true, 75));
+}
+
+
+/*
+ * Thread-count expectations for the synthetic CPU configured on the first line
+ * (4 logical CPUs, L2 512, L3 3072), checked with the last argument at 100 and 75.
+ * NOTE(review): the arguments look like (algorithm, double-hash flag, max CPU
+ * usage in percent) -- confirm against get_optimal_threads_count().
+ */
+void test_autoconf_should_GetOptimalThreadsCounti3(void) {
+    set_cpu_info(4, 512, 3072); // 2C/4T 3 MB (Generic i3 CPU)
+
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT, false, 100));
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT, false, 75));
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT, true, 75));
+
+    TEST_ASSERT_EQUAL_INT(3, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 100));
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(3, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 75));
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, true, 75));
+}
+
+
+/*
+ * Thread-count expectations for the synthetic CPU configured on the first line
+ * (16 logical CPUs, L2 4096, L3 16384), checked with the last argument at 100 and 75.
+ * NOTE(review): the arguments look like (algorithm, double-hash flag, max CPU
+ * usage in percent) -- confirm against get_optimal_threads_count().
+ */
+void test_autoconf_should_GetOptimalThreadsCountR7(void) {
+    set_cpu_info(16, 4096, 16384); // 8C/16T 16 MB (AMD Ryzen 7)
+
+    TEST_ASSERT_EQUAL_INT(8, get_optimal_threads_count(ALGO_CRYPTONIGHT, false, 100));
+    TEST_ASSERT_EQUAL_INT(4, get_optimal_threads_count(ALGO_CRYPTONIGHT, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(8, get_optimal_threads_count(ALGO_CRYPTONIGHT, false, 75));
+    TEST_ASSERT_EQUAL_INT(4, get_optimal_threads_count(ALGO_CRYPTONIGHT, true, 75));
+
+    TEST_ASSERT_EQUAL_INT(16, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 100));
+    TEST_ASSERT_EQUAL_INT(8, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(12, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 75));
+    TEST_ASSERT_EQUAL_INT(8, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, true, 75));
+}
+
+
+/*
+ * Thread-count expectations for the synthetic CPU configured on the first line
+ * (16 logical CPUs, L2 2048, L3 24576), checked with the last argument at 100 and 75.
+ * NOTE(review): the arguments look like (algorithm, double-hash flag, max CPU
+ * usage in percent) -- confirm against get_optimal_threads_count().
+ */
+void test_autoconf_should_GetOptimalThreadsCountTwoE5620(void) {
+    set_cpu_info(16, 2048, 24576); // 8C/16T 24 MB (Two E5620)
+
+    TEST_ASSERT_EQUAL_INT(12, get_optimal_threads_count(ALGO_CRYPTONIGHT, false, 100));
+    TEST_ASSERT_EQUAL_INT(6, get_optimal_threads_count(ALGO_CRYPTONIGHT, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(12, get_optimal_threads_count(ALGO_CRYPTONIGHT, false, 75));
+    TEST_ASSERT_EQUAL_INT(6, get_optimal_threads_count(ALGO_CRYPTONIGHT, true, 75));
+
+    TEST_ASSERT_EQUAL_INT(16, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 100));
+    TEST_ASSERT_EQUAL_INT(12, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(12, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 75));
+    TEST_ASSERT_EQUAL_INT(12, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, true, 75));
+}
+
+
+/*
+ * With a single logical CPU (L2 1024, L3 15360) every combination of algorithm,
+ * flag and last argument is expected to yield exactly one thread.
+ * NOTE(review): the arguments look like (algorithm, double-hash flag, max CPU
+ * usage in percent) -- confirm against get_optimal_threads_count().
+ */
+void test_autoconf_should_GetOptimalThreadsCountVCPU(void) {
+    set_cpu_info(1, 1024, 15360); // 1C/1T 15 MB (Single core Virtual CPU)
+
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT, false, 100));
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT, false, 75));
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT, true, 75));
+
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 100));
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 75));
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, true, 75));
+}
+
+
+/*
+ * Thread-count expectations when the L3 size is reported as 0 (8 logical CPUs,
+ * L2 8192); the expected values are the same as in the i7 test.
+ * NOTE(review): the arguments look like (algorithm, double-hash flag, max CPU
+ * usage in percent) -- confirm against get_optimal_threads_count().
+ */
+void test_autoconf_should_GetOptimalThreadsCountNoL3(void) {
+    set_cpu_info(8, 8192, 0); // 4C/8T (Multi core Virtual CPU without L3 cache)
+
+    TEST_ASSERT_EQUAL_INT(4, get_optimal_threads_count(ALGO_CRYPTONIGHT, false, 100));
+    TEST_ASSERT_EQUAL_INT(2, get_optimal_threads_count(ALGO_CRYPTONIGHT, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(8, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 100));
+    TEST_ASSERT_EQUAL_INT(4, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, true, 100));
+
+    TEST_ASSERT_EQUAL_INT(6, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 75));
+    TEST_ASSERT_EQUAL_INT(5, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 60));
+    TEST_ASSERT_EQUAL_INT(4, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 50));
+    TEST_ASSERT_EQUAL_INT(3, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 35));
+    TEST_ASSERT_EQUAL_INT(2, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 20));
+    TEST_ASSERT_EQUAL_INT(1, get_optimal_threads_count(ALGO_CRYPTONIGHT_LITE, false, 5));
+}
+
+
+/* Unity entry point: runs each autoconf test exactly once and returns Unity's result code. */
+int main(void)
+{
+    UNITY_BEGIN();
+
+    RUN_TEST(test_autoconf_should_GetOptimalThreadsCounti7);
+    RUN_TEST(test_autoconf_should_GetOptimalThreadsCounti5);
+    RUN_TEST(test_autoconf_should_GetOptimalThreadsCounti3);
+    RUN_TEST(test_autoconf_should_GetOptimalThreadsCountR7);
+    RUN_TEST(test_autoconf_should_GetOptimalThreadsCountTwoE5620);
+    RUN_TEST(test_autoconf_should_GetOptimalThreadsCountVCPU);
+    RUN_TEST(test_autoconf_should_GetOptimalThreadsCountNoL3);
+
+    return UNITY_END();
+}
--- a/test/cryptonight/CMakeLists.txt
+++ b/test/cryptonight/CMakeLists.txt
@@ -1,44 +1,28 @@
 set(SOURCES
+    cryptonight.c
+    ../../options.h
    ../../algo/cryptonight/cryptonight.h
-    ../../algo/cryptonight/cryptonight_common.c
-    ../../algo/cryptonight/cryptonight_av4_legacy.c
+    ../../algo/cryptonight/cryptonight.c
+    ../../algo/cryptonight/cryptonight_av1_aesni.c
+    ../../algo/cryptonight/cryptonight_av2_aesni_double.c
+    ../../algo/cryptonight/cryptonight_av3_softaes.c
+    ../../algo/cryptonight/cryptonight_av4_softaes_double.c
    ../../crypto/c_keccak.c
    ../../crypto/c_blake256.c
    ../../crypto/c_groestl.c
    ../../crypto/c_jh.c
    ../../crypto/c_skein.c
-    ../../crypto/oaes_config.h
-    ../../crypto/oaes_lib.h
-    ../../crypto/oaes_lib.c
-    ../../crypto/aesb.c
+    ../../crypto/soft_aes.c
   )

-if (CMAKE_SIZEOF_VOID_P EQUAL 8)
-    add_subdirectory(bmi2)
-
-    add_executable(cryptonight_app ${SOURCES}
-        cryptonight.c
-        ../../algo/cryptonight/cryptonight_av1_aesni.c
-        ../../algo/cryptonight/cryptonight_av2_aesni_wolf.c
-        ../../algo/cryptonight/cryptonight_av5_aesni_experimental.c
-    )
-
-    target_link_libraries(cryptonight_app unity cryptonight_av3_aesni_bmi2)
-else()
-    add_executable(cryptonight_app ${SOURCES}
-        cryptonight32.c
-        ../../algo/cryptonight/cryptonight_av1_aesni32.c
-    )
-
-    target_link_libraries(cryptonight_app unity)
-endif()
-
-
+# A single executable now covers every variant listed in SOURCES.
+add_executable(cryptonight_app ${SOURCES})
+target_link_libraries(cryptonight_app PRIVATE unity)

 include_directories(../..)

 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -fno-strict-aliasing")
 set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2")
 add_definitions(-DBUILD_TEST)
+# Scoped to this target: the CryptoNight-Lite sources are not part of this test.
+target_compile_definitions(cryptonight_app PRIVATE XMRIG_NO_AEON)

 add_test(cryptonight_test cryptonight_app)
--- a/test/cryptonight/bmi2/CMakeLists.txt
+++ b/test/cryptonight/bmi2/CMakeLists.txt
@@ -1,3 +0,0 @@
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -mbmi2")
-include_directories(../../..)
-add_library(cryptonight_av3_aesni_bmi2 STATIC ../../../algo/cryptonight/cryptonight_av3_aesni_bmi2.c)
--- a/test/cryptonight/cryptonight.c
+++ b/test/cryptonight/cryptonight.c
@@ -1,146 +1,125 @@
 #include <unity.h>
 #include <stdbool.h>
 #include <stdlib.h>
-#include <algo/cryptonight/cryptonight.h>
+#include <string.h>
+#include <mm_malloc.h>
+
+#include "options.h"
+#include "algo/cryptonight/cryptonight.h"
+
+bool opt_double_hash = false;
+
+const static char input1[152] = {
+    0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00,
+    0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B,
+    0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62,
+    0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92,
+    0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01,
+    0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19,
+    0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9,
+    0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F,
+    0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46,
+    0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02,
+};
+
+const static char input2[] = "This is a test";
+const static char input3[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus pellentesque metus.";


-void cryptonight_av1_aesni(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
-void cryptonight_av2_aesni_wolf(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
-void cryptonight_av3_aesni_bmi2(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
-void cryptonight_av4_legacy(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
-void cryptonight_av5_aesni_experimental(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
+void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);


-char *bin2hex(const unsigned char *p, size_t len)
+static char hash[64];
+#define RESULT1        "1a3ffbee909b420d91f7be6e5fb56db71b3110d886011e877ee5786afd080100"
+#define RESULT1_DOUBLE "1a3ffbee909b420d91f7be6e5fb56db71b3110d886011e877ee5786afd0801001b606a3f4a07d6489a1bcd07697bd16696b61c8ae982f61a90160f4e52828a7f"
+#define RESULT2        "a084f01d1437a09c6985401b60d43554ae105802c5f5d8a9b3253649c0be6605"
+#define RESULT3        "0bbe54bd26caa92a1d436eec71cbef02560062fa689fe14d7efcf42566b411cf"
+
+
+/*
+ * Returns a newly malloc'ed, NUL-terminated lowercase hex string for the len
+ * bytes at p, or NULL if the allocation fails. The caller owns the result.
+ */
+static char *bin2hex(const unsigned char *p, size_t len)
 {
-    int i;
    char *s = malloc((len * 2) + 1);
-    if (!s)
+    if (!s) {
        return NULL;
+    }
+
+    /* Terminate up front so that len == 0 yields a valid empty string. */
+    s[len * 2] = '\0';

-    for (i = 0; i < len; i++)
+    /* size_t index: no signed/unsigned comparison against len. */
+    for (size_t i = 0; i < len; i++) {
        sprintf(s + (i * 2), "%02x", (unsigned int) p[i]);
+    }

    return s;
 }

-bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
-{
-    char hex_byte[3];
-    char *ep;

-    hex_byte[2] = '\0';
+/*
+ * Allocates a 16-byte aligned cryptonight_ctx plus a scratchpad of MEMORY * ratio
+ * bytes. Must be called from inside a Unity test: a failed allocation fails the
+ * running test instead of being dereferenced.
+ */
+static void * create_ctx(int ratio) {
+    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16);
+    TEST_ASSERT_NOT_NULL(ctx);
+
+    ctx->memory = (uint8_t *) _mm_malloc(MEMORY * ratio, 16);
+    TEST_ASSERT_NOT_NULL(ctx->memory);

-    while (*hexstr && len) {
-        if (!hexstr[1]) {
-            return false;
-        }
-        hex_byte[0] = hexstr[0];
-        hex_byte[1] = hexstr[1];
-        *p = (unsigned char) strtol(hex_byte, &ep, 16);
-        if (*ep) {
-            return false;
-        }
-        p++;
-        hexstr += 2;
-        len--;
-    }
+    return ctx;
+}

-    return (len == 0 && *hexstr == 0) ? true : false;
+
+/* Releases a context obtained from create_ctx(): the scratchpad first, then the context that points to it. */
+static void free_ctx(struct cryptonight_ctx *ctx) {
+    _mm_free(ctx->memory);
+    _mm_free(ctx);
 }


 void test_cryptonight_av1_should_CalcHash(void) {
-    char hash[32];
-    char data[76];
+    /* Hashes input1 (first 76 bytes), input2 and input3 with cryptonight_av1_aesni() and compares the hex digests with RESULT1..RESULT3. */
+    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(1);

-    hex2bin((unsigned char *) &data, "0305a0dbd6bf05cf16e503f3a66f78007cbf34144332ecbfc22ed95c8700383b309ace1923a0964b00000008ba939a62724c0d7581fce5761e9d8a0e6a1c3f924fdd8493d1115649c05eb601", 76);
+    cryptonight_av1_aesni(input1, 76, &hash, ctx);
+    char *hex1 = bin2hex((const unsigned char *) hash, 32);

-    uint8_t *memory = (uint8_t *) malloc(MEMORY);
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
+    cryptonight_av1_aesni(input2, strlen(input2), &hash, ctx);
+    char *hex2 = bin2hex((const unsigned char *) hash, 32);

-    cryptonight_av1_aesni(&hash, data, memory, ctx);
+    cryptonight_av1_aesni(input3, strlen(input3), &hash, ctx);
+    char *hex3 = bin2hex((const unsigned char *) hash, 32);

-    free(memory);
-    free(ctx);
-
-    TEST_ASSERT_EQUAL_STRING("1a3ffbee909b420d91f7be6e5fb56db71b3110d886011e877ee5786afd080100", bin2hex(hash, 32));
+    /* Free the context before asserting: a failing Unity assertion aborts the test and skips any cleanup placed after it. */
+    free_ctx(ctx);
+
+    TEST_ASSERT_EQUAL_STRING(RESULT1, hex1);
+    TEST_ASSERT_EQUAL_STRING(RESULT2, hex2);
+    TEST_ASSERT_EQUAL_STRING(RESULT3, hex3);
+
+    free(hex1);
+    free(hex2);
+    free(hex3);
 }


 void test_cryptonight_av2_should_CalcHash(void)
 {
-    char hash[32];
-    char data[76];
+    /* Runs cryptonight_av2_aesni_double() on input1 and compares the 64-byte result, as hex, with RESULT1_DOUBLE. */
+    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(2);

-    hex2bin((unsigned char *) &data, "0305a0dbd6bf05cf16e503f3a66f78007cbf34144332ecbfc22ed95c8700383b309ace1923a0964b00000008ba939a62724c0d7581fce5761e9d8a0e6a1c3f924fdd8493d1115649c05eb601", 76);
+    cryptonight_av2_aesni_double(input1, 76, &hash, ctx);
+    char *hex = bin2hex((const unsigned char *) hash, 64);

-    uint8_t *memory = (uint8_t *) malloc(MEMORY);
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-
-    cryptonight_av2_aesni_wolf(&hash, data, memory, ctx);
-
-    free(memory);
-    free(ctx);
-
-    TEST_ASSERT_EQUAL_STRING("1a3ffbee909b420d91f7be6e5fb56db71b3110d886011e877ee5786afd080100", bin2hex(hash, 32));
+    /* Free the context before asserting: a failing Unity assertion aborts the test and skips any cleanup placed after it. */
+    free_ctx(ctx);
+
+    TEST_ASSERT_EQUAL_STRING(RESULT1_DOUBLE, hex);
+    free(hex);
 }


 void test_cryptonight_av3_should_CalcHash(void)
 {
-    char hash[32];
-    char data[76];
+    /* Hashes input1 (first 76 bytes), input2 and input3 with cryptonight_av3_softaes() and compares the hex digests with RESULT1..RESULT3. */
+    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(1);

-    hex2bin((unsigned char *) &data, "0305a0dbd6bf05cf16e503f3a66f78007cbf34144332ecbfc22ed95c8700383b309ace1923a0964b00000008ba939a62724c0d7581fce5761e9d8a0e6a1c3f924fdd8493d1115649c05eb601", 76);
+    cryptonight_av3_softaes(input1, 76, &hash, ctx);
+    char *hex1 = bin2hex((const unsigned char *) hash, 32);

-    uint8_t *memory = (uint8_t *) malloc(MEMORY);
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
+    cryptonight_av3_softaes(input2, strlen(input2), &hash, ctx);
+    char *hex2 = bin2hex((const unsigned char *) hash, 32);

-    cryptonight_av3_aesni_bmi2(&hash, data, memory, ctx);
+    cryptonight_av3_softaes(input3, strlen(input3), &hash, ctx);
+    char *hex3 = bin2hex((const unsigned char *) hash, 32);

-    free(memory);
-    free(ctx);
-
-    TEST_ASSERT_EQUAL_STRING("1a3ffbee909b420d91f7be6e5fb56db71b3110d886011e877ee5786afd080100", bin2hex(hash, 32));
+    /* Free the context before asserting: a failing Unity assertion aborts the test and skips any cleanup placed after it. */
+    free_ctx(ctx);
+
+    TEST_ASSERT_EQUAL_STRING(RESULT1, hex1);
+    TEST_ASSERT_EQUAL_STRING(RESULT2, hex2);
+    TEST_ASSERT_EQUAL_STRING(RESULT3, hex3);
+
+    free(hex1);
+    free(hex2);
+    free(hex3);
 }


 void test_cryptonight_av4_should_CalcHash(void)
 {
-    char hash[32];
-    char data[76];
+    /* Runs cryptonight_av4_softaes_double() on input1 and compares the 64-byte result, as hex, with RESULT1_DOUBLE. */
+    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(2);

-    hex2bin((unsigned char *) &data, "0305a0dbd6bf05cf16e503f3a66f78007cbf34144332ecbfc22ed95c8700383b309ace1923a0964b00000008ba939a62724c0d7581fce5761e9d8a0e6a1c3f924fdd8493d1115649c05eb601", 76);
+    cryptonight_av4_softaes_double(input1, 76, &hash, ctx);
+    char *hex = bin2hex((const unsigned char *) hash, 64);

-    uint8_t *memory = (uint8_t *) malloc(MEMORY);
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-
-    cryptonight_av4_legacy(&hash, data, memory, ctx);
-
-    free(memory);
-    free(ctx);
-
-    TEST_ASSERT_EQUAL_STRING("1a3ffbee909b420d91f7be6e5fb56db71b3110d886011e877ee5786afd080100", bin2hex(hash, 32));
-}
-
-
-void test_cryptonight_av5_should_CalcHash(void)
-{
-    char hash[32];
-    char data[76];
-
-    hex2bin((unsigned char *) &data, "0305a0dbd6bf05cf16e503f3a66f78007cbf34144332ecbfc22ed95c8700383b309ace1923a0964b00000008ba939a62724c0d7581fce5761e9d8a0e6a1c3f924fdd8493d1115649c05eb601", 76);
-
-    uint8_t *memory = (uint8_t *) malloc(MEMORY);
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-
-    cryptonight_av5_aesni_experimental(&hash, data, memory, ctx);
-
-    free(memory);
-    free(ctx);
-
-    TEST_ASSERT_EQUAL_STRING("1a3ffbee909b420d91f7be6e5fb56db71b3110d886011e877ee5786afd080100", bin2hex(hash, 32));
+    /* Free the context before asserting: a failing Unity assertion aborts the test and skips any cleanup placed after it. */
+    free_ctx(ctx);
+
+    TEST_ASSERT_EQUAL_STRING(RESULT1_DOUBLE, hex);
+    free(hex);
 }


@@ -152,7 +131,6 @@ int main(void)
    RUN_TEST(test_cryptonight_av2_should_CalcHash);
    RUN_TEST(test_cryptonight_av3_should_CalcHash);
    RUN_TEST(test_cryptonight_av4_should_CalcHash);
-    RUN_TEST(test_cryptonight_av5_should_CalcHash);

    return UNITY_END();
 }
--- a/test/cryptonight/cryptonight32.c
+++ b/test/cryptonight/cryptonight32.c
@@ -1,95 +0,0 @@
-#include <unity.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <algo/cryptonight/cryptonight.h>
-
-
-void cryptonight_av1_aesni32(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
-void cryptonight_av4_legacy(void* output, const void* input, const char *memory, struct cryptonight_ctx* ctx);
-
-
-char *bin2hex(const unsigned char *p, size_t len)
-{
-    int i;
-    char *s = malloc((len * 2) + 1);
-    if (!s)
-        return NULL;
-
-    for (i = 0; i < len; i++)
-        sprintf(s + (i * 2), "%02x", (unsigned int) p[i]);
-
-    return s;
-}
-
-bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
-{
-    char hex_byte[3];
-    char *ep;
-
-    hex_byte[2] = '\0';
-
-    while (*hexstr && len) {
-        if (!hexstr[1]) {
-            return false;
-        }
-        hex_byte[0] = hexstr[0];
-        hex_byte[1] = hexstr[1];
-        *p = (unsigned char) strtol(hex_byte, &ep, 16);
-        if (*ep) {
-            return false;
-        }
-        p++;
-        hexstr += 2;
-        len--;
-    }
-
-    return (len == 0 && *hexstr == 0) ? true : false;
-}
-
-
-void test_cryptonight_av1_32_should_CalcHash(void) {
-    char hash[32];
-    char data[76];
-
-    hex2bin((unsigned char *) &data, "0305a0dbd6bf05cf16e503f3a66f78007cbf34144332ecbfc22ed95c8700383b309ace1923a0964b00000008ba939a62724c0d7581fce5761e9d8a0e6a1c3f924fdd8493d1115649c05eb601", 76);
-
-    uint8_t *memory = (uint8_t *) malloc(MEMORY);
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-
-    cryptonight_av1_aesni32(&hash, data, memory, ctx);
-
-    free(memory);
-    free(ctx);
-
-    TEST_ASSERT_EQUAL_STRING("1a3ffbee909b420d91f7be6e5fb56db71b3110d886011e877ee5786afd080100", bin2hex(hash, 32));
-}
-
-
-void test_cryptonight_av4_should_CalcHash(void)
-{
-    char hash[32];
-    char data[76];
-
-    hex2bin((unsigned char *) &data, "0305a0dbd6bf05cf16e503f3a66f78007cbf34144332ecbfc22ed95c8700383b309ace1923a0964b00000008ba939a62724c0d7581fce5761e9d8a0e6a1c3f924fdd8493d1115649c05eb601", 76);
-
-    uint8_t *memory = (uint8_t *) malloc(MEMORY);
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-
-    cryptonight_av4_legacy(&hash, data, memory, ctx);
-
-    free(memory);
-    free(ctx);
-
-    TEST_ASSERT_EQUAL_STRING("1a3ffbee909b420d91f7be6e5fb56db71b3110d886011e877ee5786afd080100", bin2hex(hash, 32));
-}
-
-
-int main(void)
-{
-    UNITY_BEGIN();
-
-    RUN_TEST(test_cryptonight_av1_32_should_CalcHash);
-    RUN_TEST(test_cryptonight_av4_should_CalcHash);
-
-    return UNITY_END();
-}
--- a/test/cryptonight_lite/CMakeLists.txt
+++ b/test/cryptonight_lite/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Builds the CryptoNight-Lite hash test from cryptonight_lite.c plus the
+# shared algorithm and crypto sources it needs.
+add_executable(cryptonight_lite_app
+    cryptonight_lite.c
+    ../../options.h
+    ../../algo/cryptonight/cryptonight.h
+    ../../algo/cryptonight/cryptonight.c
+    ../../algo/cryptonight-lite/cryptonight_lite_av1_aesni.c
+    ../../algo/cryptonight-lite/cryptonight_lite_av2_aesni_double.c
+    ../../algo/cryptonight-lite/cryptonight_lite_av3_softaes.c
+    ../../algo/cryptonight-lite/cryptonight_lite_av4_softaes_double.c
+    ../../crypto/c_keccak.c
+    ../../crypto/c_blake256.c
+    ../../crypto/c_groestl.c
+    ../../crypto/c_jh.c
+    ../../crypto/c_skein.c
+    ../../crypto/soft_aes.c
+   )
+
+# Every setting is attached to the target, so nothing leaks into other
+# targets that may later be added to this directory.
+target_link_libraries(cryptonight_lite_app PRIVATE unity)
+target_include_directories(cryptonight_lite_app PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+target_compile_definitions(cryptonight_lite_app PRIVATE BUILD_TEST)
+target_compile_options(cryptonight_lite_app PRIVATE
+    -maes
+    -fno-strict-aliasing
+    $<$<CONFIG:Release>:-O2>
+   )
+
+# NAME/COMMAND form lets CMake resolve the target to the built executable.
+add_test(NAME cryptonight_lite_test COMMAND cryptonight_lite_app)
--- a/test/cryptonight_lite/cryptonight_lite.c
+++ b/test/cryptonight_lite/cryptonight_lite.c
@@ -0,0 +1,124 @@
+#include <unity.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <mm_malloc.h>
+
+#include "options.h"
+#include "algo/cryptonight/cryptonight.h"
+
+bool opt_double_hash = false;
+enum mining_algo opt_algo = ALGO_CRYPTONIGHT_LITE;
+
+const static char input1[152] = {
+    0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00,
+    0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B,
+    0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62,
+    0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92,
+    0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01,
+    0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19,
+    0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9,
+    0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F,
+    0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46,
+    0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02,
+};
+
+
+/*
+ * Empty stand-ins for the full-size CryptoNight entry points, whose source files
+ * are not in this test's CMake source list.
+ * NOTE(review): presumably required because algo/cryptonight/cryptonight.c, which
+ * is built into this test, references them -- confirm.
+ */
+void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx)          {}
+void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx)   {}
+void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx)        {}
+void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx) {}
+
+void cryptonight_lite_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+void cryptonight_lite_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+void cryptonight_lite_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+void cryptonight_lite_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+
+
+static char hash[64];
+#define RESULT1        "3695b4b53bb00358b0ad38dc160feb9e004eece09b83a72ef6ba9864d3510c88"
+#define RESULT1_DOUBLE "3695b4b53bb00358b0ad38dc160feb9e004eece09b83a72ef6ba9864d3510c8828a22bad3f93d1408fca472eb5ad1cbe75f21d053c8ce5b3af105a57713e21dd"
+
+
+/*
+ * Returns a newly malloc'ed, NUL-terminated lowercase hex string for the len
+ * bytes at p, or NULL if the allocation fails. The caller owns the result.
+ */
+static char *bin2hex(const unsigned char *p, size_t len)
+{
+    char *s = malloc((len * 2) + 1);
+    if (!s) {
+        return NULL;
+    }
+
+    /* Terminate up front so that len == 0 yields a valid empty string. */
+    s[len * 2] = '\0';
+
+    /* size_t index: no signed/unsigned comparison against len. */
+    for (size_t i = 0; i < len; i++) {
+        sprintf(s + (i * 2), "%02x", (unsigned int) p[i]);
+    }
+
+    return s;
+}
+
+
+/*
+ * Allocates a 16-byte aligned cryptonight_ctx plus a scratchpad of
+ * MEMORY_LITE * ratio bytes. Must be called from inside a Unity test: a failed
+ * allocation fails the running test instead of being dereferenced.
+ */
+static void * create_ctx(int ratio) {
+    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16);
+    TEST_ASSERT_NOT_NULL(ctx);
+
+    ctx->memory = (uint8_t *) _mm_malloc(MEMORY_LITE * ratio, 16);
+    TEST_ASSERT_NOT_NULL(ctx->memory);
+
+    return ctx;
+}
+
+
+/* Releases a context obtained from create_ctx(): the scratchpad first, then the context that points to it. */
+static void free_ctx(struct cryptonight_ctx *ctx) {
+    _mm_free(ctx->memory);
+    _mm_free(ctx);
+}
+
+
+/* Runs cryptonight_lite_av1_aesni() on the first 76 bytes of input1 and compares the hex digest with RESULT1. */
+void test_cryptonight_lite_av1_should_CalcHash(void) {
+    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(1);
+
+    cryptonight_lite_av1_aesni(input1, 76, &hash, ctx);
+    char *hex = bin2hex((const unsigned char *) hash, 32);
+
+    /* Free the context before asserting: a failing Unity assertion aborts the test and skips any cleanup placed after it. */
+    free_ctx(ctx);
+
+    TEST_ASSERT_EQUAL_STRING(RESULT1, hex);
+    free(hex);
+}
+
+
+/* Runs cryptonight_lite_av2_aesni_double() on input1 and compares the 64-byte result, as hex, with RESULT1_DOUBLE. */
+void test_cryptonight_lite_av2_should_CalcHash(void)
+{
+    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(2);
+
+    cryptonight_lite_av2_aesni_double(input1, 76, &hash, ctx);
+    char *hex = bin2hex((const unsigned char *) hash, 64);
+
+    /* Free the context before asserting: a failing Unity assertion aborts the test and skips any cleanup placed after it. */
+    free_ctx(ctx);
+
+    TEST_ASSERT_EQUAL_STRING(RESULT1_DOUBLE, hex);
+    free(hex);
+}
+
+
+/* Runs cryptonight_lite_av3_softaes() on the first 76 bytes of input1 and compares the hex digest with RESULT1. */
+void test_cryptonight_lite_av3_should_CalcHash(void) {
+    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(1);
+
+    cryptonight_lite_av3_softaes(input1, 76, &hash, ctx);
+    char *hex = bin2hex((const unsigned char *) hash, 32);
+
+    /* Free the context before asserting: a failing Unity assertion aborts the test and skips any cleanup placed after it. */
+    free_ctx(ctx);
+
+    TEST_ASSERT_EQUAL_STRING(RESULT1, hex);
+    free(hex);
+}
+
+
+/* Runs cryptonight_lite_av4_softaes_double() on input1 and compares the 64-byte result, as hex, with RESULT1_DOUBLE. */
+void test_cryptonight_lite_av4_should_CalcHash(void)
+{
+    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(2);
+
+    cryptonight_lite_av4_softaes_double(input1, 76, &hash, ctx);
+    char *hex = bin2hex((const unsigned char *) hash, 64);
+
+    /* Free the context before asserting: a failing Unity assertion aborts the test and skips any cleanup placed after it. */
+    free_ctx(ctx);
+
+    TEST_ASSERT_EQUAL_STRING(RESULT1_DOUBLE, hex);
+    free(hex);
+}
+
+
+/* Unity entry point: runs the four CryptoNight-Lite hash tests and returns whatever UNITY_END() reports. */
+int main(void)
+{
+    UNITY_BEGIN();
+
+    /* Variants are run in av1..av4 order. */
+    RUN_TEST(test_cryptonight_lite_av1_should_CalcHash);
+    RUN_TEST(test_cryptonight_lite_av2_should_CalcHash);
+    RUN_TEST(test_cryptonight_lite_av3_should_CalcHash);
+    RUN_TEST(test_cryptonight_lite_av4_should_CalcHash);
+
+    const int result = UNITY_END();
+    return result;
+}
--- a/unix/cpu_unix.c
+++ b/unix/cpu_unix.c
@@ -33,24 +33,20 @@ void cpu_init_common();


 void cpu_init() {
-    cpu_info.count = sysconf(_SC_NPROCESSORS_CONF);
+    /* Built without libcpuid: take the logical CPU count from sysconf(). */
+    /* NOTE(review): with libcpuid the count is presumably filled in by cpu_init_common() -- confirm. */
+#   ifdef XMRIG_NO_LIBCPUID
+    cpu_info.total_logical_cpus = sysconf(_SC_NPROCESSORS_CONF);
+#   endif

    cpu_init_common();
 }


-int get_optimal_threads_count() {
-    int count = cpu_info.count / 2;
-    return count < 1 ? 1 : count;
-}
-
-
 int affine_to_cpu_mask(int id, unsigned long mask)
 {
    cpu_set_t set;
    CPU_ZERO(&set);

-    for (unsigned i = 0; i < cpu_info.count; i++) {
+    for (unsigned i = 0; i < cpu_info.total_logical_cpus; i++) {
        if (mask & (1UL << i)) {
            CPU_SET(i, &set);
        }
--- a/unix/memory_unix.c
+++ b/unix/memory_unix.c
@@ -21,9 +21,6 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __MEMORY_H__
-#define __MEMORY_H__
-
 #include <stdlib.h>
 #include <mm_malloc.h>
 #include <sys/mman.h>
@@ -38,13 +35,14 @@ int persistent_memory_flags = 0;


 const char * persistent_memory_allocate() {
-    const int size = TWO_MB_PAGE * (opt_n_threads + 1);
+    const int ratio = (opt_double_hash && opt_algo != ALGO_CRYPTONIGHT_LITE) ? 2 : 1;
+    const int size = MEMORY * (opt_n_threads * ratio + 1);
    persistent_memory_flags |= MEMORY_HUGEPAGES_AVAILABLE;

    persistent_memory = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0);

    if (persistent_memory == MAP_FAILED) {
-        persistent_memory = _mm_malloc(size, 4096);
+        persistent_memory = _mm_malloc(size, 16);
        return persistent_memory;
    }

@@ -63,7 +61,7 @@ const char * persistent_memory_allocate() {


 void persistent_memory_free() {
-    const int size = TWO_MB_PAGE * (opt_n_threads + 1);
+    const int size = MEMORY * (opt_n_threads + 1);

    if (persistent_memory_flags & MEMORY_HUGEPAGES_ENABLED) {
        if (persistent_memory_flags & MEMORY_LOCK) {
@@ -78,4 +76,21 @@ void persistent_memory_free() {
 }


-#endif /* __MEMORY_H__ */
+void *allocate_executable_memory(size_t size) { /* maps `size` bytes of private anonymous read/write/execute memory; returns NULL on failure */
+    void *mem = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    return mem == MAP_FAILED ? NULL : mem; /* mmap reports failure as MAP_FAILED, not NULL, so translate it for callers that test against NULL */
+}
+
+
+void protect_executable_memory(void *p, size_t size) /* sets the protection of the `size` bytes at `p` to read + execute (no write) */
+{
+    mprotect(p, size, PROT_READ | PROT_EXEC); /* the return value is not checked, so a failed mprotect goes unnoticed here */
+}
+
+
+void flush_instruction_cache(void *p, size_t size) /* clears the instruction cache for the byte range [p, p + size) */
+{
+#   ifndef __FreeBSD__ /* the builtin call is compiled out on FreeBSD, leaving this function empty there */
+    __builtin___clear_cache((char*) p, (char*)(p) + size);
+#   endif
+}
--- a/utils/applog.c
+++ b/utils/applog.c
@@ -75,6 +75,11 @@ void applog(int prio, const char *fmt, ...)
                prio = LOG_NOTICE;
                color = CL_CYN;
                break;
+
+            case LOG_GREEN:
+                prio = LOG_NOTICE;
+                color = CL_LGR;
+                break;
        }
    }

@@ -116,7 +121,7 @@ void applog_notime(int prio, const char *fmt, ...)
    if (opt_colors) {
        switch (prio) {
            case LOG_ERR:     color = CL_RED; break;
-            case LOG_WARNING: color = CL_YLW; break;
+            case LOG_WARNING: color = CL_LYL; break;
            case LOG_NOTICE:  color = CL_WHT; break;
            case LOG_INFO:    color = ""; break;
            case LOG_DEBUG:   color = CL_GRY; break;
--- a/utils/applog.h
+++ b/utils/applog.h
@@ -30,7 +30,8 @@ enum {
    LOG_NOTICE,
    LOG_INFO,
    LOG_DEBUG,
-    LOG_BLUE = 0x10
+    LOG_BLUE = 0x10,
+    LOG_GREEN
 };

 #define CL_N    "\x1B[0m"
@@ -57,7 +58,7 @@ enum {
 #endif
 #define CL_LRD  "\x1B[01;31m" /* light red */
 #define CL_LGR  "\x1B[01;32m" /* light green */
-#define CL_YL2  "\x1B[01;33m" /* yellow */
+#define CL_LYL  "\x1B[01;33m" /* light yellow */
 #define CL_LBL  "\x1B[01;34m" /* light blue */
 #define CL_LMA  "\x1B[01;35m" /* light magenta */
 #define CL_LCY  "\x1B[01;36m" /* light cyan */
--- a/utils/summary.c
+++ b/utils/summary.c
@@ -44,25 +44,43 @@ static void print_memory() {
 static void print_cpu() {
    const char *t1 = (cpu_info.flags & CPU_FLAG_X86_64) ? OPT_COLOR(CL_LGR, "x86_64") : OPT_COLOR(CL_LRD, "-x86_64");
    const char *t2 = (cpu_info.flags & CPU_FLAG_AES)    ? OPT_COLOR(CL_LGR, "AES-NI") : OPT_COLOR(CL_LRD, "-AES-NI");
-    const char *t3 = (cpu_info.flags & CPU_FLAG_BMI2)   ? OPT_COLOR(CL_LGR, "BMI2")   : OPT_COLOR(CL_LRD, "-BMI2");

    if (opt_colors) {
-        applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "CPU:          %s", cpu_info.brand);
-        applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "CPU FEATURES: %s %s %s", t1, t2, t3);
+        applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "CPU:          %s (%d)", cpu_info.brand, cpu_info.sockets); /* brand string, then cpu_info.sockets in parentheses */
    }
    else {
-        applog_notime(LOG_INFO, " * CPU:          %s", cpu_info.brand);
-        applog_notime(LOG_INFO, " * CPU FEATURES: %s %s %s", t1, t2, t3);
+        applog_notime(LOG_INFO, " * CPU:          %s (%d)", cpu_info.brand, cpu_info.sockets);
+    }
+
+ #   ifndef XMRIG_NO_LIBCPUID /* the L2/L3 line is printed only in builds that do not define XMRIG_NO_LIBCPUID */
+    if (opt_colors) {
+        applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "CPU L2/L3:    %.1f MB/%.1f MB", cpu_info.l2_cache / 1024.0, cpu_info.l3_cache / 1024.0); /* divided by 1024.0 for display as MB; presumably stored in KB -- TODO confirm */
+    }
+    else {
+        applog_notime(LOG_INFO, " * CPU L2/L3:    %.1f MB/%.1f MB", cpu_info.l2_cache / 1024.0, cpu_info.l3_cache / 1024.0);
+    }
+ #  endif /* XMRIG_NO_LIBCPUID */
+
+    if (opt_colors) {
+        applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "CPU FEATURES: %s %s", t1, t2); /* t1/t2 carry the x86_64 and AES-NI labels; a leading '-' marks a missing feature */
+    }
+    else {
+        applog_notime(LOG_INFO, " * CPU FEATURES: %s %s", t1, t2);
    }
 }


 static void print_threads() {
+    const char *extra = ""; /* suffix appended to the THREADS line; stays empty unless opt_nicehash is set */
+    if (opt_nicehash) {
+        extra = ", nicehash";
+    }
+
    if (opt_colors) {
-        applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "THREADS:      " CL_WHT "%d" CL_WHT ", av=%d, donate=%d%%", opt_n_threads, opt_algo_variant, opt_donate_level);
+        applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "THREADS:      " CL_WHT "%d" CL_WHT ", av=%d, %s/%s, donate=%d%%%s", opt_n_threads, opt_av, get_current_algo_name(), get_current_variant_name(), opt_donate_level, extra); /* "%%" prints a literal percent sign; the final %s is the optional suffix */
    }
    else {
-        applog_notime(LOG_INFO, " * THREADS:      %d, av=%d, donate=%d%%", opt_n_threads, opt_algo_variant, opt_donate_level);
+        applog_notime(LOG_INFO, " * THREADS:      %d, av=%d, %s/%s, donate=%d%%%s", opt_n_threads, opt_av, get_current_algo_name(), get_current_variant_name(), opt_donate_level, extra);
    }
 }

--- a/version.h
+++ b/version.h
@@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@@ -21,19 +22,20 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __VERSION_H__
-#define __VERSION_H__
+#ifndef XMRIG_VERSION_H
+#define XMRIG_VERSION_H

 #define APP_ID        "xmrig"
 #define APP_NAME      "XMRig"
-#define APP_VERSION   "0.5.0"
+#define APP_DESC      "Monero (XMR) CPU miner"
+#define APP_VERSION   "0.10.0-dev"
 #define APP_DOMAIN    "xmrig.com"
 #define APP_SITE      "www.xmrig.com"
-#define APP_COPYRIGHT "Copyright (C) 2016-2017 xmrig.com"
+#define APP_COPYRIGHT "Copyright (C) 2016-2019 xmrig.com"

 #define APP_VER_MAJOR  0
-#define APP_VER_MINOR  5
+#define APP_VER_MINOR  10
 #define APP_VER_BUILD  0
 #define APP_VER_REV    0

-#endif /* __VERSION_H__ */
+#endif /* XMRIG_VERSION_H */
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
XMRig	d92c1a54de	Fixed macOS build.	2019-03-05 01:07:01 +07:00
XMRig	aa474fa51b	Fix compile warnings.	2019-03-05 00:49:04 +07:00
XMRig	7976059367	Add renaming ASM codes & update from upstream.	2019-03-05 00:41:01 +07:00
XMRig	c5cbd9d8fe	cn/r ASM support for --av 1.	2019-03-04 19:25:59 +07:00
XMRig	ef2e8bed6e	Use new style method to call ASM functions for cn/2 & added bulldozer ASM code.	2019-03-04 13:31:25 +07:00
XMRig	7574bfab60	Added self test for cn/r.	2019-03-04 11:52:38 +07:00
XMRig	27980f24f8	Plain C "cn/r" implementation.	2019-03-03 20:19:17 +07:00
XMRig	5e6a69e16f	Prepare for cn/r.	2019-03-03 14:09:00 +07:00
XMRig	69513e7049	Merge branch 'classic' into classic-dev	2019-03-03 12:05:14 +07:00
XMRig	b834c50aba	Merge branch 'classic-dev' into classic	2018-10-05 16:23:49 +03:00
xmrig	302ebe5a5b	Update CHANGELOG.md	2018-10-05 16:22:16 +03:00
XMRig	b9096f2392	Disable donation.	2018-10-05 16:01:22 +03:00
XMRig	b02f4ff163	Autodetect ASM without libcpuid.	2018-10-05 15:58:33 +03:00
XMRig	11748fad78	Add ASM code.	2018-10-05 15:02:52 +03:00
XMRig	e0dc51edf9	Fixed build without cn-lite.	2018-10-04 22:12:33 +03:00
XMRig	779238fc85	Add support for new style algorithm names.	2018-10-04 22:06:08 +03:00
XMRig	a06a224c0a	Implement --variant option.	2018-10-04 20:27:29 +03:00
XMRig	bf2eb1a685	Fix misaligned access.	2018-10-04 20:11:47 +03:00
XMRig	0bba8849f0	Fix Linux build.	2018-10-04 20:00:18 +03:00
XMRig	1e22a984af	Add double hash cn/2.	2018-10-04 19:25:09 +03:00
XMRig	61b49137c7	Add single hash cn/2.	2018-10-04 18:03:00 +03:00
XMRig	93d072ff6e	Massive refactoring, preparing for cn/2.	2018-10-04 15:52:12 +03:00
XMRig	f0b293f650	Add support for "nicehash" protocol extension.	2018-10-03 01:27:45 +03:00
XMRig	b93e7d9daa	Workaround for xmrig-proxy bug.	2018-10-03 00:41:14 +03:00
XMRig	0b4b07fcd6	v0.9.0-dev	2018-10-03 00:39:45 +03:00
XMRig	af62621169	Fix CURL detection.	2018-10-02 23:58:53 +03:00
XMRig	ed7260449a	v0.8.3	2018-03-11 21:24:55 +07:00
XMRig	33944595a2	Add Monero v7 support.	2018-03-11 21:23:14 +07:00
XMRig	9dc02fc7f3	Fix for `-a cryptonight-light`.	2017-06-06 03:34:49 +03:00
XMRig	6551818610	Update libjansson to 2.10.	2017-06-06 03:31:44 +03:00
Admin	7741c341c7	Huge pages support on OS X.	2017-05-27 10:34:42 +03:00
XMRig	8a70202a98	Fix.	2017-05-27 08:45:11 +03:00
Admin	bc2b7d1895	Initial OS X support.	2017-05-26 23:17:12 +03:00
xmrig	ebb0f81f2f	Update CHANGELOG.md	2017-05-26 09:07:45 +03:00
XMRig	eb3e2b8868	Fix gcc7 support.	2017-05-25 09:32:39 +03:00
XMRig	583d892eb5	Workaround for AMD CPUs https://github.com/anrieff/libcpuid/issues/97	2017-05-24 18:50:24 +03:00
XMRig	b145f14ad8	Merge branch 'dev'	2017-05-20 23:53:16 +03:00
xmrig	ce19edf36c	Update CHANGELOG.md	2017-05-20 23:48:53 +03:00
xmrig	108fd5690e	Update README.md	2017-05-20 23:43:39 +03:00
XMRig	c19fe3cea7	Add "--nicehash" to help output.	2017-05-20 23:38:05 +03:00
XMRig	187c7680cc	Show errors from pool.	2017-05-20 23:27:22 +03:00
XMRig	20061e1b8b	Autodetect nicehash by url	2017-05-20 09:31:02 +03:00
XMRig	2baccab0f9	Initial test nicehash support.	2017-05-20 07:08:41 +03:00
XMRig	44782befea	Fix 32 bit build.	2017-05-16 17:04:27 +03:00
XMRig	5b7a1bc6dc	Merge branch 'dev'	2017-05-15 22:10:45 +03:00
XMRig	e67a95bd8b	Version increment and update help.	2017-05-15 22:06:54 +03:00
xmrig	88dd218ad8	Update README.md	2017-05-15 19:57:20 +03:00
xmrig	ee9ba778f8	Update README.md	2017-05-15 18:32:27 +03:00
xmrig	6080f292e7	Update CHANGELOG.md	2017-05-13 22:39:40 +03:00
xmrig	cf8f81f5fa	Update CHANGELOG.md	2017-05-13 22:32:29 +03:00
xmrig	aab48fde96	Update README.md	2017-05-13 20:31:27 +03:00
xmrig	bf25b4e5d4	Update README.md	2017-05-13 20:26:35 +03:00
XMRig	0c2bda9aa5	Remove default url.	2017-05-13 19:47:12 +03:00
XMRig	d71a15e8da	Use --safe options to disable AES algo variations if CPU not support it.	2017-05-12 15:04:04 +03:00
XMRig	c4bccf410b	* Implement --max-cpu-usage. * Fix L2 cache size detect. * Add test for get_optimal_threads_count.	2017-05-10 19:38:35 +03:00
XMRig	719601f92b	Add test for cryptonight lite.	2017-05-10 15:31:29 +03:00
XMRig	ff7be00f6f	Fix test.	2017-05-10 15:06:01 +03:00
XMRig	d3b0038bda	Add optional CryptoNight-Lite support.	2017-05-10 12:58:52 +03:00
XMRig	3b46f5eb64	Remove BMI2 av.	2017-05-08 23:28:39 +03:00
XMRig	03dbb85c82	Update test values.	2017-05-08 23:06:00 +03:00
XMRig	a2574e1b1b	Added message if huge pages was enabled, but reboot required.	2017-05-08 21:41:27 +03:00
XMRig	15b4244ea8	Added --max-cpu-usage and --safe stub.	2017-05-08 10:29:25 +03:00
XMRig	0dcf127c26	Version increment.	2017-05-06 09:44:50 +03:00
xmrig	9964952c92	Update CHANGELOG.md	2017-05-05 19:54:31 +03:00
XMRig	90648771c0	Fix 32bit build.	2017-05-05 19:51:53 +03:00
XMRig	985adcbc13	No more manual steps to enable huge pages on Windows. XMRig will do it automatically.	2017-05-05 15:49:38 +03:00
XMRig	16f3338e42	Fix crash when use Keepalived.	2017-05-05 10:48:56 +03:00
XMRig	2650545916	Code cleanup.	2017-05-04 14:18:14 +03:00
xmrig	c107547c6c	Update CHANGELOG.md	2017-05-03 15:16:46 +03:00
XMRig	60f7f93408	Merge branch 'feature-libcpuid'	2017-05-03 15:07:09 +03:00
XMRig	dfbfde5b22	Fix Linux build.	2017-05-03 15:03:33 +03:00
XMRig	0c752ee018	Use libcpuid as internal dependence.	2017-05-03 14:36:42 +03:00
XMRig	f329410940	Use libcpuid for detect optimal threads count.	2017-05-03 13:48:08 +03:00
XMRig	0a6d70c499	Add optional libcpuid support to cmake.	2017-05-03 10:53:51 +03:00
XMRig	1678dc1d6d	Implement low power mode (double hash).	2017-05-01 03:49:05 +03:00
XMRig	caf7cda1d5	Backport changes from xmrig-aeon.	2017-04-30 02:56:47 +03:00
XMRig	3de7983826	Fix for donate level.	2017-04-26 18:05:04 +03:00
XMRig	8dda8d293b	Update README.md.	2017-04-25 03:35:03 +03:00
XMRig	e71e9486c6	Remove conflicting declaration for _mulx_u64.	2017-04-25 03:20:32 +03:00
XMRig	b35ecef06f	Move common code to cryptonight_p.h	2017-04-24 13:23:49 +03:00
XMRig	454c78cf0a	Fix const.	2017-04-23 23:56:47 +03:00
XMRig	c97693cd51	Merge branch 'master' of github.com:xmrig/xmrig	2017-04-22 18:08:44 +03:00
XMRig	d855ae2e36	Merge branch 'bug-128tx-exploit'	2017-04-22 18:08:15 +03:00
XMRig	42d2ab18ee	Update tests.	2017-04-22 17:12:50 +03:00
XMRig	97a8d448c0	Pass blob size to cryptonight_hash_ctx.	2017-04-22 15:34:05 +03:00
XMRig	54cef68aa9	Optimize job_decode, support variable length blob and redume mutex lock time.	2017-04-22 13:19:33 +03:00
xmrig	3492670839	Merge pull request #3 from esfomeado/patch-1 More detailed instructions to build on Windows	2017-04-21 18:39:04 +03:00
xmrig	c43c667fed	Update CHANGELOG.md	2017-04-21 18:31:58 +03:00
XMRig	361394be21	Add automatic self test.	2017-04-21 17:29:03 +03:00
XMRig	8235ae0fa6	Add 32 bit support for software AES too.	2017-04-21 15:47:11 +03:00
XMRig	ac89023a79	Add support for 32 bit.	2017-04-21 15:20:08 +03:00
XMRig	f92b5ed9f6	Merge branch 'master' of github.com:xmrig/xmrig	2017-04-21 13:06:29 +03:00
XMRig	7ce21d458a	Version increment.	2017-04-21 13:06:13 +03:00
xmrig	5513fab59b	Update README.md	2017-04-21 12:34:59 +03:00
XMRig	5e6560cb07	Fix affinity for single thread mode.	2017-04-21 12:13:49 +03:00
Esfomeado	25d76626c1	More detailed instructions to build on Windows	2017-04-21 10:08:04 +01:00
XMRig	cad15069c8	Revert back BMI2 support.	2017-04-21 12:05:28 +03:00
XMRig	8ab4c1c8bd	Add memory to cryptonight_ctx.	2017-04-21 11:56:11 +03:00
XMRig	f29d05bdde	Simplify cryptonight_ctx.	2017-04-21 11:14:27 +03:00
XMRig	1474d3fe53	Rename algo variants again, should be final numbers.	2017-04-21 10:40:11 +03:00
XMRig	d2fd43ca03	Change algo variant numbers.	2017-04-21 09:20:19 +03:00
XMRig	95f48fd058	Add app.rc and app.ico for Windows.	2017-04-19 20:52:00 +03:00
XMRig	f8bda3a6b3	Add CHANGELOG.md.	2017-04-19 10:20:04 +03:00
XMRig	21c243ed8f	Much better software AES implementation (--av 4).	2017-04-19 10:03:40 +03:00
XMRig	1013aa5004	Update av1/av6	2017-04-19 07:58:42 +03:00
XMRig	44875b0a94	Fix test.	2017-04-18 17:40:19 +03:00
XMRig	b1f1474438	Merge branch 'feature-xmr-stak-algo'	2017-04-18 16:04:50 +03:00
XMRig	4eb7e5bbfd	Fix stak algo as --av 5, experimental algo now --av 6	2017-04-18 15:57:44 +03:00
XMRig	d874ede49e	Fix.	2017-04-18 13:14:09 +03:00
XMRig	add10c829c	No templates in C :(	2017-04-18 13:10:40 +03:00
XMRig	4acfb213b8	Add xmr-stak-cpu algo as experimental, use --av=5.	2017-04-18 12:06:46 +03:00
xmrig	78a4b9de0f	Update README.md	2017-04-17 05:44:49 +03:00
xmrig	9fe2bbcd81	Update README.md	2017-04-16 16:37:12 +03:00
xmrig	adb778de8a	Update README.md	2017-04-15 10:52:08 +03:00