diff --git a/3rdparty/json-c/CMakeLists.txt b/3rdparty/json-c/CMakeLists.txt
index 63f3053fa00a1ae07251dff47c12e991b087f48e..087c1f19bbd2e2854807660f25f0f5be4633145b 100644
--- a/3rdparty/json-c/CMakeLists.txt
+++ b/3rdparty/json-c/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (json-c)
   
 file(GLOB JSON_C_SRCS FILES *.c)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 46fe6516fbdeb4711135751e6b75cf836317687f..2b587186661e77bbb81594532117e12a18d6435a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 project(cellframe-sdk C)
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 
 set(CMAKE_C_STANDARD 11)
 set(CELLFRAME_SDK_NATIVE_VERSION "2.8-19")
@@ -36,7 +36,6 @@ if (ANDROID)
     add_subdirectory(3rdparty/libmagic)
     add_subdirectory(3rdparty/json-c)
     include_directories(cellframe-sdk/3rdparty/)
-
 endif()
 add_subdirectory(modules/)
 
diff --git a/cmake/OS_Detection.cmake b/cmake/OS_Detection.cmake
index 31648a6311e6d6b50add61f0425d049a293eaaff..62eadd1f34d022dfdbd029db873b9435dfb0da4a 100644
--- a/cmake/OS_Detection.cmake
+++ b/cmake/OS_Detection.cmake
@@ -45,25 +45,44 @@ message(STATUS "[*] Building for a ${ARCH_WIDTH}-bit system")
 if(UNIX)
     add_definitions ("-DDAP_OS_UNIX")
     if (APPLE)
-        add_definitions ("-DDAP_OS_DARWIN -DDARWIN")
+        add_definitions ("-DDAP_OS_DARWIN -DDARWIN -DDAP_OS_BSD")
         set(DARWIN ON)
-    else()
-        add_definitions ("-DDAP_OS_LINUX")
+    endif()
+    
+    if (${CMAKE_SYSTEM_NAME} MATCHES "BSD" )
+        add_definitions ("-DDAP_OS_BSD")
+        set(BSD ON)
     endif()
 
-    add_definitions ("-DDAP_OS_LINUX -DDAP_OS_UNIX")
-    # add_definitions ("-DDAP_LOG_MT")
-    if(DAP_DEBUG)
-      set(_CCOPT "-DDAP_DEBUG -Wall -Wno-deprecated-declarations -Wno-unused-local-typedefs -Wno-unused-function -Wno-implicit-fallthrough -Wno-unused-variable -Wno-unused-parameter -Wno-unused-but-set-variable -pg -g3 -ggdb -fno-eliminate-unused-debug-symbols -fno-strict-aliasing")
-      set(_LOPT "-pg")
-      SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -pg")
-    else()
-        set(_CCOPT "-Wno-deprecated-declarations -Wno-unused-local-typedefs -Wno-unused-function -Wno-implicit-fallthrough -Wno-unused-variable -Wno-unused-parameter -Wno-unused-but-set-variable -O3 -fPIC -fno-strict-aliasing -fno-ident -ffast-math -ftree-vectorize -fno-asynchronous-unwind-tables -ffunction-sections -Wl,--gc-sections -Wl,--strip-all -std=gnu11")
+    if (${CMAKE_SYSTEM_NAME} MATCHES "Linux" )
+        add_definitions ("-DDAP_OS_LINUX")
+        set(LINUX ON)  # CMake predefines LINUX only since 3.25; older versions need it set for the checks below
+    endif()
+    if (LINUX)  # GCC-style compile/link options for Linux builds
+        if(DAP_DEBUG)
+            set(_CCOPT "-DDAP_DEBUG -Wall -Wno-deprecated-declarations -Wno-unused-local-typedefs -Wno-unused-function -Wno-implicit-fallthrough -Wno-unused-variable -Wno-unused-parameter -Wno-unused-but-set-variable -pg -g3 -ggdb -fno-eliminate-unused-debug-symbols -fno-strict-aliasing")
+            set(_LOPT "-pg")
+            set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -pg")
+        else()
+            set(_CCOPT "-Wno-deprecated-declarations -Wno-unused-local-typedefs -Wno-unused-function -Wno-implicit-fallthrough -Wno-unused-variable -Wno-unused-parameter -Wno-unused-but-set-variable -O3 -fPIC -fno-strict-aliasing -fno-ident -ffast-math -ftree-vectorize -fno-asynchronous-unwind-tables -ffunction-sections -Wl,--gc-sections -Wl,--strip-all -std=gnu11")
+        endif()
+    endif()
+    if (BSD)  # BSD builds additionally search /usr/local for headers and libraries
+        if(DAP_DEBUG)
+            set(_CCOPT "-I/usr/local/include -DDAP_DEBUG -Wall -Wno-deprecated-declarations -Wno-unused-local-typedefs -Wno-unused-function -Wno-implicit-fallthrough -Wno-unused-variable -Wno-unused-parameter -pg -g3 -ggdb -fno-eliminate-unused-debug-symbols -fno-strict-aliasing")
+            set(_LOPT "-pg -L/usr/local/lib ")
+            set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -pg -L/usr/local/lib")  # same library path as the release branch
+        else()
+            set(_CCOPT "-I/usr/local/include -Wno-deprecated-declarations -Wno-unused-local-typedefs -Wno-unused-function -Wno-implicit-fallthrough -Wno-unused-variable -Wno-unused-parameter -O3 -fPIC -fno-strict-aliasing -fno-ident -ffast-math -ftree-vectorize -fno-asynchronous-unwind-tables -ffunction-sections -std=gnu11")
+            set(_LOPT "-L/usr/local/lib ")
+            set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L/usr/local/lib")
+        endif()
     endif()
 
     if (ANDROID)
         set(_CCOPT "${_CCOPT} -fforce-enable-int128 -std=gnu11")
         add_definitions ("-DDAP_OS_ANDROID")
+        add_definitions ("-DDAP_OS_LINUX")
     endif()
 
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${_CCOPT}")
diff --git a/dap-sdk/core/include/dap_common.h b/dap-sdk/core/include/dap_common.h
index 4af92ecc78798a0bdb12c0d486bc4bd7e44eb297..ffdb565ef9668107148cd160b08ad994446e7a1f 100755
--- a/dap-sdk/core/include/dap_common.h
+++ b/dap-sdk/core/include/dap_common.h
@@ -483,6 +483,11 @@ void dap_usleep(time_t a_microseconds);
  */
 char* dap_ctime_r(time_t *a_time, char* a_buf);
 
+/* Copy n bytes from a_src to a_dest and return the address one past the last byte written to a_dest (stands in for mempcpy()). */
+static inline void * dap_mempcpy(void * a_dest,const void * a_src,size_t n)
+{
+    return ((byte_t*) memcpy(a_dest,a_src,n))+n;
+}
 
 
 #ifdef __MINGW32__
diff --git a/dap-sdk/core/include/portable_endian.h b/dap-sdk/core/include/portable_endian.h
index 31c0809d913194de891b8498dc9a1a4d03fc7b5b..48134cb7ebd0a9e47150467a6512e4ea4bb019dc 100644
--- a/dap-sdk/core/include/portable_endian.h
+++ b/dap-sdk/core/include/portable_endian.h
@@ -50,14 +50,29 @@
 
 # include <sys/endian.h>
 
+#if !defined(be16toh)  /* keep any be16toh macro already defined by this point, e.g. by <sys/endian.h> included above */
 # define be16toh(x) betoh16(x)
+#endif
+
+#if !defined(le16toh)
 # define le16toh(x) letoh16(x)
+#endif
 
+#if !defined(be32toh)
 # define be32toh(x) betoh32(x)
+#endif
+
+#if !defined(le32toh)
 # define le32toh(x) letoh32(x)
+#endif
 
+#if !defined(be64toh)
 # define be64toh(x) betoh64(x)
+#endif
+
+#if !defined(le64toh)
 # define le64toh(x) letoh64(x)
+#endif
 
 #elif defined(__WINDOWS__)
 
diff --git a/dap-sdk/core/src/dap_fnmatch.c b/dap-sdk/core/src/dap_fnmatch.c
index f89fa536eff2959625980c269d6dd60e5f42d784..f9d70717c69c34dd25c8488d2b7c42e35b11cb49 100644
--- a/dap-sdk/core/src/dap_fnmatch.c
+++ b/dap-sdk/core/src/dap_fnmatch.c
@@ -21,6 +21,8 @@
 #include <string.h>
 #include <stdlib.h>
 #include <stdbool.h>
+
+#include "dap_common.h"
 #include "dap_fnmatch.h"
 
 
@@ -179,7 +181,7 @@ __wcschrnul (const wchar_t *s, wint_t c)
 # endif
 # define STRLEN(S) strlen(S)
 # define STRCAT(D, S) strcat (D, S)
-# define MEMPCPY(D, S, N) mempcpy (D, S, N)
+# define MEMPCPY(D, S, N) dap_mempcpy (D, S, N)
 # define MEMCHR(S, C, N) memchr (S, C, N)
 # define STRCOLL(S1, S2) strcoll (S1, S2)
 # define WIDE_CHAR_VERSION 0
diff --git a/dap-sdk/core/src/unix/dap_process_manager.c b/dap-sdk/core/src/unix/dap_process_manager.c
index e2d825977374cc7f46056d74677ddada73aef97f..2a853e6ee0535f69908632ba783538546f6720ae 100755
--- a/dap-sdk/core/src/unix/dap_process_manager.c
+++ b/dap-sdk/core/src/unix/dap_process_manager.c
@@ -1,4 +1,4 @@
-#ifdef __linux__
+#ifdef DAP_OS_UNIX
 #include <stdio.h>
 #include <sys/types.h>
 #include <signal.h>
diff --git a/dap-sdk/core/src/unix/dap_process_manager.h b/dap-sdk/core/src/unix/dap_process_manager.h
index 29fb0ddcc86e6bbb67acda0bd9f5dda65f3ccb39..e703c8b834ca503d24df0e21d150cbedc5fcfac2 100755
--- a/dap-sdk/core/src/unix/dap_process_manager.h
+++ b/dap-sdk/core/src/unix/dap_process_manager.h
@@ -28,7 +28,7 @@
 extern "C" {
 #endif
 
-#ifdef __linux__
+#ifdef DAP_OS_UNIX
 
 #include <stdbool.h>
 #include <unistd.h>
diff --git a/dap-sdk/crypto/CMakeLists.txt b/dap-sdk/crypto/CMakeLists.txt
index 9e9660289ae127aa4ba19f97814b7a57531a4e62..1896bd9b0cc761639b00564fe2b710050f52b593 100755
--- a/dap-sdk/crypto/CMakeLists.txt
+++ b/dap-sdk/crypto/CMakeLists.txt
@@ -64,13 +64,34 @@ if(WIN32)
 endif()
 
 if(UNIX)
-  if(BUILD_64)
-    file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s )
-  else()
-    file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c )
-  endif()
+    # Select the KeccakP-1600 backend per platform; the 32-bit in-place C code is the fallback everywhere.
+    if (LINUX OR CMAKE_SYSTEM_NAME MATCHES "Linux")  # LINUX is predefined only by CMake >= 3.25
+        if(BUILD_64)
+            file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s )
+        else()
+            file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c )
+        endif()
+    elseif(APPLE)
+        if(BUILD_64)
+            file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas_Apple.s )
+        else()
+            file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c )
+        endif()
+    elseif(BSD)
+        if(BUILD_64)
+            file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Compact64/KeccakP-1600-compact64.c )
+        else()
+            file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c )
+        endif()
+    else()
+        file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c )
+    endif()
 endif()
 
+
+
+
+
 add_library(${PROJECT_NAME} STATIC ${CRYPTO_SRCS} ${XKCP_SRCS} ${XKCP_SRCS2} ${CRYPTO_HEADERS} )
 
 target_include_directories(dap_crypto PRIVATE src/seed src/rand src/iaes src/oaes src/sha3 src/msrln src/defeo_scheme src/sig_bliss src/sig_tesla src/sig_picnic src/sig_dilithium src include)
diff --git a/dap-sdk/crypto/include/dap_crypto_common.h b/dap-sdk/crypto/include/dap_crypto_common.h
index f213209536ab6d98176a59213278a2a295729aec..67a3db6a949c4018aac6630995fb1c421a5db67f 100755
--- a/dap-sdk/crypto/include/dap_crypto_common.h
+++ b/dap-sdk/crypto/include/dap_crypto_common.h
@@ -34,11 +34,14 @@ extern "C" {
     #define OS_TARGET OS_LINUX
 #elif defined(__APPLE__)         // MACOS
     #define OS_TARGET OS_MACOS
+#elif defined (DAP_OS_BSD)       // BSD family (DAP_OS_BSD comes from the build system)
+    #define OS_TARGET OS_BSD     // NOTE(review): OS_BSD must be defined next to OS_LINUX/OS_MACOS - confirm
 #else
     #error -- "Unsupported OS"
 #endif
 
 
+
 // Definition of compiler
 
 #define COMPILER_VC      1
diff --git a/dap-sdk/crypto/src/GOST/block_cipher.c b/dap-sdk/crypto/src/GOST/block_cipher.c
index 93175427f0e22cb99207cc8cf1fec9060090b65b..921d5ae9f781e8d689db7ae3c818e84c7bd6efa8 100644
--- a/dap-sdk/crypto/src/GOST/block_cipher.c
+++ b/dap-sdk/crypto/src/GOST/block_cipher.c
@@ -8,8 +8,6 @@
 #include <memory.h>
 #ifdef __MACH__
 #include <sys/malloc.h>
-#else
-#include <malloc.h>
 #endif
 #include <stdlib.h>
 
diff --git a/dap-sdk/crypto/src/msrln/AMD64/consts.c b/dap-sdk/crypto/src/msrln/AMD64/consts.c
index 3ff24cbb008e951a92acad8dd670056eb9d20455..9d45871ba478a07c8b6133652c9df80ef2946ea0 100755
--- a/dap-sdk/crypto/src/msrln/AMD64/consts.c
+++ b/dap-sdk/crypto/src/msrln/AMD64/consts.c
@@ -1,40 +1,40 @@
-/****************************************************************************************
-* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
-*
-*    Copyright (c) Microsoft Corporation. All rights reserved.
-*
-*
-* Abstract: constants for the x64 assembly implementation
-*
-*****************************************************************************************/
-
-#include "../LatticeCrypto_priv.h"
-#include <stdint.h>
-
-
-uint32_t PRIME8x[8]      = {PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q};
-uint8_t ONE32x[32]       = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
-uint32_t MASK12x8[8]     = {0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff};
-uint32_t PERM0246[4]     = {0,2,4,6};
-uint32_t PERM00224466[8] = {0,0,2,2,4,4,6,6};
-uint32_t PERM02134657[8] = {0,2,1,3,4,6,5,7};
-uint64_t PERM0145[4]     = {0,1,4,5};
-uint64_t PERM2367[4]     = {2,3,6,7};
-uint64_t MASK32[4]       = {0xffffffff,0,0xffffffff,0};
-uint64_t MASK42[4]       = {0x3fff0000000,0,0x3fff0000000,0};
-
-uint64_t MASK14_1[4]     = {0x3fff,0,0x3fff,0};
-uint64_t MASK14_2[4]     = {0xFFFC000,0,0xFFFC000,0};
-uint64_t MASK14_3[4]     = {0x3FFF0000000,0,0x3FFF0000000,0};
-uint64_t MASK14_4[4]     = {0xFFFC0000000000,0,0xFFFC0000000000,0};
-
-uint32_t ONE8x[8]        = {1,1,1,1,1,1,1,1};
-uint32_t THREE8x[8]      = {3,3,3,3,3,3,3,3};
-uint32_t FOUR8x[8]       = {4,4,4,4,4,4,4,4};
-uint32_t PARAM_Q4x8[8]   = {3073,3073,3073,3073,3073,3073,3073,3073};
-uint32_t PARAM_3Q4x8[8]  = {9217,9217,9217,9217,9217,9217,9217,9217};
-uint32_t PARAM_5Q4x8[8]  = {15362,15362,15362,15362,15362,15362,15362,15362};
-uint32_t PARAM_7Q4x8[8]  = {21506,21506,21506,21506,21506,21506,21506,21506};
-uint32_t PARAM_Q2x8[8]   = {6145,6145,6145,6145,6145,6145,6145,6145};
-uint32_t PARAM_3Q2x8[8]  = {18434,18434,18434,18434,18434,18434,18434,18434};
-
+/****************************************************************************************
+* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
+*
+*    Copyright (c) Microsoft Corporation. All rights reserved.
+*
+*
+* Abstract: constants for the x64 assembly implementation
+*
+*****************************************************************************************/
+
+#include "../LatticeCrypto_priv.h"
+#include <stdint.h>
+
+
+uint32_t PRIME8x[8]      = {PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q};
+uint8_t ONE32x[32]       = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
+uint32_t MASK12x8[8]     = {0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff};
+uint32_t PERM0246[4]     = {0,2,4,6};
+uint32_t PERM00224466[8] = {0,0,2,2,4,4,6,6};
+uint32_t PERM02134657[8] = {0,2,1,3,4,6,5,7};
+uint64_t PERM0145[4]     = {0,1,4,5};
+uint64_t PERM2367[4]     = {2,3,6,7};
+uint64_t MASK32[4]       = {0xffffffff,0,0xffffffff,0};
+uint64_t MASK42[4]       = {0x3fff0000000,0,0x3fff0000000,0};
+
+uint64_t MASK14_1[4]     = {0x3fff,0,0x3fff,0};
+uint64_t MASK14_2[4]     = {0xFFFC000,0,0xFFFC000,0};
+uint64_t MASK14_3[4]     = {0x3FFF0000000,0,0x3FFF0000000,0};
+uint64_t MASK14_4[4]     = {0xFFFC0000000000,0,0xFFFC0000000000,0};
+
+uint32_t ONE8x[8]        = {1,1,1,1,1,1,1,1};
+uint32_t THREE8x[8]      = {3,3,3,3,3,3,3,3};
+uint32_t FOUR8x[8]       = {4,4,4,4,4,4,4,4};
+uint32_t PARAM_Q4x8[8]   = {3073,3073,3073,3073,3073,3073,3073,3073};
+uint32_t PARAM_3Q4x8[8]  = {9217,9217,9217,9217,9217,9217,9217,9217};
+uint32_t PARAM_5Q4x8[8]  = {15362,15362,15362,15362,15362,15362,15362,15362};
+uint32_t PARAM_7Q4x8[8]  = {21506,21506,21506,21506,21506,21506,21506,21506};
+uint32_t PARAM_Q2x8[8]   = {6145,6145,6145,6145,6145,6145,6145,6145};
+uint32_t PARAM_3Q2x8[8]  = {18434,18434,18434,18434,18434,18434,18434,18434};
+
diff --git a/dap-sdk/crypto/src/msrln/AMD64/error_asm.S b/dap-sdk/crypto/src/msrln/AMD64/error_asm.S
index 828816af045a8646212b5818a39cc6b7884a447f..836e47d8d74c02c78cfdc03611b64c9820c9e659 100755
--- a/dap-sdk/crypto/src/msrln/AMD64/error_asm.S
+++ b/dap-sdk/crypto/src/msrln/AMD64/error_asm.S
@@ -1,436 +1,436 @@
-//****************************************************************************************
-// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
-//
-//    Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-// Abstract: functions for error sampling and reconciliation in x64 assembly using AVX2 
-//           vector instructions for Linux 
-//
-//****************************************************************************************  
-
-.intel_syntax noprefix 
-
-// Registers that are used for parameter passing:
-#define reg_p1  rdi
-#define reg_p2  rsi
-#define reg_p3  rdx
-#define reg_p4  rcx
-#define reg_p5  r8
-
-
-.text
-//***********************************************************************
-//  Error sampling from psi_12
-//  Operation: c [reg_p2] <- sampling(a) [reg_p1]
-//*********************************************************************** 
-.global error_sampling_asm
-error_sampling_asm:  
-  vmovdqu    ymm7, ONE32x 
-  movq       r11, 384
-  movq       r10, 32
-  movq       r8, 24
-  xor        rax, rax
-  xor        rcx, rcx
-loop1:
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]        // sample
-  vmovdqu    ymm2, YMMWORD PTR [reg_p1+4*rax+32]     // sample
-  vmovdqu    ymm4, YMMWORD PTR [reg_p1+4*rax+64]     // sample
-  movq       r9, 2
-
-loop1b:
-  vpand      ymm1, ymm0, ymm7                        // Collecting 8 bits for first sample
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  
-  vpand      ymm3, ymm2, ymm7                        // Adding next 4 bits
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm2, ymm2, 1 
-  vpand      ymm3, ymm2, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm2, ymm2, 1 
-  vpand      ymm3, ymm2, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm2, ymm2, 1 
-  vpand      ymm3, ymm2, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  
-  vpsrlw     ymm2, ymm2, 1                           // Collecting 4-bits for second sample
-  vpand      ymm5, ymm2, ymm7
-  vpsrlw     ymm2, ymm2, 1 
-  vpand      ymm3, ymm2, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm2, ymm2, 1 
-  vpand      ymm3, ymm2, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm2, ymm2, 1 
-  vpand      ymm3, ymm2, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  
-  vpand      ymm3, ymm4, ymm7                        // Adding next 8 bits
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-
-  vpsubb     ymm5, ymm1, ymm5
-  vpermq     ymm3, ymm5, 0x0e 
-  vpmovsxbd  ymm6, xmm5
-  vpsrldq    ymm5, ymm5, 8 
-  vpmovsxbd  ymm7, xmm5 
-  vpmovsxbd  ymm8, xmm3
-  vpsrldq    ymm3, ymm3, 8 
-  vpmovsxbd  ymm9, xmm3
-  vmovdqu    YMMWORD PTR [reg_p2+4*rcx], ymm6
-  vmovdqu    YMMWORD PTR [reg_p2+4*rcx+32], ymm7
-  vmovdqu    YMMWORD PTR [reg_p2+4*rcx+64], ymm8
-  vmovdqu    YMMWORD PTR [reg_p2+4*rcx+96], ymm9
-  
-  add        rcx, r10        // i+32
-  vpsrlw     ymm0, ymm0, 1 
-  vpsrlw     ymm2, ymm2, 1 
-  vpsrlw     ymm4, ymm4, 1 
-  dec        r9
-  jnz        loop1b
-        
-  add        rax, r8         // j+24        
-  cmp        rax, r11
-  jl         loop1
-  ret
-
-
-//***********************************************************************
-//  Reconciliation helper function
-//  Operation: c [reg_p2] <- function(a) [reg_p1]
-//             [reg_p3] points to random bits
-//*********************************************************************** 
-.global helprec_asm
-helprec_asm:  
-  vmovdqu    ymm8, ONE8x 
-  movq       r11, 256
-  movq       r10, 8
-  xor        rax, rax
-  vmovdqu    ymm4, YMMWORD PTR [reg_p3]              // rbits
-loop2:
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]        // x
-  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*rax+4*256]  // x+256
-  vmovdqu    ymm2, YMMWORD PTR [reg_p1+4*rax+4*512]  // x+512
-  vmovdqu    ymm3, YMMWORD PTR [reg_p1+4*rax+4*768]  // x+768
-
-  vpand      ymm5, ymm4, ymm8                        // Collecting 8 random bits
-  vpslld     ymm0, ymm0, 1                           // 2*x - rbits
-  vpslld     ymm1, ymm1, 1 
-  vpslld     ymm2, ymm2, 1 
-  vpslld     ymm3, ymm3, 1 
-  vpsubd     ymm0, ymm0, ymm5
-  vpsubd     ymm1, ymm1, ymm5
-  vpsubd     ymm2, ymm2, ymm5
-  vpsubd     ymm3, ymm3, ymm5
-    
-  vmovdqu    ymm15, PARAM_Q4x8 
-  vmovdqu    ymm7, FOUR8x
-  vmovdqu    ymm8, ymm7
-  vmovdqu    ymm9, ymm7
-  vmovdqu    ymm10, ymm7
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm7, ymm7, ymm6
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm8, ymm8, ymm6
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm9, ymm9, ymm6
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm10, ymm10, ymm6
-  vmovdqu    ymm15, PARAM_3Q4x8 
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm7, ymm7, ymm6
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm8, ymm8, ymm6
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm9, ymm9, ymm6
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm10, ymm10, ymm6
-  vmovdqu    ymm15, PARAM_5Q4x8 
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm7, ymm7, ymm6
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm8, ymm8, ymm6
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm9, ymm9, ymm6
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm10, ymm10, ymm6
-  vmovdqu    ymm15, PARAM_7Q4x8 
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm7, ymm7, ymm6                        // v0[0]
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm8, ymm8, ymm6                        // v0[1]
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm9, ymm9, ymm6                        // v0[2]
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm10, ymm10, ymm6                      // v0[3]  
-    
-  vmovdqu    ymm15, PARAM_Q2x8 
-  vmovdqu    ymm11, THREE8x
-  vmovdqu    ymm12, ymm11
-  vmovdqu    ymm13, ymm11
-  vmovdqu    ymm14, ymm11
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm11, ymm11, ymm6
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm12, ymm12, ymm6
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm13, ymm13, ymm6
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm14, ymm14, ymm6
-  vmovdqu    ymm15, PARAM_3Q2x8 
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm11, ymm11, ymm6
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm12, ymm12, ymm6
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm13, ymm13, ymm6
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm14, ymm14, ymm6
-  vmovdqu    ymm15, PRIME8x  
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm11, ymm11, ymm6                      // v1[0]
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm12, ymm12, ymm6                      // v1[1]
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm13, ymm13, ymm6                      // v1[2]
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm14, ymm14, ymm6                      // v1[3]
-
-  vpmulld    ymm6, ymm7, ymm15 
-  vpslld     ymm0, ymm0, 1 
-  vpsubd     ymm0, ymm0, ymm6
-  vpabsd     ymm0, ymm0
-  vpmulld    ymm6, ymm8, ymm15 
-  vpslld     ymm1, ymm1, 1 
-  vpsubd     ymm1, ymm1, ymm6
-  vpabsd     ymm1, ymm1
-  vpaddd     ymm0, ymm0, ymm1
-  vpmulld    ymm6, ymm9, ymm15 
-  vpslld     ymm2, ymm2, 1 
-  vpsubd     ymm2, ymm2, ymm6
-  vpabsd     ymm2, ymm2
-  vpaddd     ymm0, ymm0, ymm2
-  vpmulld    ymm6, ymm10, ymm15 
-  vpslld     ymm3, ymm3, 1 
-  vpsubd     ymm3, ymm3, ymm6
-  vpabsd     ymm3, ymm3
-  vpaddd     ymm0, ymm0, ymm3                        // norm
-  vpsubd     ymm0, ymm0, ymm15
-  vpsrad     ymm0, ymm0, 31                          // If norm < q then norm = 0xff...ff, else norm = 0
-  
-  vpxor      ymm7, ymm7, ymm11                       // v0[i] = (norm & (v0[i] ^ v1[i])) ^ v1[i]
-  vpand      ymm7, ymm7, ymm0
-  vpxor      ymm7, ymm7, ymm11
-  vpxor      ymm8, ymm8, ymm12
-  vpand      ymm8, ymm8, ymm0
-  vpxor      ymm8, ymm8, ymm12
-  vpxor      ymm9, ymm9, ymm13
-  vpand      ymm9, ymm9, ymm0
-  vpxor      ymm9, ymm9, ymm13
-  vpxor      ymm10, ymm10, ymm14
-  vpand      ymm10, ymm10, ymm0
-  vpxor      ymm10, ymm10, ymm14
-  
-  vmovdqu    ymm15, THREE8x
-  vmovdqu    ymm14, ONE8x
-  vpsubd     ymm7, ymm7, ymm10
-  vpand      ymm7, ymm7, ymm15
-  vpsubd     ymm8, ymm8, ymm10
-  vpand      ymm8, ymm8, ymm15
-  vpsubd     ymm9, ymm9, ymm10
-  vpand      ymm9, ymm9, ymm15 
-  vpslld     ymm10, ymm10, 1 
-  vpxor      ymm0, ymm0, ymm14
-  vpand      ymm0, ymm0, ymm14
-  vpaddd     ymm10, ymm0, ymm10
-  vpand      ymm10, ymm10, ymm15 
-  
-  vpsrld     ymm4, ymm4, 1 
-  vmovdqu    YMMWORD PTR [reg_p2+4*rax], ymm7
-  vmovdqu    YMMWORD PTR [reg_p2+4*rax+4*256], ymm8
-  vmovdqu    YMMWORD PTR [reg_p2+4*rax+4*512], ymm9
-  vmovdqu    YMMWORD PTR [reg_p2+4*rax+4*768], ymm10
-
-  add        rax, r10             // j+8 
-  add        rcx, r9
-  cmp        rax, r11             
-  jl         loop2
-  ret
-
-
-//***********************************************************************
-//  Reconciliation function
-//  Operation: c [reg_p3] <- function(a [reg_p1], b [reg_p2])
-//*********************************************************************** 
-.global rec_asm
-rec_asm:  
-  vpxor      ymm12, ymm12, ymm12 
-  vmovdqu    ymm15, PRIME8x   
-  vpslld     ymm14, ymm15, 2                         // 4*Q  
-  vpslld     ymm13, ymm15, 3                         // 8*Q
-  vpsubd     ymm12, ymm12, ymm13                     // -8*Q
-  vpxor      ymm11, ymm12, ymm13                     // 8*Q ^ -8*Q
-  vmovdqu    ymm10, ONE8x 
-  movq       r11, 256
-  movq       r10, 8
-  xor        rax, rax
-  xor        rcx, rcx
-loop3:
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]        // x
-  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*rax+4*256]  // x+256
-  vmovdqu    ymm2, YMMWORD PTR [reg_p1+4*rax+4*512]  // x+512
-  vmovdqu    ymm3, YMMWORD PTR [reg_p1+4*rax+4*768]  // x+768
-  vmovdqu    ymm4, YMMWORD PTR [reg_p2+4*rax]        // rvec
-  vmovdqu    ymm5, YMMWORD PTR [reg_p2+4*rax+4*256]  // rvec+256
-  vmovdqu    ymm6, YMMWORD PTR [reg_p2+4*rax+4*512]  // rvec+512
-  vmovdqu    ymm7, YMMWORD PTR [reg_p2+4*rax+4*768]  // rvec+768
-  
-  vpslld     ymm8, ymm4, 1                           // 2*rvec + rvec
-  vpaddd     ymm4, ymm7, ymm8
-  vpslld     ymm8, ymm5, 1 
-  vpaddd     ymm5, ymm7, ymm8
-  vpslld     ymm8, ymm6, 1 
-  vpaddd     ymm6, ymm7, ymm8
-  vpmulld    ymm4, ymm4, ymm15
-  vpmulld    ymm5, ymm5, ymm15
-  vpmulld    ymm6, ymm6, ymm15
-  vpmulld    ymm7, ymm7, ymm15
-  vpslld     ymm0, ymm0, 3                           // 8*x
-  vpslld     ymm1, ymm1, 3 
-  vpslld     ymm2, ymm2, 3 
-  vpslld     ymm3, ymm3, 3 
-  vpsubd     ymm0, ymm0, ymm4                        // t[i]
-  vpsubd     ymm1, ymm1, ymm5
-  vpsubd     ymm2, ymm2, ymm6
-  vpsubd     ymm3, ymm3, ymm7
-  
-  vpsrad     ymm8, ymm0, 31                          // mask1
-  vpabsd     ymm4, ymm0
-  vpsubd     ymm4, ymm14, ymm4
-  vpsrad     ymm4, ymm4, 31                          // mask2                       
-  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
-  vpxor      ymm8, ymm8, ymm12
-  vpand      ymm4, ymm4, ymm8
-  vpaddd     ymm0, ymm0, ymm4
-  vpabsd     ymm0, ymm0  
-  vpsrad     ymm8, ymm1, 31                          // mask1
-  vpabsd     ymm4, ymm1
-  vpsubd     ymm4, ymm14, ymm4
-  vpsrad     ymm4, ymm4, 31                          // mask2                       
-  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
-  vpxor      ymm8, ymm8, ymm12
-  vpand      ymm4, ymm4, ymm8
-  vpaddd     ymm1, ymm1, ymm4
-  vpabsd     ymm1, ymm1
-  vpaddd     ymm0, ymm0, ymm1
-  vpsrad     ymm8, ymm2, 31                          // mask1
-  vpabsd     ymm4, ymm2
-  vpsubd     ymm4, ymm14, ymm4
-  vpsrad     ymm4, ymm4, 31                          // mask2                       
-  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
-  vpxor      ymm8, ymm8, ymm12
-  vpand      ymm4, ymm4, ymm8
-  vpaddd     ymm2, ymm2, ymm4
-  vpabsd     ymm2, ymm2
-  vpaddd     ymm0, ymm0, ymm2
-  vpsrad     ymm8, ymm3, 31                          // mask1
-  vpabsd     ymm4, ymm3
-  vpsubd     ymm4, ymm14, ymm4
-  vpsrad     ymm4, ymm4, 31                          // mask2                       
-  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
-  vpxor      ymm8, ymm8, ymm12
-  vpand      ymm4, ymm4, ymm8
-  vpaddd     ymm3, ymm3, ymm4
-  vpabsd     ymm3, ymm3
-  vpaddd     ymm0, ymm0, ymm3                        // norm
-
-  vpsubd     ymm0, ymm13, ymm0                       // If norm < PARAMETER_Q then result = 1, else result = 0
-  vpsrld     ymm0, ymm0, 31                            
-  vpxor      ymm0, ymm0, ymm10
-
-  vpsrlq     ymm1, ymm0, 31
-  vpor       ymm1, ymm0, ymm1 
-  vpsllq     ymm2, ymm1, 2
-  vpsrldq    ymm2, ymm2, 8
-  vpor       ymm1, ymm2, ymm1 
-  vpsllq     ymm2, ymm1, 4
-  vpermq     ymm2, ymm2, 0x56
-  vpor       ymm0, ymm1, ymm2 
-  vmovq      r9, xmm0
-  
-  mov        BYTE PTR [reg_p3+rcx], r9b
-
-  add        rax, r10             // j+8 
-  inc        rcx
-  cmp        rax, r11             
-  jl         loop3
-  ret
+//****************************************************************************************
+// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
+//
+//    Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+// Abstract: functions for error sampling and reconciliation in x64 assembly using AVX2 
+//           vector instructions for Linux 
+//
+//****************************************************************************************  
+
+.intel_syntax noprefix 
+
+// Registers that are used for parameter passing:
+#define reg_p1  rdi
+#define reg_p2  rsi
+#define reg_p3  rdx
+#define reg_p4  rcx
+#define reg_p5  r8
+
+
+.text
+//***********************************************************************
+//  Error sampling from psi_12
+//  Operation: c [reg_p2] <- sampling(a) [reg_p1]
+//*********************************************************************** 
+.global error_sampling_asm
+error_sampling_asm:  
+  vmovdqu    ymm7, ONE32x                            // Bit-extraction mask; must stay intact across all passes of loop1b
+  movq       r11, 384                                // Loop bound for the input index j (counted in dwords)
+  movq       r10, 32                                 // Output step: 32 dwords are written per inner pass
+  movq       r8, 24                                  // Input step: 24 dwords (96 bytes) are read per outer iteration
+  xor        rax, rax                                // j = 0 (input index)
+  xor        rcx, rcx                                // i = 0 (output index)
+loop1:
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]        // sample
+  vmovdqu    ymm2, YMMWORD PTR [reg_p1+4*rax+32]     // sample
+  vmovdqu    ymm4, YMMWORD PTR [reg_p1+4*rax+64]     // sample
+  movq       r9, 2                                   // Two inner passes per 96-byte load
+
+loop1b:
+  vpand      ymm1, ymm0, ymm7                        // Collecting 8 bits for first sample
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  
+  vpand      ymm3, ymm2, ymm7                        // Adding next 4 bits
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm2, ymm2, 1 
+  vpand      ymm3, ymm2, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm2, ymm2, 1 
+  vpand      ymm3, ymm2, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm2, ymm2, 1 
+  vpand      ymm3, ymm2, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  
+  vpsrlw     ymm2, ymm2, 1                           // Collecting 4-bits for second sample
+  vpand      ymm5, ymm2, ymm7
+  vpsrlw     ymm2, ymm2, 1 
+  vpand      ymm3, ymm2, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm2, ymm2, 1 
+  vpand      ymm3, ymm2, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm2, ymm2, 1 
+  vpand      ymm3, ymm2, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  
+  vpand      ymm3, ymm4, ymm7                        // Adding next 8 bits
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+
+  vpsubb     ymm5, ymm1, ymm5                        // Per-byte difference of the two bit counts
+  vpermq     ymm3, ymm5, 0x0e                        // Low 128 bits of ymm3 <- high 128 bits of ymm5
+  vpmovsxbd  ymm6, xmm5                              // Sign-extend bytes 0-7 to dwords
+  vpsrldq    ymm5, ymm5, 8 
+  vpmovsxbd  ymm10, xmm5                             // Bytes 8-15; ymm10 is used so the mask in ymm7 is not clobbered
+  vpmovsxbd  ymm8, xmm3                              // Bytes 16-23
+  vpsrldq    ymm3, ymm3, 8 
+  vpmovsxbd  ymm9, xmm3                              // Bytes 24-31
+  vmovdqu    YMMWORD PTR [reg_p2+4*rcx], ymm6
+  vmovdqu    YMMWORD PTR [reg_p2+4*rcx+32], ymm10
+  vmovdqu    YMMWORD PTR [reg_p2+4*rcx+64], ymm8
+  vmovdqu    YMMWORD PTR [reg_p2+4*rcx+96], ymm9
+  
+  add        rcx, r10        // i+32
+  vpsrlw     ymm0, ymm0, 1 
+  vpsrlw     ymm2, ymm2, 1 
+  vpsrlw     ymm4, ymm4, 1 
+  dec        r9
+  jnz        loop1b
+        
+  add        rax, r8         // j+24        
+  cmp        rax, r11
+  jl         loop1
+  ret
+
+
+//***********************************************************************
+//  Reconciliation helper function
+//  Operation: c [reg_p2] <- function(a) [reg_p1]
+//             [reg_p3] points to random bits
+//*********************************************************************** 
+.global helprec_asm
+helprec_asm:  
+  vmovdqu    ymm8, ONE8x                             // Mask for the current random bit (ymm8 is reused below, so it is reloaded before looping)
+  movq       r11, 256                                // Loop bound for j
+  movq       r10, 8                                  // j step: 8 dwords per iteration
+  xor        rax, rax                                // j = 0
+  vmovdqu    ymm4, YMMWORD PTR [reg_p3]              // rbits
+loop2:
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]        // x
+  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*rax+4*256]  // x+256
+  vmovdqu    ymm2, YMMWORD PTR [reg_p1+4*rax+4*512]  // x+512
+  vmovdqu    ymm3, YMMWORD PTR [reg_p1+4*rax+4*768]  // x+768
+
+  vpand      ymm5, ymm4, ymm8                        // Collecting 8 random bits
+  vpslld     ymm0, ymm0, 1                           // 2*x - rbits
+  vpslld     ymm1, ymm1, 1 
+  vpslld     ymm2, ymm2, 1 
+  vpslld     ymm3, ymm3, 1 
+  vpsubd     ymm0, ymm0, ymm5
+  vpsubd     ymm1, ymm1, ymm5
+  vpsubd     ymm2, ymm2, ymm5
+  vpsubd     ymm3, ymm3, ymm5
+    
+  vmovdqu    ymm15, PARAM_Q4x8                       // v0 counters start at FOUR8x; each compare subtracts 1 when value < threshold
+  vmovdqu    ymm7, FOUR8x
+  vmovdqu    ymm8, ymm7                              // From here on ymm8 holds v0[1], no longer the bit mask
+  vmovdqu    ymm9, ymm7
+  vmovdqu    ymm10, ymm7
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm7, ymm7, ymm6
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm8, ymm8, ymm6
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm9, ymm9, ymm6
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm10, ymm10, ymm6
+  vmovdqu    ymm15, PARAM_3Q4x8 
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm7, ymm7, ymm6
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm8, ymm8, ymm6
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm9, ymm9, ymm6
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm10, ymm10, ymm6
+  vmovdqu    ymm15, PARAM_5Q4x8 
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm7, ymm7, ymm6
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm8, ymm8, ymm6
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm9, ymm9, ymm6
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm10, ymm10, ymm6
+  vmovdqu    ymm15, PARAM_7Q4x8 
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm7, ymm7, ymm6                        // v0[0]
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm8, ymm8, ymm6                        // v0[1]
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm9, ymm9, ymm6                        // v0[2]
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm10, ymm10, ymm6                      // v0[3]  
+    
+  vmovdqu    ymm15, PARAM_Q2x8                       // v1 counters start at THREE8x; same compare-and-subtract scheme
+  vmovdqu    ymm11, THREE8x
+  vmovdqu    ymm12, ymm11
+  vmovdqu    ymm13, ymm11
+  vmovdqu    ymm14, ymm11
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm11, ymm11, ymm6
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm12, ymm12, ymm6
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm13, ymm13, ymm6
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm14, ymm14, ymm6
+  vmovdqu    ymm15, PARAM_3Q2x8 
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm11, ymm11, ymm6
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm12, ymm12, ymm6
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm13, ymm13, ymm6
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm14, ymm14, ymm6
+  vmovdqu    ymm15, PRIME8x  
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm11, ymm11, ymm6                      // v1[0]
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm12, ymm12, ymm6                      // v1[1]
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm13, ymm13, ymm6                      // v1[2]
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm14, ymm14, ymm6                      // v1[3]
+
+  vpmulld    ymm6, ymm7, ymm15                       // v0[0] * PRIME8x (ymm15 still holds PRIME8x)
+  vpslld     ymm0, ymm0, 1 
+  vpsubd     ymm0, ymm0, ymm6
+  vpabsd     ymm0, ymm0
+  vpmulld    ymm6, ymm8, ymm15 
+  vpslld     ymm1, ymm1, 1 
+  vpsubd     ymm1, ymm1, ymm6
+  vpabsd     ymm1, ymm1
+  vpaddd     ymm0, ymm0, ymm1
+  vpmulld    ymm6, ymm9, ymm15 
+  vpslld     ymm2, ymm2, 1 
+  vpsubd     ymm2, ymm2, ymm6
+  vpabsd     ymm2, ymm2
+  vpaddd     ymm0, ymm0, ymm2
+  vpmulld    ymm6, ymm10, ymm15 
+  vpslld     ymm3, ymm3, 1 
+  vpsubd     ymm3, ymm3, ymm6
+  vpabsd     ymm3, ymm3
+  vpaddd     ymm0, ymm0, ymm3                        // norm
+  vpsubd     ymm0, ymm0, ymm15
+  vpsrad     ymm0, ymm0, 31                          // If norm < q then norm = 0xff...ff, else norm = 0
+  
+  vpxor      ymm7, ymm7, ymm11                       // v0[i] = (norm & (v0[i] ^ v1[i])) ^ v1[i]
+  vpand      ymm7, ymm7, ymm0
+  vpxor      ymm7, ymm7, ymm11
+  vpxor      ymm8, ymm8, ymm12
+  vpand      ymm8, ymm8, ymm0
+  vpxor      ymm8, ymm8, ymm12
+  vpxor      ymm9, ymm9, ymm13
+  vpand      ymm9, ymm9, ymm0
+  vpxor      ymm9, ymm9, ymm13
+  vpxor      ymm10, ymm10, ymm14
+  vpand      ymm10, ymm10, ymm0
+  vpxor      ymm10, ymm10, ymm14
+  
+  vmovdqu    ymm15, THREE8x
+  vmovdqu    ymm14, ONE8x
+  vpsubd     ymm7, ymm7, ymm10                       // (v0[0] - v0[3]) & 3
+  vpand      ymm7, ymm7, ymm15
+  vpsubd     ymm8, ymm8, ymm10                       // (v0[1] - v0[3]) & 3
+  vpand      ymm8, ymm8, ymm15
+  vpsubd     ymm9, ymm9, ymm10                       // (v0[2] - v0[3]) & 3
+  vpand      ymm9, ymm9, ymm15 
+  vpslld     ymm10, ymm10, 1                         // 2*v0[3]
+  vpxor      ymm0, ymm0, ymm14                       // (norm mask ^ 1) & 1: 1 when the mask was 0, else 0
+  vpand      ymm0, ymm0, ymm14
+  vpaddd     ymm10, ymm0, ymm10
+  vpand      ymm10, ymm10, ymm15 
+  
+  vpsrld     ymm4, ymm4, 1                           // Advance to the next random bit of every dword
+  vmovdqu    YMMWORD PTR [reg_p2+4*rax], ymm7
+  vmovdqu    YMMWORD PTR [reg_p2+4*rax+4*256], ymm8
+  vmovdqu    YMMWORD PTR [reg_p2+4*rax+4*512], ymm9
+  vmovdqu    YMMWORD PTR [reg_p2+4*rax+4*768], ymm10
+
+  add        rax, r10             // j+8 
+  vmovdqu    ymm8, ONE8x          // Restore the random-bit mask (ymm8 was reused for v0[1]); does not affect flags
+  cmp        rax, r11             
+  jl         loop2
+  ret
+
+
+//***********************************************************************
+//  Reconciliation function
+//  Operation: c [reg_p3] <- function(a [reg_p1], b [reg_p2])
+//*********************************************************************** 
+.global rec_asm
+rec_asm:  
+  vpxor      ymm12, ymm12, ymm12                     // ymm12 = 0
+  vmovdqu    ymm15, PRIME8x                          // Q (the "4*Q" / "8*Q" comments below are relative to this constant)
+  vpslld     ymm14, ymm15, 2                         // 4*Q  
+  vpslld     ymm13, ymm15, 3                         // 8*Q
+  vpsubd     ymm12, ymm12, ymm13                     // -8*Q
+  vpxor      ymm11, ymm12, ymm13                     // 8*Q ^ -8*Q
+  vmovdqu    ymm10, ONE8x                            // Used to invert the result bit at the end of each iteration
+  movq       r11, 256                                // Loop bound for j
+  movq       r10, 8                                  // j step: 8 dwords per iteration
+  xor        rax, rax                                // j = 0
+  xor        rcx, rcx                                // Output byte index = 0
+loop3:
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]        // x
+  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*rax+4*256]  // x+256
+  vmovdqu    ymm2, YMMWORD PTR [reg_p1+4*rax+4*512]  // x+512
+  vmovdqu    ymm3, YMMWORD PTR [reg_p1+4*rax+4*768]  // x+768
+  vmovdqu    ymm4, YMMWORD PTR [reg_p2+4*rax]        // rvec
+  vmovdqu    ymm5, YMMWORD PTR [reg_p2+4*rax+4*256]  // rvec+256
+  vmovdqu    ymm6, YMMWORD PTR [reg_p2+4*rax+4*512]  // rvec+512
+  vmovdqu    ymm7, YMMWORD PTR [reg_p2+4*rax+4*768]  // rvec+768
+  
+  vpslld     ymm8, ymm4, 1                           // 2*(rvec) + (rvec+768), same for rvec+256 and rvec+512
+  vpaddd     ymm4, ymm7, ymm8
+  vpslld     ymm8, ymm5, 1 
+  vpaddd     ymm5, ymm7, ymm8
+  vpslld     ymm8, ymm6, 1 
+  vpaddd     ymm6, ymm7, ymm8
+  vpmulld    ymm4, ymm4, ymm15                       // Multiply all four terms by Q (ymm15)
+  vpmulld    ymm5, ymm5, ymm15
+  vpmulld    ymm6, ymm6, ymm15
+  vpmulld    ymm7, ymm7, ymm15
+  vpslld     ymm0, ymm0, 3                           // 8*x
+  vpslld     ymm1, ymm1, 3 
+  vpslld     ymm2, ymm2, 3 
+  vpslld     ymm3, ymm3, 3 
+  vpsubd     ymm0, ymm0, ymm4                        // t[i]
+  vpsubd     ymm1, ymm1, ymm5
+  vpsubd     ymm2, ymm2, ymm6
+  vpsubd     ymm3, ymm3, ymm7
+  
+  vpsrad     ymm8, ymm0, 31                          // mask1
+  vpabsd     ymm4, ymm0
+  vpsubd     ymm4, ymm14, ymm4
+  vpsrad     ymm4, ymm4, 31                          // mask2                       
+  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
+  vpxor      ymm8, ymm8, ymm12
+  vpand      ymm4, ymm4, ymm8
+  vpaddd     ymm0, ymm0, ymm4
+  vpabsd     ymm0, ymm0  
+  vpsrad     ymm8, ymm1, 31                          // mask1
+  vpabsd     ymm4, ymm1
+  vpsubd     ymm4, ymm14, ymm4
+  vpsrad     ymm4, ymm4, 31                          // mask2                       
+  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
+  vpxor      ymm8, ymm8, ymm12
+  vpand      ymm4, ymm4, ymm8
+  vpaddd     ymm1, ymm1, ymm4
+  vpabsd     ymm1, ymm1
+  vpaddd     ymm0, ymm0, ymm1
+  vpsrad     ymm8, ymm2, 31                          // mask1
+  vpabsd     ymm4, ymm2
+  vpsubd     ymm4, ymm14, ymm4
+  vpsrad     ymm4, ymm4, 31                          // mask2                       
+  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
+  vpxor      ymm8, ymm8, ymm12
+  vpand      ymm4, ymm4, ymm8
+  vpaddd     ymm2, ymm2, ymm4
+  vpabsd     ymm2, ymm2
+  vpaddd     ymm0, ymm0, ymm2
+  vpsrad     ymm8, ymm3, 31                          // mask1
+  vpabsd     ymm4, ymm3
+  vpsubd     ymm4, ymm14, ymm4
+  vpsrad     ymm4, ymm4, 31                          // mask2                       
+  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
+  vpxor      ymm8, ymm8, ymm12
+  vpand      ymm4, ymm4, ymm8
+  vpaddd     ymm3, ymm3, ymm4
+  vpabsd     ymm3, ymm3
+  vpaddd     ymm0, ymm0, ymm3                        // norm
+
+  vpsubd     ymm0, ymm13, ymm0                       // 8*Q - norm: result = 1 when this is non-negative (norm <= 8*Q), else 0
+  vpsrld     ymm0, ymm0, 31                          // Sign bit of (8*Q - norm) in bit 0 of each dword
+  vpxor      ymm0, ymm0, ymm10                       // Invert it with ONE8x to get the result bit
+
+  vpsrlq     ymm1, ymm0, 31                          // Pack the 8 result bits (one per dword) into a single byte:
+  vpor       ymm1, ymm0, ymm1                        //   each qword now has the bits of its two dwords in bits 0-1
+  vpsllq     ymm2, ymm1, 2
+  vpsrldq    ymm2, ymm2, 8                           //   bring the high qword of each 128-bit lane down, shifted by 2
+  vpor       ymm1, ymm2, ymm1                        //   low qword of each lane now has 4 bits in bits 0-3
+  vpsllq     ymm2, ymm1, 4
+  vpermq     ymm2, ymm2, 0x56                        //   qword 0 <- qword 2 (upper lane), already shifted by 4
+  vpor       ymm0, ymm1, ymm2                        //   low byte of qword 0 now holds all 8 bits
+  vmovq      r9, xmm0
+  
+  mov        BYTE PTR [reg_p3+rcx], r9b              // Store one packed byte per iteration
+
+  add        rax, r10             // j+8 
+  inc        rcx                  // Next output byte
+  cmp        rax, r11             
+  jl         loop3
+  ret
diff --git a/dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c b/dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c
index ef846a484156c153358367e00781ed20cd7d0968..d39e95e779e7cc59b68ac0d63e38cecbd9ca813c 100755
--- a/dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c
+++ b/dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c
@@ -1,65 +1,65 @@
-/****************************************************************************************
-* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
-*
-*    Copyright (c) Microsoft Corporation. All rights reserved.
-*
-*
-* Abstract: NTT functions and other low-level operations
-*
-*****************************************************************************************/
-
-#include "../LatticeCrypto_priv.h"
-    
-
-void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N)
-{
-    NTT_CT_std2rev_12289_asm(a, psi_rev, N);
-}
-
-
-void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N)
-{
-    INTT_GS_rev2std_12289_asm(a, omegainv_rev, omegainv1N_rev, Ninv, N);
-}
-
-
-void two_reduce12289(int32_t* a, unsigned int N)
-{
-    two_reduce12289_asm(a, N);
-}
-
-
-void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N)
-{
-    pmul_asm(a, b, c, N);
-}
-
-
-void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N)
-{
-    pmuladd_asm(a, b, c, d, N);
-}
-
-
-void smul(int32_t* a, int32_t scalar, unsigned int N)
-{
-    unsigned int i; 
-
-    for (i = 0; i < N; i++) {
-        a[i] = a[i]*scalar;
-    }
-}
-
-
-void correction(int32_t* a, int32_t p, unsigned int N)
-{  
-    unsigned int i; 
-    int32_t mask;
-
-    for (i = 0; i < N; i++) {
-        mask = a[i] >> (4*sizeof(int32_t) - 1);
-        a[i] += (p & mask) - p;
-        mask = a[i] >> (4*sizeof(int32_t) - 1);
-        a[i] += (p & mask);
-    }
-}
+/****************************************************************************************
+* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
+*
+*    Copyright (c) Microsoft Corporation. All rights reserved.
+*
+*
+* Abstract: NTT functions and other low-level operations
+*
+*****************************************************************************************/
+
+#include "../LatticeCrypto_priv.h"
+    
+
+void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N)
+{ // C entry point that forwards (a, psi_rev, N) unchanged to the assembly routine NTT_CT_std2rev_12289_asm
+    NTT_CT_std2rev_12289_asm(a, psi_rev, N);
+}
+
+
+void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N)
+{ // C entry point that forwards all five arguments, in the same order, to the assembly routine INTT_GS_rev2std_12289_asm
+    INTT_GS_rev2std_12289_asm(a, omegainv_rev, omegainv1N_rev, Ninv, N);
+}
+
+
+void two_reduce12289(int32_t* a, unsigned int N)
+{ // C entry point that forwards (a, N) unchanged to the assembly routine two_reduce12289_asm
+    two_reduce12289_asm(a, N);
+}
+
+
+void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N)
+{ // C entry point that forwards (a, b, c, N) unchanged to the assembly routine pmul_asm
+    pmul_asm(a, b, c, N);
+}
+
+
+void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N)
+{ // C entry point that forwards (a, b, c, d, N) unchanged to the assembly routine pmuladd_asm
+    pmuladd_asm(a, b, c, d, N);
+}
+
+
+void smul(int32_t* a, int32_t scalar, unsigned int N)
+{ // Multiplies each of the first N entries of "a" by "scalar", in place
+    unsigned int idx = 0;
+
+    while (idx < N) {
+        a[idx++] *= scalar;
+    }
+}
+
+
+void correction(int32_t* a, int32_t p, unsigned int N)
+{ // Branch-free correction of the first N entries of "a": subtract p when a[i] >= 0, then add p back if the result is negative
+    unsigned int i; 
+    int32_t mask;
+
+    for (i = 0; i < N; i++) {
+        mask = a[i] >> (8*sizeof(int32_t) - 1);    // Sign mask over the full 32-bit width: all ones if a[i] < 0, else 0 (relies on arithmetic >>)
+        a[i] += (p & mask) - p;                    // Subtract p only when a[i] was non-negative
+        mask = a[i] >> (8*sizeof(int32_t) - 1);    // Sign mask of the intermediate value
+        a[i] += (p & mask);                        // Add p back when the subtraction went negative
+    }
+}
diff --git a/dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S b/dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S
index e44c90dce0432847f4aa7670d487c35d4b44f593..9e8d89660a54c22cc43bc69e676a0f96b6175e9f 100755
--- a/dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S
+++ b/dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S
@@ -1,979 +1,979 @@
-//****************************************************************************************
-// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
-//
-//    Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-// Abstract: NTT functions in x64 assembly using AVX2 vector instructions for Linux 
-//
-//****************************************************************************************  
-
-.intel_syntax noprefix 
-
-// Registers that are used for parameter passing:
-#define reg_p1  rdi
-#define reg_p2  rsi
-#define reg_p3  rdx
-#define reg_p4  rcx
-#define reg_p5  r8
-
-
-.text
-//***********************************************************************
-//  Forward NTT
-//  Operation: a [reg_p1] <- NTT(a) [reg_p1], 
-//             [reg_p2] points to table and 
-//             reg_p3 contains parameter n
-//*********************************************************************** 
-.global NTT_CT_std2rev_12289_asm
-NTT_CT_std2rev_12289_asm:
-  push       r12
-  push       r13
-  push       r14
-
-// Stages m=1 -> m=32
-  mov        r9, 1            // m = 1
-  mov        rax, reg_p3 
-  mov        r12, reg_p3      
-  shr        r12, 4           // n/16
-  vmovdqu    ymm14, MASK12x8
-  vmovdqu    ymm12, PERM0246
-  mov        r14, 16
-  mov        rcx, 11
-loop1:
-  shr        rax, 1           // k = k/2
-  dec        rcx 
-  xor        rdx, rdx         // i = 0
-loop2:
-  mov        r10, rdx
-  mov        r11, rax
-  dec        r11
-  shl        r10, cl          // j1
-  add        r11, r10         // j2
-  mov        r13, r9
-  add        r13, rdx         // m+i
-  vbroadcastss ymm11, DWORD PTR [reg_p2+4*r13]   // S
-
-loop3:
-  mov        r13, r10
-  add        r13, rax         // j+k
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r13]    // a[j+k]
-  vpmovsxdq  ymm3, XMMWORD PTR [reg_p1+4*r13+16] // a[j+k]
-  vpmovsxdq  ymm5, XMMWORD PTR [reg_p1+4*r13+32] // a[j+k]
-  vpmovsxdq  ymm7, XMMWORD PTR [reg_p1+4*r13+48] // a[j+k]
-  
-  vpmuldq    ymm1, ymm1, ymm11                   // a[j+k].S
-  vpmuldq    ymm3, ymm3, ymm11                   
-  vpmuldq    ymm5, ymm5, ymm11                   
-  vpmuldq    ymm7, ymm7, ymm11   
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]
-
-  vmovdqu    ymm13, ymm1
-  vpand      ymm1, ymm14, ymm1                   // c0
-  vpsrlq     ymm13, ymm13, 12                    // c1
-  vpslld     ymm15, ymm1, 1                      // 2*c0
-  vpsubd     ymm13, ymm1, ymm13                  // c0-c1
-  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
-  vpsubd     ymm1, ymm0, ymm13                   // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm13                   // a[j] = U + V   
-  vpermd     ymm1, ymm12, ymm1 
-  vpermd     ymm0, ymm12, ymm0 
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j]
-
-  vmovdqu    ymm13, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm13, ymm13, 12                    // c1
-  vpslld     ymm15, ymm3, 1                      // 2*c0
-  vpsubd     ymm13, ymm3, ymm13                  // c0-c1
-  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
-  vpsubd     ymm3, ymm2, ymm13                   // a[j+k] = U - V
-  vpaddd     ymm2, ymm2, ymm13                   // a[j] = U + V  
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13], xmm1 
-  vpermd     ymm3, ymm12, ymm3 
-  vpermd     ymm2, ymm12, ymm2 
-  vpmovsxdq  ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]
-
-  vmovdqu    ymm13, ymm5
-  vpand      ymm5, ymm14, ymm5                   // c0
-  vpsrlq     ymm13, ymm13, 12                    // c1
-  vpslld     ymm15, ymm5, 1                      // 2*c0
-  vpsubd     ymm13, ymm5, ymm13                  // c0-c1
-  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
-  vpsubd     ymm5, ymm4, ymm13                   // a[j+k] = U - V
-  vpaddd     ymm4, ymm4, ymm13                   // a[j] = U + V  
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13+16], xmm3 
-  vpermd     ymm5, ymm12, ymm5 
-  vpermd     ymm4, ymm12, ymm4 
-  vpmovsxdq  ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j]
-
-  vmovdqu    ymm13, ymm7
-  vpand      ymm7, ymm14, ymm7                   // c0
-  vpsrlq     ymm13, ymm13, 12                    // c1
-  vpslld     ymm15, ymm7, 1                      // 2*c0
-  vpsubd     ymm13, ymm7, ymm13                  // c0-c1
-  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
-  vpsubd     ymm7, ymm6, ymm13                   // a[j+k] = U - V
-  vpaddd     ymm6, ymm6, ymm13                   // a[j] = U + V 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm4
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13+32], xmm5  
-  vpermd     ymm6, ymm12, ymm6   
-  vpermd     ymm7, ymm12, ymm7 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13+48], xmm7
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm6
-  
-  add        r10, r14
-  cmp        r10, r11
-  jl         loop3
-  inc        rdx
-  cmp        rdx, r9
-  jl         loop2
-  shl        r9, 1
-  cmp        r9, r12
-  jl         loop1
-   
-// Stage m=64
-  xor        rdx, rdx         // i = 0
-  xor        r10, r10         // j1 = 0
-loop4:
-  vbroadcastss ymm11, DWORD PTR [reg_p2+4*rdx+4*64] // S
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+32] // a[j+k]
-  vpmovsxdq  ymm3, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j]
-  vpmuldq    ymm1, ymm1, ymm11                   // a[j+k].S
-  vpmuldq    ymm3, ymm3, ymm11                   // a[j+k].S
-
-  vmovdqu    ymm13, ymm1
-  vpand      ymm1, ymm14, ymm1                   // c0
-  vpsrlq     ymm13, ymm13, 12                    // c1
-  vpslld     ymm15, ymm1, 1                      // 2*c0
-  vpsubd     ymm13, ymm1, ymm13                  // c0-c1
-  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1 
-  
-  vmovdqu    ymm10, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm10, ymm10, 12                    // c1
-  vpslld     ymm15, ymm3, 1                      // 2*c0
-  vpsubd     ymm10, ymm3, ymm10                  // c0-c1
-  vpaddd     ymm10, ymm10, ymm15                 // V = 3*c0-c1    
-  
-  vpsubd     ymm1, ymm0, ymm13                   // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm13                   // a[j] = U + V    
-  vpsubd     ymm3, ymm2, ymm10                   // a[j+k] = U - V
-  vpaddd     ymm2, ymm2, ymm10                   // a[j] = U + V 
-  
-  vpermd     ymm0, ymm12, ymm0 
-  vpermd     ymm1, ymm12, ymm1 
-  vpermd     ymm2, ymm12, ymm2 
-  vpermd     ymm3, ymm12, ymm3 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm1
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm3
-  
-  add        r10, r14        // j+16 
-  inc        rdx             // i+1
-  cmp        rdx, r9
-  jl         loop4
-   
-// Stage m=128
-  shl        r9, 1
-  xor        rdx, rdx         // i = 0
-  xor        r10, r10         // j1 = 0
-  mov        r13, 8 
-loop6:
-  vbroadcastss ymm2, DWORD PTR [reg_p2+4*rdx+4*128] // S
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]
-  vpmuldq    ymm1, ymm1, ymm2                    // a[j+k].S
-  
-  vmovdqu    ymm3, ymm0
-  vpand      ymm0, ymm14, ymm0                   // c0
-  vpsrad     ymm3, ymm3, 12                      // c1
-  vpslld     ymm4, ymm0, 1                       // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                    // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                    // U = 3*c0-c1    
-  
-  vmovdqu    ymm3, ymm1
-  vpand      ymm1, ymm14, ymm1                   // c0
-  vpsrlq     ymm4, ymm3, 24                      // c2
-  vpsrad     ymm3, ymm3, 12                      // xc1
-  vpand      ymm3, ymm14, ymm3                   // c1
-  vpslld     ymm5, ymm1, 3                       // 8*c0
-  vpaddd     ymm4, ymm1, ymm4                    // c0+c2
-  vpaddd     ymm4, ymm4, ymm5                    // 9*c0+c2
-  vpslld     ymm5, ymm3, 1                       // 2*c1
-  vpaddd     ymm1, ymm0, ymm3                    // U+c1
-  vpsubd     ymm0, ymm0, ymm3                    // U-c1
-  vpsubd     ymm4, ymm4, ymm5                    // 9*c0-2*c1+c2
-  vpaddd     ymm0, ymm0, ymm4                    // U+(9*c0-3*c1+c2)
-  vpsubd     ymm1, ymm1, ymm4                    // U-(9*c0-3*c1+c2)
-  vpermd     ymm0, ymm12, ymm0 
-  vpermd     ymm1, ymm12, ymm1 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm1
-
-  add        r10, r13        // j+8
-  inc        rdx             // i+1
-  cmp        rdx, r9
-  jl         loop6
-
-// Stage m=256 
-  vmovdqu    ymm9, PERM02134657  
-  shl        r9, 1
-  xor        rdx, rdx         // i = 0
-  xor        r10, r10         // j1 = 0
-  mov        r14, 32
-loop7:
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256]    // S = psi[m+i]->psi[m+i+3]
-  vpermq     ymm8, ymm2, 0x50   
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]->a[j+3]
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]->a[j+k+3]
-  vpermq     ymm3, ymm0, 0x4e    
-  vinserti128 ymm0, ymm0, xmm1, 1                // U
-  vpblendd   ymm1, ymm1, ymm3, 15
-  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm4, ymm4, 12                      // c1
-  vpslld     ymm5, ymm3, 1                       // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
-  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
-  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
-  vpslldq    ymm1, ymm1, 4    
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vpermd     ymm0, ymm9, ymm0 
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
-  
-  vpermq     ymm8, ymm2, 0xfa   
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+3]
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]->a[j+k+3]
-  vpermq     ymm3, ymm0, 0x4e    
-  vinserti128 ymm0, ymm0, xmm1, 1                // U
-  vpblendd   ymm1, ymm1, ymm3, 15
-  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm4, ymm4, 12                      // c1
-  vpslld     ymm5, ymm3, 1                       // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
-  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
-  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
-  vpslldq    ymm1, ymm1, 4    
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vpermd     ymm0, ymm9, ymm0 
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10+32], ymm0
-
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256+16]  // S = psi[m+i]->psi[m+i+3] 
-  vpermq     ymm8, ymm2, 0x50   
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+3]
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+80] // a[j+k]->a[j+k+3]
-  vpermq     ymm3, ymm0, 0x4e    
-  vinserti128 ymm0, ymm0, xmm1, 1                // U
-  vpblendd   ymm1, ymm1, ymm3, 15
-  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm4, ymm4, 12                      // c1
-  vpslld     ymm5, ymm3, 1                       // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
-  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
-  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
-  vpslldq    ymm1, ymm1, 4    
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vpermd     ymm0, ymm9, ymm0 
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10+64], ymm0
-          
-  vpermq     ymm8, ymm2, 0xfa   
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10+96]  // U = a[j]->a[j+3]
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+112] // a[j+k]->a[j+k+3]
-  vpermq     ymm3, ymm0, 0x4e    
-  vinserti128 ymm0, ymm0, xmm1, 1                // U
-  vpblendd   ymm1, ymm1, ymm3, 15
-  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm4, ymm4, 12                      // c1
-  vpslld     ymm5, ymm3, 1                       // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
-  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
-  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
-  vpslldq    ymm1, ymm1, 4    
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vpermd     ymm0, ymm9, ymm0 
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10+96], ymm0
-         
-  add        r10, r14        // j+32
-  add        rdx, r13        // i+8
-  cmp        rdx, r9
-  jl         loop7
-
-// Stage m=512
-  vmovdqu    ymm9, PERM00224466
-  shl        r9, 1            // m = n/2 
-  xor        rdx, rdx         // i = 0
-  xor        r10, r10         // j1 = 0
-  mov        r14, 4
-loop8:
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*rdx+4*512] // S
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10]    // U = a[j]
-  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*r10+4]  // a[j+k]
-  vpmuldq    ymm3, ymm1, ymm2                    // a[j+k].S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm4, ymm4, 12                      // c1
-  vpslld     ymm5, ymm3, 1                       // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
-  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
-  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
-  vpermd     ymm1, ymm9, ymm1 
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
-  
-  add        r10, r13        // j+8
-  add        rdx, r14        // i+4
-  cmp        rdx, r9
-  jl         loop8
-
-  pop        r14
-  pop        r13
-  pop        r12
-  ret
-
-
-//***********************************************************************
-//  Inverse NTT: in-place butterflies a[j] <- U + V, a[j+k] <- (U - V).S, followed by a scaling pass
-//  Operation: a [reg_p1] <- INTT(a) [reg_p1] (array of 32-bit coefficients),
-//             [reg_p2] points to table of 32-bit twiddle factors S
-//             reg_p3 and reg_p4 hold the scaling constants by value (moved into xmm0, never dereferenced) and
-//             reg_p5 contains parameter n; offsets such as 4*512 are hard-coded, which matches n = 1024 only
-//***********************************************************************
-.global INTT_GS_rev2std_12289_asm
-INTT_GS_rev2std_12289_asm:
-  push       r12             // r12-r15 and rbx are overwritten below; saved here, restored before ret
-  push       r13
-  push       r14
-  push       r15
-  push       rbx
-
-// Stage m=1024: butterflies on adjacent coefficients (V is loaded one dword after U)
-  vmovdqu    ymm9, PERM00224466   // vpermd index vector (constant defined outside this chunk)
-  vmovdqu    ymm14, MASK12x8      // mask that extracts c0 in every reduction below (defined outside this chunk)
-  mov        r12, reg_p5          // r12 = n
-  shr        r12, 1          // n/2 = 512
-  xor        r15, r15        // i = 0
-  xor        r10, r10        // j1 = 0
-  mov        r13, 8          // step of 8, also reused by the following stages
-  mov        r14, 4          // step of 4 twiddles per iteration
-loop1b:
-  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*r10+4]       // V = a[j+1]..a[j+8] (load shifted by one dword)
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10]         // U = a[j]..a[j+7]
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*r15+4*512]   // S = 4 twiddles from table[512+i], sign-extended to 64 bits
-  vpsubd     ymm3, ymm0, ymm1                         // U - V
-  vpaddd     ymm0, ymm0, ymm1                         // U + V 
-  vpmuldq    ymm3, ymm3, ymm2                         // (U - V).S, signed 64-bit product of the low dword of each qword
-  vmovdqu    ymm4, ymm3                               // keep the full product for c1
-  vpand      ymm3, ymm14, ymm3                        // c0 = product & MASK12x8
-  vpsrlq     ymm4, ymm4, 12                           // c1 = product >> 12 (logical shift of each 64-bit lane)
-  vpslld     ymm5, ymm3, 1                            // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
-  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1 (congruent to 3*product mod 12289 if the mask is 2^12-1, as 12289 = 3*2^12+1)
-  vpermd     ymm1, ymm9, ymm1                         // rearrange the reduced products with PERM00224466
-  vpblendd   ymm0, ymm0, ymm1, 0xaa                   // odd dwords from ymm1, even dwords keep U + V
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0         // write back 8 coefficients
-
-  add        r10, r13        // j+8
-  add        r15, r14        // i+4
-  cmp        r15, r12
-  jl         loop1b
-  
-// Stage m=512: 32 coefficients and 8 twiddles per iteration, in four groups of 8 coefficients
-  vmovdqu    ymm9, PERM02134657   // vpermd index vector applied to each group just before it is stored
-  vmovdqu    ymm13, PERM0145      // vpermd index vector that builds the U operand
-  vmovdqu    ymm15, PERM2367      // vpermd index vector that builds the V operand
-  shr        r12, 1          // n/4 = 256
-  xor        r15, r15        // i = 0
-  xor        r10, r10        // j1 = 0
-  mov        r14, 32         // step of 32 coefficients per iteration
-loop2b:
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*r15+4*256]   // S = psi[m+i]->psi[m+i+3]
-  vpermq     ymm8, ymm2, 0x50                         // 64-bit lanes = S0,S0,S1,S1
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10]         // group 0: a[j]->a[j+7]
-  vpermd     ymm1, ymm15, ymm0                        // V, gathered with PERM2367
-  vpermd     ymm0, ymm13, ymm0                        // U, gathered with PERM0145
-  vpsubd     ymm3, ymm0, ymm1                         // U - V
-  vpaddd     ymm0, ymm0, ymm1                         // U + V 
-  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm4, ymm4, 12                           // c1
-  vpslld     ymm5, ymm3, 1                            // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
-  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
-  vpslldq    ymm1, ymm1, 4                            // shift each 128-bit lane left by 4 bytes: results move to the odd dwords
-  vpblendd   ymm0, ymm0, ymm1, 0xaa                   // odd dwords from ymm1, even dwords keep U + V
-  vpermd     ymm0, ymm9, ymm0                         // reorder with PERM02134657 before storing
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
-  
-  vpermq     ymm8, ymm2, 0xfa                         // 64-bit lanes = S2,S2,S3,S3
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10+32]      // group 1: a[j+8]->a[j+15]
-  vpermd     ymm1, ymm15, ymm0                        // V
-  vpermd     ymm0, ymm13, ymm0                        // U
-  vpsubd     ymm3, ymm0, ymm1                         // U - V
-  vpaddd     ymm0, ymm0, ymm1                         // U + V 
-  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm4, ymm4, 12                           // c1
-  vpslld     ymm5, ymm3, 1                            // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
-  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
-  vpslldq    ymm1, ymm1, 4                            // results to the odd dwords
-  vpblendd   ymm0, ymm0, ymm1, 0xaa                   // merge with U + V
-  vpermd     ymm0, ymm9, ymm0                         // reorder before storing
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10+32], ymm0
-
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*r15+4*256+16]// S = psi[m+i+4]->psi[m+i+7] (16 bytes = 4 entries further)
-  vpermq     ymm8, ymm2, 0x50                         // 64-bit lanes = S4,S4,S5,S5
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10+64]      // group 2: a[j+16]->a[j+23]
-  vpermd     ymm1, ymm15, ymm0                        // V
-  vpermd     ymm0, ymm13, ymm0                        // U
-  vpsubd     ymm3, ymm0, ymm1                         // U - V
-  vpaddd     ymm0, ymm0, ymm1                         // U + V 
-  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm4, ymm4, 12                           // c1
-  vpslld     ymm5, ymm3, 1                            // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
-  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
-  vpslldq    ymm1, ymm1, 4                            // results to the odd dwords
-  vpblendd   ymm0, ymm0, ymm1, 0xaa                   // merge with U + V
-  vpermd     ymm0, ymm9, ymm0                         // reorder before storing
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10+64], ymm0
-         
-  vpermq     ymm8, ymm2, 0xfa                         // 64-bit lanes = S6,S6,S7,S7
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10+96]      // group 3: a[j+24]->a[j+31]
-  vpermd     ymm1, ymm15, ymm0                        // V
-  vpermd     ymm0, ymm13, ymm0                        // U
-  vpsubd     ymm3, ymm0, ymm1                         // U - V
-  vpaddd     ymm0, ymm0, ymm1                         // U + V 
-  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm4, ymm4, 12                           // c1
-  vpslld     ymm5, ymm3, 1                            // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
-  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
-  vpslldq    ymm1, ymm1, 4                            // results to the odd dwords
-  vpblendd   ymm0, ymm0, ymm1, 0xaa                   // merge with U + V
-  vpermd     ymm0, ymm9, ymm0                         // reorder before storing
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10+96], ymm0
-         
-  add        r10, r14        // j+32
-  add        r15, r13        // i+8
-  cmp        r15, r12
-  jl         loop2b
-     
-// Stage m=256: one twiddle per iteration, U and V are 4 coefficients each
-  vmovdqu    ymm12, PERM0246      // vpermd index vector applied to every result before its 128-bit store
-  shr        r12, 1          // n/8 = 128
-  xor        r15, r15        // i = 0
-  xor        r10, r10        // j1 = 0
-loop3b:
-  vbroadcastss ymm2, DWORD PTR [reg_p2+4*r15+4*128]   // S = table[128+i], copied to every dword
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+16]      // V = a[j+4]..a[j+7], sign-extended to 64-bit lanes
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]..a[j+3], sign-extended to 64-bit lanes
-  vpsubd     ymm3, ymm0, ymm1                         // U - V
-  vpaddd     ymm0, ymm0, ymm1                         // U + V 
-  vpmuldq    ymm3, ymm3, ymm2                         // (U - V).S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm4, ymm4, 12                           // c1
-  vpslld     ymm5, ymm3, 1                            // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
-  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1 
-  vpermd     ymm0, ymm12, ymm0                        // permute with PERM0246; only the low 128 bits are stored
-  vpermd     ymm1, ymm12, ymm1                        // same for the reduced products
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0         // a[j]..a[j+3]
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm1      // a[j+4]..a[j+7]
-  
-  add        r10, r13        // j+8
-  inc        r15             // i+1
-  cmp        r15, r12
-  jl         loop3b
-     
-// Stage m=128: one twiddle per iteration, U and V are 8 coefficients each
-  shr        r12, 1          // n/16 = 64
-  xor        r15, r15        // i = 0
-  xor        r10, r10        // j1 = 0
-  mov        r14, 16         // step of 16 coefficients; still in r14 for loop7b below
-loop4b:
-  vbroadcastss ymm11, DWORD PTR [reg_p2+4*r15+4*64]   // S = table[64+i], copied to every dword
-  vpmovsxdq  ymm13, XMMWORD PTR [reg_p1+4*r10+32]     // V = a[j+8]..a[j+11]
-  vpmovsxdq  ymm15, XMMWORD PTR [reg_p1+4*r10+48]     // V = a[j+12]..a[j+15]
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]..a[j+3]
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16]      // U = a[j+4]..a[j+7]
-  vpsubd     ymm1, ymm0, ymm13                        // U - V
-  vpaddd     ymm0, ymm0, ymm13                        // U + V 
-  vpsubd     ymm3, ymm2, ymm15                        // U - V
-  vpaddd     ymm2, ymm2, ymm15                        // U + V   
-  vpmuldq    ymm1, ymm1, ymm11                        // (U - V).S
-  vpmuldq    ymm3, ymm3, ymm11                        // (U - V).S
-  
-  vmovdqu    ymm13, ymm1                              // ymm13/ymm15 are free again: used as reduction temporaries
-  vpand      ymm1, ymm14, ymm1                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm1, 1                           // 2*c0
-  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
-  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1    
-
-  vmovdqu    ymm13, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm3, 1                           // 2*c0
-  vpsubd     ymm13, ymm3, ymm13                       // c0-c1
-  vpaddd     ymm3, ymm13, ymm15                       // 3*c0-c1 
-  
-  vpermd     ymm0, ymm12, ymm0                        // permute with PERM0246 before the 128-bit stores
-  vpermd     ymm1, ymm12, ymm1 
-  vpermd     ymm2, ymm12, ymm2 
-  vpermd     ymm3, ymm12, ymm3 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0         // a[j]..a[j+3]     = U + V
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm1      // a[j+8]..a[j+11]  = reduced (U - V).S
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2      // a[j+4]..a[j+7]   = U + V
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm3      // a[j+12]..a[j+15] = reduced (U - V).S
-  
-  add        r10, r14        // j+16 
-  inc        r15             // i+1
-  cmp        r15, r12
-  jl         loop4b
-  
-// Stages m=64 -> m=4: generic triple loop (pass / twiddle / 16-coefficient chunk)
-  mov        r9, 5            // 5 iterations
-  mov        rax, 8           // k; doubled at the top of every pass, so the passes use k = 16, 32, 64, 128, 256
-loop5b:
-  shl        rax, 1          // k = 2*k
-  shr        r12, 1          // m/2
-  xor        r15, r15        // i = 0
-  xor        r8, r8          // j1 = 0
-loop6b:
-  mov        r10, r8         // Load j1
-  mov        r11, rax
-  dec        r11
-  add        r11, r10        // j2 = j1 + k - 1
-  mov        r13, r12
-  add        r13, r15        // m/2+i
-  vbroadcastss ymm9, DWORD PTR [reg_p2+4*r13]         // S = table[m/2+i], copied to every dword
-  mov        rbx, 4          // compared with r9 below: the pass with r9 == 4 gets an extra reduction
-
-loop7b:
-  mov        r13, r10
-  add        r13, rax         // j+k
-  vpmovsxdq  ymm10, XMMWORD PTR [reg_p1+4*r13]        // V = a[j+k]..a[j+k+3]
-  vpmovsxdq  ymm11, XMMWORD PTR [reg_p1+4*r13+16]     // V = a[j+k+4]..a[j+k+7]
-  vpmovsxdq  ymm13, XMMWORD PTR [reg_p1+4*r13+32]     // V = a[j+k+8]..a[j+k+11]
-  vpmovsxdq  ymm15, XMMWORD PTR [reg_p1+4*r13+48]     // V = a[j+k+12]..a[j+k+15]
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]..a[j+3]
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16]      // U = a[j+4]..a[j+7]
-  vpmovsxdq  ymm4, XMMWORD PTR [reg_p1+4*r10+32]      // U = a[j+8]..a[j+11]
-  vpmovsxdq  ymm6, XMMWORD PTR [reg_p1+4*r10+48]      // U = a[j+12]..a[j+15]
-  
-  vpsubd     ymm1, ymm0, ymm10                        // U - V
-  vpaddd     ymm0, ymm0, ymm10                        // U + V 
-  vpsubd     ymm3, ymm2, ymm11                        // U - V
-  vpaddd     ymm2, ymm2, ymm11                        // U + V 
-  vpsubd     ymm5, ymm4, ymm13                        // U - V
-  vpaddd     ymm4, ymm4, ymm13                        // U + V 
-  vpsubd     ymm7, ymm6, ymm15                        // U - V
-  vpaddd     ymm6, ymm6, ymm15                        // U + V 
-
-  vpmuldq    ymm1, ymm1, ymm9                         // (U - V).S
-  vpmuldq    ymm3, ymm3, ymm9                         // (U - V).S
-  vpmuldq    ymm5, ymm5, ymm9                         // (U - V).S
-  vpmuldq    ymm7, ymm7, ymm9                         // (U - V).S
-  
-  vmovdqu    ymm13, ymm1                              // reduce the first product (64-bit logical shift)
-  vpand      ymm1, ymm14, ymm1                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm1, 1                           // 2*c0
-  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
-  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1 
-
-  cmp        r9, rbx                                  // extra reduction only in the pass where r9 == 4
-  jne        skip1
-  vmovdqu    ymm13, ymm0                              // reduce U + V (32-bit arithmetic shift)
-  vpand      ymm0, ymm14, ymm0                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1       
-  vpslld     ymm15, ymm0, 1                           // 2*c0
-  vpsubd     ymm13, ymm0, ymm13                       // c0-c1
-  vpaddd     ymm0, ymm13, ymm15                       // 3*c0-c1
-
-  vmovdqu    ymm13, ymm1                              // reduce the already reduced product a second time
-  vpand      ymm1, ymm14, ymm1                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm1, 1                           // 2*c0
-  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
-  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1
-skip1:
-  vpermd     ymm1, ymm12, ymm1                        // permute with PERM0246 before the 128-bit stores
-  vpermd     ymm0, ymm12, ymm0 
-
-  vmovdqu    ymm13, ymm3                              // reduce the second product
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm3, 1                           // 2*c0
-  vpsubd     ymm13, ymm3, ymm13                       // c0-c1
-  vpaddd     ymm3, ymm13, ymm15                       // 3*c0-c1 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0         // a[j]..a[j+3]
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13], xmm1         // a[j+k]..a[j+k+3]
-
-  cmp        r9, rbx                                  // same pass test as above
-  jne        skip2
-  vmovdqu    ymm13, ymm2                              // reduce U + V
-  vpand      ymm2, ymm14, ymm2                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1       
-  vpslld     ymm15, ymm2, 1                           // 2*c0
-  vpsubd     ymm13, ymm2, ymm13                       // c0-c1
-  vpaddd     ymm2, ymm13, ymm15                       // 3*c0-c1
-
-  vmovdqu    ymm13, ymm3                              // second reduction of the product
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm3, 1                           // 2*c0
-  vpsubd     ymm13, ymm3, ymm13                       // c0-c1
-  vpaddd     ymm3, ymm13, ymm15                       // 3*c0-c1
-skip2:
-  vpermd     ymm3, ymm12, ymm3                        // permute with PERM0246 before the 128-bit stores
-  vpermd     ymm2, ymm12, ymm2 
-
-  vmovdqu    ymm13, ymm5                              // reduce the third product
-  vpand      ymm5, ymm14, ymm5                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm5, 1                           // 2*c0
-  vpsubd     ymm13, ymm5, ymm13                       // c0-c1
-  vpaddd     ymm5, ymm13, ymm15                       // 3*c0-c1 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2      // a[j+4]..a[j+7]
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13+16], xmm3      // a[j+k+4]..a[j+k+7]
-
-  cmp        r9, rbx                                  // same pass test as above
-  jne        skip3
-  vmovdqu    ymm13, ymm4                              // reduce U + V
-  vpand      ymm4, ymm14, ymm4                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1       
-  vpslld     ymm15, ymm4, 1                           // 2*c0
-  vpsubd     ymm13, ymm4, ymm13                       // c0-c1
-  vpaddd     ymm4, ymm13, ymm15                       // 3*c0-c1
-
-  vmovdqu    ymm13, ymm5                              // second reduction of the product
-  vpand      ymm5, ymm14, ymm5                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm5, 1                           // 2*c0
-  vpsubd     ymm13, ymm5, ymm13                       // c0-c1
-  vpaddd     ymm5, ymm13, ymm15                       // 3*c0-c1
-skip3:
-  vpermd     ymm5, ymm12, ymm5                        // permute with PERM0246 before the 128-bit stores
-  vpermd     ymm4, ymm12, ymm4 
-
-  vmovdqu    ymm13, ymm7                              // reduce the fourth product
-  vpand      ymm7, ymm14, ymm7                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm7, 1                           // 2*c0
-  vpsubd     ymm13, ymm7, ymm13                       // c0-c1
-  vpaddd     ymm7, ymm13, ymm15                       // 3*c0-c1 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm4      // a[j+8]..a[j+11]
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13+32], xmm5      // a[j+k+8]..a[j+k+11]
-
-  cmp        r9, rbx                                  // same pass test as above
-  jne        skip4
-  vmovdqu    ymm13, ymm6                              // reduce U + V
-  vpand      ymm6, ymm14, ymm6                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1       
-  vpslld     ymm15, ymm6, 1                           // 2*c0
-  vpsubd     ymm13, ymm6, ymm13                       // c0-c1
-  vpaddd     ymm6, ymm13, ymm15                       // 3*c0-c1
-
-  vmovdqu    ymm13, ymm7                              // second reduction of the product
-  vpand      ymm7, ymm14, ymm7                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm7, 1                           // 2*c0
-  vpsubd     ymm13, ymm7, ymm13                       // c0-c1
-  vpaddd     ymm7, ymm13, ymm15                       // 3*c0-c1
-skip4:
-  vpermd     ymm7, ymm12, ymm7                        // permute with PERM0246 before the 128-bit stores
-  vpermd     ymm6, ymm12, ymm6   
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13+48], xmm7      // a[j+k+12]..a[j+k+15]
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm6      // a[j+12]..a[j+15]
-  
-  add        r10, r14        // j+16 (r14 still holds 16 from stage m=128)
-  cmp        r10, r11        // continue while j < j2
-  jl         loop7b
-  mov        rbx, rax        // rbx is reused as scratch here; loop6b reloads it with 4
-  shl        rbx, 1          // 2*k
-  add        r8, rbx         // j1+2*k
-  inc        r15             // i+1
-  cmp        r15, r12
-  jl         loop6b
-  dec        r9              // one pass done
-  jnz        loop5b
-       
-// Scaling step, merged with the last butterfly (V is 512 coefficients after U)
-  shl        rax, 1          // k = 2*k = 512
-  xor        r10, r10        // j = 0
-  mov        r14, 4          // step of 4 coefficients per iteration
-  movq       xmm0, reg_p3                             // constant passed by value in reg_p3
-  vbroadcastsd ymm10, xmm0                            // S = omegainv1N_rev
-  movq       xmm0, reg_p4                             // constant passed by value in reg_p4
-  vbroadcastsd ymm11, xmm0                            // T = Ninv
-loop8b:
-  vpmovsxdq  ymm13, XMMWORD PTR [reg_p1+4*r10+4*512]  // V = a[j+k]
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
-  vpsubd     ymm1, ymm0, ymm13                        // U - V
-  vpaddd     ymm0, ymm0, ymm13                        // U + V  
-  vpmuldq    ymm1, ymm1, ymm10                        // (U - V).S
-  vpmuldq    ymm0, ymm0, ymm11                        // (U + V).T
-  
-  vmovdqu    ymm13, ymm0                              // reduce (U + V).T
-  vpand      ymm0, ymm14, ymm0                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm0, 1                           // 2*c0
-  vpsubd     ymm13, ymm0, ymm13                       // c0-c1
-  vpaddd     ymm0, ymm13, ymm15                       // 3*c0-c1    
-
-  vmovdqu    ymm13, ymm1                              // reduce (U - V).S
-  vpand      ymm1, ymm14, ymm1                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm1, 1                           // 2*c0
-  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
-  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1 
-  
-  vpermd     ymm0, ymm12, ymm0                        // permute with PERM0246 before the 128-bit stores
-  vpermd     ymm1, ymm12, ymm1 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0         // a[j]..a[j+3]
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+4*512], xmm1   // a[j+512]..a[j+515]
-  
-  add        r10, r14        // j+4 
-  cmp        r10, rax        // continue while j < 512
-  jl         loop8b  
-loop9b:                      // epilogue; no jump inside this routine targets this label
-  pop        rbx             // restore in reverse order of the pushes
-  pop        r15
-  pop        r14
-  pop        r13
-  pop        r12
-  ret
-
-
-//***********************************************************************
-//  Component-wise multiplication and addition on 32-bit coefficients, reduced twice before storing
-//  Operation: d [reg_p4] <- a [reg_p1] * b [reg_p2] + c [reg_p3]
-//             reg_p5 contains parameter n (4 coefficients per iteration; the loop body always runs at least once)
-//***********************************************************************
-.global pmuladd_asm
-pmuladd_asm:
-  vmovdqu    ymm5, PERM0246                     // vpermd index vector applied before the 128-bit store (defined outside this chunk)
-  vmovdqu    ymm6, MASK12x8                     // mask that extracts c0 (defined outside this chunk)
-  xor        rax, rax                           // j = 0
-  movq       r11, 4                             // step: 4 coefficients per iteration
-lazo2:
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*rax]   // a[j]..a[j+3], sign-extended to 64-bit lanes
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p2+4*rax]   // b[j]..b[j+3]
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p3+4*rax]   // c[j]..c[j+3]
-  vpmuldq    ymm0, ymm1, ymm0                   // a*b as signed 64-bit products
-  vpaddq     ymm0, ymm2, ymm0                   // a*b + c (64-bit add)
-
-  vmovdqu    ymm3, ymm0                         // first reduction, on the 64-bit value
-  vpand      ymm0, ymm6, ymm0                   // c0
-  vpsrlq     ymm3, ymm3, 12                     // c1
-  vpslld     ymm4, ymm0, 1                      // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1 
-
-  vmovdqu    ymm3, ymm0                         // second reduction, 32-bit lanes with arithmetic shift
-  vpand      ymm0, ymm6, ymm0                   // c0
-  vpsrad     ymm3, ymm3, 12                     // c1       
-  vpslld     ymm4, ymm0, 1                      // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1
-
-  vpermd     ymm0, ymm5, ymm0                   // permute with PERM0246; only the low 128 bits are stored
-  vmovdqu    XMMWORD PTR [reg_p4+4*rax], xmm0   // d[j]..d[j+3]
-
-  add        rax, r11                           // j+4
-  cmp        rax, reg_p5                        // continue while j < n (signed compare)
-  jl         lazo2
-  ret
-
-
-//***********************************************************************
-//  Component-wise multiplication on 32-bit coefficients, reduced twice before storing
-//  Operation: c [reg_p3] <- a [reg_p1] * b [reg_p2]
-//             reg_p4 contains parameter n (4 coefficients per iteration; the loop body always runs at least once)
-//***********************************************************************
-.global pmul_asm
-pmul_asm:                                       // same structure as pmuladd_asm without the addition
-  vmovdqu    ymm5, PERM0246                     // vpermd index vector applied before the 128-bit store (defined outside this chunk)
-  vmovdqu    ymm6, MASK12x8                     // mask that extracts c0 (defined outside this chunk)
-  xor        rax, rax                           // j = 0
-  movq       r11, 4                             // step: 4 coefficients per iteration
-lazo3:
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*rax]   // a[j]..a[j+3], sign-extended to 64-bit lanes
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p2+4*rax]   // b[j]..b[j+3]
-  vpmuldq    ymm0, ymm1, ymm0                   // a*b as signed 64-bit products
-
-  vmovdqu    ymm3, ymm0                         // first reduction, on the 64-bit product
-  vpand      ymm0, ymm6, ymm0                   // c0
-  vpsrlq     ymm3, ymm3, 12                     // c1
-  vpslld     ymm4, ymm0, 1                      // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1 
-
-  vmovdqu    ymm3, ymm0                         // second reduction, 32-bit lanes with arithmetic shift
-  vpand      ymm0, ymm6, ymm0                   // c0
-  vpsrad     ymm3, ymm3, 12                     // c1       
-  vpslld     ymm4, ymm0, 1                      // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1
-
-  vpermd     ymm0, ymm5, ymm0                   // permute with PERM0246; only the low 128 bits are stored
-  vmovdqu    XMMWORD PTR [reg_p3+4*rax], xmm0   // c[j]..c[j+3]
-
-  add        rax, r11                           // j+4
-  cmp        rax, reg_p4                        // continue while j < n (signed compare)
-  jl         lazo3
-  ret
-
-
-//***********************************************************************
-//  Two consecutive reductions (each computes 3*c0-c1), then a sign-based correction with PRIME8x
-//  Operation: c [reg_p1] <- a [reg_p1] (in place, 32-bit coefficients)
-//             reg_p2 contains parameter n (8 coefficients per iteration; the loop body always runs at least once)
-//***********************************************************************
-.global two_reduce12289_asm
-two_reduce12289_asm:
-  vmovdqu    ymm6, MASK12x8                     // mask that extracts c0 (defined outside this chunk)
-  vmovdqu    ymm7, PRIME8x                      // presumably the modulus 12289 in every dword (defined outside this chunk)
-  xor        rax, rax                           // j = 0
-  movq       r11, 8                             // step: 8 coefficients per iteration
-lazo4:
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]   // a[j]..a[j+7]
-
-  vmovdqu    ymm3, ymm0                         // first reduction (32-bit lanes, arithmetic shift)
-  vpand      ymm0, ymm6, ymm0                   // c0
-  vpsrad     ymm3, ymm3, 12                     // c1
-  vpslld     ymm4, ymm0, 1                      // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1 
-
-  vmovdqu    ymm3, ymm0                         // second reduction, same steps
-  vpand      ymm0, ymm6, ymm0                   // c0
-  vpsrad     ymm3, ymm3, 12                     // c1       
-  vpslld     ymm4, ymm0, 1                      // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1
-
-  vpsrad     ymm2, ymm0, 31                     // all-ones in lanes whose value is negative, zero elsewhere
-  vpand      ymm2, ymm7, ymm2                   // PRIME8x in the negative lanes, 0 elsewhere
-  vpaddd     ymm2, ymm0, ymm2                   // add PRIME8x to the negative lanes
-  vpsubd     ymm0, ymm2, ymm7                   // then subtract PRIME8x from every lane
-
-  vpsrad     ymm2, ymm0, 31                     // sign mask of the difference
-  vpand      ymm2, ymm7, ymm2                   // PRIME8x where the subtraction went negative
-  vpaddd     ymm0, ymm0, ymm2                   // add it back in those lanes
-
-  vmovdqu    YMMWORD PTR [reg_p1+4*rax], ymm0   // write the 8 corrected coefficients back
-
-  add        rax, r11                           // j+8
-  cmp        rax, reg_p2                        // continue while j < n (signed compare)
-  jl         lazo4
-  ret
-
-
-//***********************************************************************
-//  Encoding: packs 1024 32-bit coefficients, 8 per iteration into 14 output bytes (14 bits per coefficient)
-//  Operation: c [reg_p2] <- a [reg_p1]; NOTE(review): the 16-byte stores overrun the 14-byte stride, see below
-//***********************************************************************
-.global encode_asm
-encode_asm: 
-  vmovdqu    ymm6, MASK32                       // bit masks for the packing below (defined outside this chunk)
-  vmovdqu    ymm7, MASK42
-  mov        r9, 1024                           // hard-coded number of coefficients
-  xor        rax, rax                           // j = 0 (input index, in coefficients)
-  xor        r10, r10                           // output offset in bytes
-  mov        r11, 14                            // output step: 14 bytes per iteration
-  mov        rcx, 8                             // input step: 8 coefficients per iteration
-lazo5:
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]   // a[j]..a[j+7]
-
-  vpsrlq     ymm1, ymm0, 18                     // shifts and masks below squeeze each 128-bit lane (4 coefficients) together
-  vpsllq     ymm2, ymm0, 4
-  vpand      ymm0, ymm0, ymm6
-  vpsrldq    ymm2, ymm2, 5                      // byte shift within each 128-bit lane
-  vpsrlq     ymm3, ymm1, 4
-  vpand      ymm1, ymm1, ymm6
-  vpand      ymm2, ymm2, ymm7
-  vpsrldq    ymm3, ymm3, 4                      // byte shift within each 128-bit lane
-  vpor       ymm0, ymm0, ymm1                   // merge the four partial results
-  vpor       ymm0, ymm0, ymm2 
-  vpor       ymm0, ymm0, ymm3 
-  vpermq     ymm1, ymm0, 0x0e                   // low 128 bits of ymm1 = high 128-bit lane of ymm0
-
-  vmovdqu    XMMWORD PTR [reg_p2+r10], xmm0     // low lane at byte offset r10 (16 bytes written)
-  vmovdqu    XMMWORD PTR [reg_p2+r10+7], xmm1   // high lane 7 bytes later; writes through r10+22, so the last iteration (r10 = 1778) touches 9 bytes past the 1792 packed bytes
-
-  add        r10, r11                           // output offset + 14
-  add        rax, rcx        // j+8
-  cmp        rax, r9                            // continue while j < 1024
-  jl         lazo5
-  ret
-
-
-//***********************************************************************
-//  Decoding
-//  Operation: c [reg_p2] <- a [reg_p1]
-//*********************************************************************** 
-.global decode_asm
-decode_asm: 
-  vmovdqu    ymm6, MASK14_1 
-  vmovdqu    ymm7, MASK14_2
-  vmovdqu    ymm8, MASK14_3
-  vmovdqu    ymm9, MASK14_4
-  mov        r9, 1024
-  xor        rax, rax
-  xor        r10, r10
-  mov        r11, 14
-  mov        rcx, 8
-lazo6:
-  vmovdqu    xmm0, XMMWORD PTR [reg_p1+r10]
-  vmovdqu    xmm1, XMMWORD PTR [reg_p1+r10+7]
-  vinserti128 ymm0, ymm0, xmm1, 1               
-
-  vpand      ymm1, ymm0, ymm6
-  vpand      ymm2, ymm0, ymm7
-  vpand      ymm3, ymm0, ymm8
-  vpand      ymm4, ymm0, ymm9
-   
-  vpsllq     ymm2, ymm2, 18 
-  vpsllq     ymm3, ymm3, 4
-  vpslldq    ymm3, ymm3, 4 
-  vpsrlq     ymm4, ymm4, 2
-  vpslldq    ymm4, ymm4, 7
-
-  vpor       ymm1, ymm1, ymm2 
-  vpor       ymm1, ymm1, ymm3 
-  vpor       ymm1, ymm1, ymm4 
-  
-  vmovdqu    YMMWORD PTR [reg_p2+4*rax], ymm1   
-
-  add        r10, r11
-  add        rax, rcx            // j+8
-  cmp        rax, r9
-  jl         lazo6
+//****************************************************************************************
+// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
+//
+//    Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+// Abstract: NTT functions in x64 assembly using AVX2 vector instructions for Linux 
+//
+//****************************************************************************************  
+
+.intel_syntax noprefix 
+
+// Registers that are used for parameter passing: rdi, rsi, rdx, rcx, r8 are, in this order, the first five integer/pointer argument registers of the System V AMD64 calling convention (the Linux target named in the file header)
+#define reg_p1  rdi
+#define reg_p2  rsi
+#define reg_p3  rdx
+#define reg_p4  rcx
+#define reg_p5  r8
+
+
+.text
+//***********************************************************************
+//  Forward NTT
+//  Operation: a [reg_p1] <- NTT(a) [reg_p1], 
+//             [reg_p2] points to table and 
+//             reg_p3 contains parameter n
+//*********************************************************************** 
+.global NTT_CT_std2rev_12289_asm
+NTT_CT_std2rev_12289_asm:
+  push       r12
+  push       r13
+  push       r14
+
+// Stages m=1 -> m=32
+  mov        r9, 1            // m = 1
+  mov        rax, reg_p3 
+  mov        r12, reg_p3      
+  shr        r12, 4           // n/16
+  vmovdqu    ymm14, MASK12x8
+  vmovdqu    ymm12, PERM0246
+  mov        r14, 16
+  mov        rcx, 11
+loop1:
+  shr        rax, 1           // k = k/2
+  dec        rcx 
+  xor        rdx, rdx         // i = 0
+loop2:
+  mov        r10, rdx
+  mov        r11, rax
+  dec        r11
+  shl        r10, cl          // j1
+  add        r11, r10         // j2
+  mov        r13, r9
+  add        r13, rdx         // m+i
+  vbroadcastss ymm11, DWORD PTR [reg_p2+4*r13]   // S
+
+loop3:
+  mov        r13, r10
+  add        r13, rax         // j+k
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r13]    // a[j+k]
+  vpmovsxdq  ymm3, XMMWORD PTR [reg_p1+4*r13+16] // a[j+k]
+  vpmovsxdq  ymm5, XMMWORD PTR [reg_p1+4*r13+32] // a[j+k]
+  vpmovsxdq  ymm7, XMMWORD PTR [reg_p1+4*r13+48] // a[j+k]
+  
+  vpmuldq    ymm1, ymm1, ymm11                   // a[j+k].S
+  vpmuldq    ymm3, ymm3, ymm11                   
+  vpmuldq    ymm5, ymm5, ymm11                   
+  vpmuldq    ymm7, ymm7, ymm11   
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]
+
+  vmovdqu    ymm13, ymm1
+  vpand      ymm1, ymm14, ymm1                   // c0
+  vpsrlq     ymm13, ymm13, 12                    // c1
+  vpslld     ymm15, ymm1, 1                      // 2*c0
+  vpsubd     ymm13, ymm1, ymm13                  // c0-c1
+  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
+  vpsubd     ymm1, ymm0, ymm13                   // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm13                   // a[j] = U + V   
+  vpermd     ymm1, ymm12, ymm1 
+  vpermd     ymm0, ymm12, ymm0 
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j]
+
+  vmovdqu    ymm13, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm13, ymm13, 12                    // c1
+  vpslld     ymm15, ymm3, 1                      // 2*c0
+  vpsubd     ymm13, ymm3, ymm13                  // c0-c1
+  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
+  vpsubd     ymm3, ymm2, ymm13                   // a[j+k] = U - V
+  vpaddd     ymm2, ymm2, ymm13                   // a[j] = U + V  
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13], xmm1 
+  vpermd     ymm3, ymm12, ymm3 
+  vpermd     ymm2, ymm12, ymm2 
+  vpmovsxdq  ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]
+
+  vmovdqu    ymm13, ymm5
+  vpand      ymm5, ymm14, ymm5                   // c0
+  vpsrlq     ymm13, ymm13, 12                    // c1
+  vpslld     ymm15, ymm5, 1                      // 2*c0
+  vpsubd     ymm13, ymm5, ymm13                  // c0-c1
+  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
+  vpsubd     ymm5, ymm4, ymm13                   // a[j+k] = U - V
+  vpaddd     ymm4, ymm4, ymm13                   // a[j] = U + V  
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13+16], xmm3 
+  vpermd     ymm5, ymm12, ymm5 
+  vpermd     ymm4, ymm12, ymm4 
+  vpmovsxdq  ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j]
+
+  vmovdqu    ymm13, ymm7
+  vpand      ymm7, ymm14, ymm7                   // c0
+  vpsrlq     ymm13, ymm13, 12                    // c1
+  vpslld     ymm15, ymm7, 1                      // 2*c0
+  vpsubd     ymm13, ymm7, ymm13                  // c0-c1
+  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
+  vpsubd     ymm7, ymm6, ymm13                   // a[j+k] = U - V
+  vpaddd     ymm6, ymm6, ymm13                   // a[j] = U + V 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm4
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13+32], xmm5  
+  vpermd     ymm6, ymm12, ymm6   
+  vpermd     ymm7, ymm12, ymm7 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13+48], xmm7
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm6
+  
+  add        r10, r14
+  cmp        r10, r11
+  jl         loop3
+  inc        rdx
+  cmp        rdx, r9
+  jl         loop2
+  shl        r9, 1
+  cmp        r9, r12
+  jl         loop1
+   
+// Stage m=64
+  xor        rdx, rdx         // i = 0
+  xor        r10, r10         // j1 = 0
+loop4:
+  vbroadcastss ymm11, DWORD PTR [reg_p2+4*rdx+4*64] // S
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+32] // a[j+k]
+  vpmovsxdq  ymm3, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j]
+  vpmuldq    ymm1, ymm1, ymm11                   // a[j+k].S
+  vpmuldq    ymm3, ymm3, ymm11                   // a[j+k].S
+
+  vmovdqu    ymm13, ymm1
+  vpand      ymm1, ymm14, ymm1                   // c0
+  vpsrlq     ymm13, ymm13, 12                    // c1
+  vpslld     ymm15, ymm1, 1                      // 2*c0
+  vpsubd     ymm13, ymm1, ymm13                  // c0-c1
+  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1 
+  
+  vmovdqu    ymm10, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm10, ymm10, 12                    // c1
+  vpslld     ymm15, ymm3, 1                      // 2*c0
+  vpsubd     ymm10, ymm3, ymm10                  // c0-c1
+  vpaddd     ymm10, ymm10, ymm15                 // V = 3*c0-c1    
+  
+  vpsubd     ymm1, ymm0, ymm13                   // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm13                   // a[j] = U + V    
+  vpsubd     ymm3, ymm2, ymm10                   // a[j+k] = U - V
+  vpaddd     ymm2, ymm2, ymm10                   // a[j] = U + V 
+  
+  vpermd     ymm0, ymm12, ymm0 
+  vpermd     ymm1, ymm12, ymm1 
+  vpermd     ymm2, ymm12, ymm2 
+  vpermd     ymm3, ymm12, ymm3 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm1
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm3
+  
+  add        r10, r14        // j+16 
+  inc        rdx             // i+1
+  cmp        rdx, r9
+  jl         loop4
+   
+// Stage m=128
+  shl        r9, 1
+  xor        rdx, rdx         // i = 0
+  xor        r10, r10         // j1 = 0
+  mov        r13, 8 
+loop6:
+  vbroadcastss ymm2, DWORD PTR [reg_p2+4*rdx+4*128] // S
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]
+  vpmuldq    ymm1, ymm1, ymm2                    // a[j+k].S
+  
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm14, ymm0                   // c0
+  vpsrad     ymm3, ymm3, 12                      // c1
+  vpslld     ymm4, ymm0, 1                       // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                    // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                    // U = 3*c0-c1    
+  
+  vmovdqu    ymm3, ymm1
+  vpand      ymm1, ymm14, ymm1                   // c0
+  vpsrlq     ymm4, ymm3, 24                      // c2
+  vpsrad     ymm3, ymm3, 12                      // xc1
+  vpand      ymm3, ymm14, ymm3                   // c1
+  vpslld     ymm5, ymm1, 3                       // 8*c0
+  vpaddd     ymm4, ymm1, ymm4                    // c0+c2
+  vpaddd     ymm4, ymm4, ymm5                    // 9*c0+c2
+  vpslld     ymm5, ymm3, 1                       // 2*c1
+  vpaddd     ymm1, ymm0, ymm3                    // U+c1
+  vpsubd     ymm0, ymm0, ymm3                    // U-c1
+  vpsubd     ymm4, ymm4, ymm5                    // 9*c0-2*c1+c2
+  vpaddd     ymm0, ymm0, ymm4                    // U+(9*c0-3*c1+c2)
+  vpsubd     ymm1, ymm1, ymm4                    // U-(9*c0-3*c1+c2)
+  vpermd     ymm0, ymm12, ymm0 
+  vpermd     ymm1, ymm12, ymm1 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm1
+
+  add        r10, r13        // j+8
+  inc        rdx             // i+1
+  cmp        rdx, r9
+  jl         loop6
+
+// Stage m=256 
+  vmovdqu    ymm9, PERM02134657  
+  shl        r9, 1
+  xor        rdx, rdx         // i = 0
+  xor        r10, r10         // j1 = 0
+  mov        r14, 32
+loop7:
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256]    // S = psi[m+i]->psi[m+i+3]
+  vpermq     ymm8, ymm2, 0x50   
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]->a[j+3]
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]->a[j+k+3]
+  vpermq     ymm3, ymm0, 0x4e    
+  vinserti128 ymm0, ymm0, xmm1, 1                // U
+  vpblendd   ymm1, ymm1, ymm3, 15
+  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm4, ymm4, 12                      // c1
+  vpslld     ymm5, ymm3, 1                       // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
+  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
+  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0 
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
+  
+  vpermq     ymm8, ymm2, 0xfa   
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+3]
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]->a[j+k+3]
+  vpermq     ymm3, ymm0, 0x4e    
+  vinserti128 ymm0, ymm0, xmm1, 1                // U
+  vpblendd   ymm1, ymm1, ymm3, 15
+  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm4, ymm4, 12                      // c1
+  vpslld     ymm5, ymm3, 1                       // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
+  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
+  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0 
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10+32], ymm0
+
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256+16]  // S = psi[m+i]->psi[m+i+3] 
+  vpermq     ymm8, ymm2, 0x50   
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+3]
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+80] // a[j+k]->a[j+k+3]
+  vpermq     ymm3, ymm0, 0x4e    
+  vinserti128 ymm0, ymm0, xmm1, 1                // U
+  vpblendd   ymm1, ymm1, ymm3, 15
+  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm4, ymm4, 12                      // c1
+  vpslld     ymm5, ymm3, 1                       // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
+  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
+  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0 
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10+64], ymm0
+          
+  vpermq     ymm8, ymm2, 0xfa   
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10+96]  // U = a[j]->a[j+3]
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+112] // a[j+k]->a[j+k+3]
+  vpermq     ymm3, ymm0, 0x4e    
+  vinserti128 ymm0, ymm0, xmm1, 1                // U
+  vpblendd   ymm1, ymm1, ymm3, 15
+  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm4, ymm4, 12                      // c1
+  vpslld     ymm5, ymm3, 1                       // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
+  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
+  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0 
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10+96], ymm0
+         
+  add        r10, r14        // j+32
+  add        rdx, r13        // i+8
+  cmp        rdx, r9
+  jl         loop7
+
+// Stage m=512
+  vmovdqu    ymm9, PERM00224466
+  shl        r9, 1            // m = n/2 
+  xor        rdx, rdx         // i = 0
+  xor        r10, r10         // j1 = 0
+  mov        r14, 4
+loop8:
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*rdx+4*512] // S
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10]    // U = a[j]
+  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*r10+4]  // a[j+k]
+  vpmuldq    ymm3, ymm1, ymm2                    // a[j+k].S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm4, ymm4, 12                      // c1
+  vpslld     ymm5, ymm3, 1                       // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
+  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
+  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
+  vpermd     ymm1, ymm9, ymm1 
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
+  
+  add        r10, r13        // j+8
+  add        rdx, r14        // i+4
+  cmp        rdx, r9
+  jl         loop8
+
+  pop        r14
+  pop        r13
+  pop        r12
+  ret
+
+
+//***********************************************************************
+//  Inverse NTT (in place; every butterfly produces U+V and (U-V)*S)
+//  Operation: a [reg_p1] <- INTT(a) [reg_p1], 
+//             [reg_p2] points to table of 32-bit twiddle factors S
+//             reg_p3 and reg_p4 hold the scaling constants by value (moved to xmm0 and broadcast, never dereferenced) and
+//             reg_p5 contains parameter n (stage comments and fixed offsets such as 4*512 assume n = 1024)
+//*********************************************************************** 
+.global INTT_GS_rev2std_12289_asm
+INTT_GS_rev2std_12289_asm:
+  push       r12                                      // save the registers overwritten below (popped before ret)
+  push       r13
+  push       r14
+  push       r15
+  push       rbx
+
+// Stage m=1024
+  vmovdqu    ymm9, PERM00224466                       // dword permutation constant (defined elsewhere)
+  vmovdqu    ymm14, MASK12x8                          // mask that extracts c0 (defined elsewhere; presumably low 12 bits per lane)
+  mov        r12, reg_p5                              // r12 = n
+  shr        r12, 1          // n/2 = 512
+  xor        r15, r15        // i = 0
+  xor        r10, r10        // j1 = 0
+  mov        r13, 8          // step for j
+  mov        r14, 4          // step for i
+loop1b:
+  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*r10+4]       // V = a[j+k]    
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10]         // U = a[j]
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*r15+4*512]   // S
+  vpsubd     ymm3, ymm0, ymm1                         // U - V
+  vpaddd     ymm0, ymm0, ymm1                         // U + V 
+  vpmuldq    ymm3, ymm3, ymm2                         // (U - V).S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm4, ymm4, 12                           // c1
+  vpslld     ymm5, ymm3, 1                            // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
+  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1 
+  vpermd     ymm1, ymm9, ymm1                         // reorder reduced products with PERM00224466
+  vpblendd   ymm0, ymm0, ymm1, 0xaa                   // odd dwords from ymm1, even dwords keep U + V
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
+
+  add        r10, r13        // j+8
+  add        r15, r14        // i+4
+  cmp        r15, r12
+  jl         loop1b
+  
+// Stage m=512 
+  vmovdqu    ymm9, PERM02134657                       // output dword order for this stage (defined elsewhere)
+  vmovdqu    ymm13, PERM0145                          // dword selection used to build U (defined elsewhere)
+  vmovdqu    ymm15, PERM2367                          // dword selection used to build V (defined elsewhere)
+  shr        r12, 1          // n/4 = 256
+  xor        r15, r15        // i = 0
+  xor        r10, r10        // j1 = 0
+  mov        r14, 32         // step for j
+loop2b:
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*r15+4*256]   // S = psi[m+i]->psi[m+i+3]
+  vpermq     ymm8, ymm2, 0x50                         // qwords S0,S0,S1,S1
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10]         // U = a[j]->a[j+7]
+  vpermd     ymm1, ymm15, ymm0                        // V: dwords selected by PERM2367
+  vpermd     ymm0, ymm13, ymm0                        // U: dwords selected by PERM0145
+  vpsubd     ymm3, ymm0, ymm1                         // U - V
+  vpaddd     ymm0, ymm0, ymm1                         // U + V 
+  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm4, ymm4, 12                           // c1
+  vpslld     ymm5, ymm3, 1                            // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
+  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
+  vpslldq    ymm1, ymm1, 4                            // shift each 128-bit half left by one dword
+  vpblendd   ymm0, ymm0, ymm1, 0xaa                   // odd dwords from ymm1, even dwords from U + V
+  vpermd     ymm0, ymm9, ymm0                         // restore memory order with PERM02134657
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
+  
+  vpermq     ymm8, ymm2, 0xfa                         // qwords S2,S2,S3,S3
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10+32]      // U = a[j]->a[j+7]
+  vpermd     ymm1, ymm15, ymm0 
+  vpermd     ymm0, ymm13, ymm0  
+  vpsubd     ymm3, ymm0, ymm1                         // U - V
+  vpaddd     ymm0, ymm0, ymm1                         // U + V 
+  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm4, ymm4, 12                           // c1
+  vpslld     ymm5, ymm3, 1                            // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
+  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10+32], ymm0
+
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*r15+4*256+16]// S = psi[m+i+4]->psi[m+i+7] (next four table entries)
+  vpermq     ymm8, ymm2, 0x50                         // qwords S0,S0,S1,S1 of this second group
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10+64]      // U = a[j]->a[j+7]
+  vpermd     ymm1, ymm15, ymm0 
+  vpermd     ymm0, ymm13, ymm0  
+  vpsubd     ymm3, ymm0, ymm1                         // U - V
+  vpaddd     ymm0, ymm0, ymm1                         // U + V 
+  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm4, ymm4, 12                           // c1
+  vpslld     ymm5, ymm3, 1                            // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
+  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10+64], ymm0
+         
+  vpermq     ymm8, ymm2, 0xfa                         // qwords S2,S2,S3,S3 of this second group
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10+96]      // U = a[j]->a[j+7]
+  vpermd     ymm1, ymm15, ymm0 
+  vpermd     ymm0, ymm13, ymm0  
+  vpsubd     ymm3, ymm0, ymm1                         // U - V
+  vpaddd     ymm0, ymm0, ymm1                         // U + V 
+  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm4, ymm4, 12                           // c1
+  vpslld     ymm5, ymm3, 1                            // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
+  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10+96], ymm0
+         
+  add        r10, r14        // j+32
+  add        r15, r13        // i+8
+  cmp        r15, r12
+  jl         loop2b
+     
+// Stage m=256 
+  vmovdqu    ymm12, PERM0246                          // permutation applied before each 128-bit store (defined elsewhere)
+  shr        r12, 1          // n/8 = 128
+  xor        r15, r15        // i = 0
+  xor        r10, r10        // j1 = 0
+loop3b:
+  vbroadcastss ymm2, DWORD PTR [reg_p2+4*r15+4*128]   // S
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+16]      // V = a[j+k]
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
+  vpsubd     ymm3, ymm0, ymm1                         // U - V
+  vpaddd     ymm0, ymm0, ymm1                         // U + V 
+  vpmuldq    ymm3, ymm3, ymm2                         // (U - V).S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm4, ymm4, 12                           // c1
+  vpslld     ymm5, ymm3, 1                            // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
+  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1 
+  vpermd     ymm0, ymm12, ymm0                        // PERM0246 permute; only the low 128 bits are stored below
+  vpermd     ymm1, ymm12, ymm1 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm1
+  
+  add        r10, r13        // j+8
+  inc        r15             // i+1
+  cmp        r15, r12
+  jl         loop3b
+     
+// Stage m=128
+  shr        r12, 1          // n/16 = 64
+  xor        r15, r15        // i = 0
+  xor        r10, r10        // j1 = 0
+  mov        r14, 16         // step for j (also used by loop7b below)
+loop4b:
+  vbroadcastss ymm11, DWORD PTR [reg_p2+4*r15+4*64]   // S
+  vpmovsxdq  ymm13, XMMWORD PTR [reg_p1+4*r10+32]     // V = a[j+k]
+  vpmovsxdq  ymm15, XMMWORD PTR [reg_p1+4*r10+48]     // V = a[j+k]
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16]      // U = a[j]
+  vpsubd     ymm1, ymm0, ymm13                        // U - V
+  vpaddd     ymm0, ymm0, ymm13                        // U + V 
+  vpsubd     ymm3, ymm2, ymm15                        // U - V
+  vpaddd     ymm2, ymm2, ymm15                        // U + V   
+  vpmuldq    ymm1, ymm1, ymm11                        // (U - V).S
+  vpmuldq    ymm3, ymm3, ymm11                        // (U - V).S
+  
+  vmovdqu    ymm13, ymm1
+  vpand      ymm1, ymm14, ymm1                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm1, 1                           // 2*c0
+  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
+  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1    
+
+  vmovdqu    ymm13, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm3, 1                           // 2*c0
+  vpsubd     ymm13, ymm3, ymm13                       // c0-c1
+  vpaddd     ymm3, ymm13, ymm15                       // 3*c0-c1 
+  
+  vpermd     ymm0, ymm12, ymm0 
+  vpermd     ymm1, ymm12, ymm1 
+  vpermd     ymm2, ymm12, ymm2 
+  vpermd     ymm3, ymm12, ymm3 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm1
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm3
+  
+  add        r10, r14        // j+16 
+  inc        r15             // i+1
+  cmp        r15, r12
+  jl         loop4b
+  
+// Stages m=64 -> m=4  
+  mov        r9, 5            // 5 iterations
+  mov        rax, 8           // k; doubled at the top of every pass, so the first pass uses k = 16
+loop5b:
+  shl        rax, 1          // k = 2*k
+  shr        r12, 1          // m/2
+  xor        r15, r15        // i = 0
+  xor        r8, r8          // j1 = 0
+loop6b:
+  mov        r10, r8         // Load j1
+  mov        r11, rax
+  dec        r11
+  add        r11, r10        // j2
+  mov        r13, r12
+  add        r13, r15        // m/2+i
+  vbroadcastss ymm9, DWORD PTR [reg_p2+4*r13]         // S
+  mov        rbx, 4          // value of r9 that triggers the extra reductions (rbx is reused as 2*k after loop7b)
+
+loop7b:
+  mov        r13, r10
+  add        r13, rax         // j+k
+  vpmovsxdq  ymm10, XMMWORD PTR [reg_p1+4*r13]        // V = a[j+k]
+  vpmovsxdq  ymm11, XMMWORD PTR [reg_p1+4*r13+16]     // V = a[j+k]
+  vpmovsxdq  ymm13, XMMWORD PTR [reg_p1+4*r13+32]     // V = a[j+k]
+  vpmovsxdq  ymm15, XMMWORD PTR [reg_p1+4*r13+48]     // V = a[j+k]
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16]      // U = a[j]
+  vpmovsxdq  ymm4, XMMWORD PTR [reg_p1+4*r10+32]      // U = a[j]
+  vpmovsxdq  ymm6, XMMWORD PTR [reg_p1+4*r10+48]      // U = a[j]
+  
+  vpsubd     ymm1, ymm0, ymm10                        // U - V
+  vpaddd     ymm0, ymm0, ymm10                        // U + V 
+  vpsubd     ymm3, ymm2, ymm11                        // U - V
+  vpaddd     ymm2, ymm2, ymm11                        // U + V 
+  vpsubd     ymm5, ymm4, ymm13                        // U - V
+  vpaddd     ymm4, ymm4, ymm13                        // U + V 
+  vpsubd     ymm7, ymm6, ymm15                        // U - V
+  vpaddd     ymm6, ymm6, ymm15                        // U + V 
+
+  vpmuldq    ymm1, ymm1, ymm9                         // (U - V).S
+  vpmuldq    ymm3, ymm3, ymm9                         // (U - V).S
+  vpmuldq    ymm5, ymm5, ymm9                         // (U - V).S
+  vpmuldq    ymm7, ymm7, ymm9                         // (U - V).S
+  
+  vmovdqu    ymm13, ymm1
+  vpand      ymm1, ymm14, ymm1                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm1, 1                           // 2*c0
+  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
+  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1 
+
+  cmp        r9, rbx                                  // only in the pass where r9 == 4:
+  jne        skip1                                    //   apply one more reduction to U + V and to (U - V).S
+  vmovdqu    ymm13, ymm0
+  vpand      ymm0, ymm14, ymm0                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1       
+  vpslld     ymm15, ymm0, 1                           // 2*c0
+  vpsubd     ymm13, ymm0, ymm13                       // c0-c1
+  vpaddd     ymm0, ymm13, ymm15                       // 3*c0-c1
+
+  vmovdqu    ymm13, ymm1
+  vpand      ymm1, ymm14, ymm1                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm1, 1                           // 2*c0
+  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
+  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1
+skip1:
+  vpermd     ymm1, ymm12, ymm1 
+  vpermd     ymm0, ymm12, ymm0 
+
+  vmovdqu    ymm13, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm3, 1                           // 2*c0
+  vpsubd     ymm13, ymm3, ymm13                       // c0-c1
+  vpaddd     ymm3, ymm13, ymm15                       // 3*c0-c1 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13], xmm1 
+
+  cmp        r9, rbx                                  // same r9 == 4 gate for the second group of four
+  jne        skip2
+  vmovdqu    ymm13, ymm2
+  vpand      ymm2, ymm14, ymm2                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1       
+  vpslld     ymm15, ymm2, 1                           // 2*c0
+  vpsubd     ymm13, ymm2, ymm13                       // c0-c1
+  vpaddd     ymm2, ymm13, ymm15                       // 3*c0-c1
+
+  vmovdqu    ymm13, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm3, 1                           // 2*c0
+  vpsubd     ymm13, ymm3, ymm13                       // c0-c1
+  vpaddd     ymm3, ymm13, ymm15                       // 3*c0-c1
+skip2:
+  vpermd     ymm3, ymm12, ymm3 
+  vpermd     ymm2, ymm12, ymm2 
+
+  vmovdqu    ymm13, ymm5
+  vpand      ymm5, ymm14, ymm5                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm5, 1                           // 2*c0
+  vpsubd     ymm13, ymm5, ymm13                       // c0-c1
+  vpaddd     ymm5, ymm13, ymm15                       // 3*c0-c1 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13+16], xmm3 
+
+  cmp        r9, rbx                                  // same r9 == 4 gate for the third group of four
+  jne        skip3
+  vmovdqu    ymm13, ymm4
+  vpand      ymm4, ymm14, ymm4                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1       
+  vpslld     ymm15, ymm4, 1                           // 2*c0
+  vpsubd     ymm13, ymm4, ymm13                       // c0-c1
+  vpaddd     ymm4, ymm13, ymm15                       // 3*c0-c1
+
+  vmovdqu    ymm13, ymm5
+  vpand      ymm5, ymm14, ymm5                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm5, 1                           // 2*c0
+  vpsubd     ymm13, ymm5, ymm13                       // c0-c1
+  vpaddd     ymm5, ymm13, ymm15                       // 3*c0-c1
+skip3:
+  vpermd     ymm5, ymm12, ymm5 
+  vpermd     ymm4, ymm12, ymm4 
+
+  vmovdqu    ymm13, ymm7
+  vpand      ymm7, ymm14, ymm7                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm7, 1                           // 2*c0
+  vpsubd     ymm13, ymm7, ymm13                       // c0-c1
+  vpaddd     ymm7, ymm13, ymm15                       // 3*c0-c1 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm4
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13+32], xmm5  
+
+  cmp        r9, rbx                                  // same r9 == 4 gate for the fourth group of four
+  jne        skip4
+  vmovdqu    ymm13, ymm6
+  vpand      ymm6, ymm14, ymm6                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1       
+  vpslld     ymm15, ymm6, 1                           // 2*c0
+  vpsubd     ymm13, ymm6, ymm13                       // c0-c1
+  vpaddd     ymm6, ymm13, ymm15                       // 3*c0-c1
+
+  vmovdqu    ymm13, ymm7
+  vpand      ymm7, ymm14, ymm7                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm7, 1                           // 2*c0
+  vpsubd     ymm13, ymm7, ymm13                       // c0-c1
+  vpaddd     ymm7, ymm13, ymm15                       // 3*c0-c1
+skip4:
+  vpermd     ymm7, ymm12, ymm7 
+  vpermd     ymm6, ymm12, ymm6   
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13+48], xmm7
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm6
+  
+  add        r10, r14        // j+16 (r14 still holds 16 from stage m=128)
+  cmp        r10, r11        // continue while j < j2
+  jl         loop7b
+  mov        rbx, rax
+  shl        rbx, 1          // 2*k
+  add        r8, rbx         // j1+2*k
+  inc        r15             // i+1
+  cmp        r15, r12
+  jl         loop6b
+  dec        r9              // one pass done
+  jnz        loop5b
+       
+// Scaling step
+  shl        rax, 1          // k = 2*k = 512
+  xor        r10, r10        // j = 0
+  mov        r14, 4          // step for j
+  movq       xmm0, reg_p3                             // constant is taken from the register itself, not loaded from memory
+  vbroadcastsd ymm10, xmm0                            // S = omegainv1N_rev
+  movq       xmm0, reg_p4                             // likewise passed by value
+  vbroadcastsd ymm11, xmm0                            // T = Ninv
+loop8b:
+  vpmovsxdq  ymm13, XMMWORD PTR [reg_p1+4*r10+4*512]  // V = a[j+k]
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
+  vpsubd     ymm1, ymm0, ymm13                        // U - V
+  vpaddd     ymm0, ymm0, ymm13                        // U + V  
+  vpmuldq    ymm1, ymm1, ymm10                        // (U - V).S
+  vpmuldq    ymm0, ymm0, ymm11                        // (U + V).T
+  
+  vmovdqu    ymm13, ymm0
+  vpand      ymm0, ymm14, ymm0                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm0, 1                           // 2*c0
+  vpsubd     ymm13, ymm0, ymm13                       // c0-c1
+  vpaddd     ymm0, ymm13, ymm15                       // 3*c0-c1    
+
+  vmovdqu    ymm13, ymm1
+  vpand      ymm1, ymm14, ymm1                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm1, 1                           // 2*c0
+  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
+  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1 
+  
+  vpermd     ymm0, ymm12, ymm0 
+  vpermd     ymm1, ymm12, ymm1 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+4*512], xmm1
+  
+  add        r10, r14        // j+4 
+  cmp        r10, rax
+  jl         loop8b  
+loop9b:                      // epilogue; no jump in this function targets this label
+  pop        rbx             // restore in reverse order of the pushes at entry
+  pop        r15
+  pop        r14
+  pop        r13
+  pop        r12
+  ret
+
+
+//***********************************************************************
+//  Component-wise multiplication and addition, followed by two reduction rounds (3*c0-c1)
+//  Operation: d [reg_p4] <- a [reg_p1] * b [reg_p2] + c [reg_p3]
+//             reg_p5 contains parameter n (4 elements per iteration; the body runs once before n is tested)
+//*********************************************************************** 
+.global pmuladd_asm
+pmuladd_asm:
+  vmovdqu    ymm5, PERM0246                     // permutation applied before the 128-bit store (defined elsewhere)
+  vmovdqu    ymm6, MASK12x8                     // mask that extracts c0 (defined elsewhere; presumably low 12 bits per lane)
+  xor        rax, rax                           // j = 0
+  movq       r11, 4                             // step for j
+lazo2:
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*rax]   // a
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p2+4*rax]   // b
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p3+4*rax]   // c
+  vpmuldq    ymm0, ymm1, ymm0                   // a*b as signed 64-bit products
+  vpaddq     ymm0, ymm2, ymm0                   // a*b + c
+
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm6, ymm0                   // c0
+  vpsrlq     ymm3, ymm3, 12                     // c1 (logical shift of the 64-bit value)
+  vpslld     ymm4, ymm0, 1                      // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1 
+
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm6, ymm0                   // c0
+  vpsrad     ymm3, ymm3, 12                     // c1 (arithmetic shift of the 32-bit result of round one)
+  vpslld     ymm4, ymm0, 1                      // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1
+
+  vpermd     ymm0, ymm5, ymm0                   // PERM0246 permute; only the low 128 bits are stored
+  vmovdqu    XMMWORD PTR [reg_p4+4*rax], xmm0   // d[j] -> d[j+3]
+
+  add        rax, r11                           // j+4
+  cmp        rax, reg_p5
+  jl         lazo2
+  ret
+
+
+//***********************************************************************
+//  Component-wise multiplication, followed by two reduction rounds (3*c0-c1)
+//  Operation: c [reg_p3] <- a [reg_p1] * b [reg_p2]
+//             reg_p4 contains parameter n (4 elements per iteration; the body runs once before n is tested)
+//*********************************************************************** 
+.global pmul_asm
+pmul_asm: 
+  vmovdqu    ymm5, PERM0246                     // permutation applied before the 128-bit store (defined elsewhere)
+  vmovdqu    ymm6, MASK12x8                     // mask that extracts c0 (defined elsewhere; presumably low 12 bits per lane)
+  xor        rax, rax                           // j = 0
+  movq       r11, 4                             // step for j
+lazo3:
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*rax]   // a
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p2+4*rax]   // b
+  vpmuldq    ymm0, ymm1, ymm0                   // a*b as signed 64-bit products
+
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm6, ymm0                   // c0
+  vpsrlq     ymm3, ymm3, 12                     // c1 (logical shift of the 64-bit product)
+  vpslld     ymm4, ymm0, 1                      // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1 
+
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm6, ymm0                   // c0
+  vpsrad     ymm3, ymm3, 12                     // c1 (arithmetic shift of the 32-bit result of round one)
+  vpslld     ymm4, ymm0, 1                      // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1
+
+  vpermd     ymm0, ymm5, ymm0                   // PERM0246 permute; only the low 128 bits are stored
+  vmovdqu    XMMWORD PTR [reg_p3+4*rax], xmm0   // c[j] -> c[j+3]
+
+  add        rax, r11                           // j+4
+  cmp        rax, reg_p4
+  jl         lazo3
+  ret
+
+
+//***********************************************************************
+//  Two consecutive reductions (3*c0-c1 twice), then a conditional add/subtract of PRIME8x
+//  Operation: c [reg_p1] <- a [reg_p1]
+//             reg_p2 contains parameter n (8 elements per iteration; the body runs once before n is tested)
+//*********************************************************************** 
+.global two_reduce12289_asm
+two_reduce12289_asm: 
+  vmovdqu    ymm6, MASK12x8                     // mask that extracts c0 (defined elsewhere; presumably low 12 bits per lane)
+  vmovdqu    ymm7, PRIME8x                      // defined elsewhere; presumably the prime in every dword lane
+  xor        rax, rax                           // j = 0
+  movq       r11, 8                             // step for j
+lazo4:
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]   // a
+
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm6, ymm0                   // c0
+  vpsrad     ymm3, ymm3, 12                     // c1
+  vpslld     ymm4, ymm0, 1                      // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1 
+
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm6, ymm0                   // c0
+  vpsrad     ymm3, ymm3, 12                     // c1       
+  vpslld     ymm4, ymm0, 1                      // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1
+
+  vpsrad     ymm2, ymm0, 31                     // all-ones in lanes whose value is negative
+  vpand      ymm2, ymm7, ymm2                   // PRIME8x in those lanes, 0 elsewhere
+  vpaddd     ymm2, ymm0, ymm2                   // add it to the negative lanes
+  vpsubd     ymm0, ymm2, ymm7                   // then subtract PRIME8x from every lane
+
+  vpsrad     ymm2, ymm0, 31                     // sign mask of the new value
+  vpand      ymm2, ymm7, ymm2                   // PRIME8x in lanes that went negative
+  vpaddd     ymm0, ymm0, ymm2                   // add it back to those lanes
+
+  vmovdqu    YMMWORD PTR [reg_p1+4*rax], ymm0   // overwrite a[j] -> a[j+7] in place
+
+  add        rax, r11                           // j+8
+  cmp        rax, reg_p2
+  jl         lazo4
+  ret
+
+
+//***********************************************************************
+//  Encoding: packs a fixed 1024 input dwords, 8 dwords -> 14 output bytes per iteration
+//  Operation: c [reg_p2] <- a [reg_p1]
+//  NOTE(review): the last 16-byte store covers output bytes 1785..1800, past the 128*14 = 1792 bytes produced - confirm the buffer has that slack
+//*********************************************************************** 
+.global encode_asm
+encode_asm: 
+  vmovdqu    ymm6, MASK32                       // bit-field mask (defined elsewhere)
+  vmovdqu    ymm7, MASK42                       // bit-field mask (defined elsewhere)
+  mov        r9, 1024                           // number of input dwords (hard-coded)
+  xor        rax, rax                           // j = 0 (input index, in dwords)
+  xor        r10, r10                           // output offset, in bytes
+  mov        r11, 14                            // output bytes per iteration
+  mov        rcx, 8                             // input dwords per iteration
+lazo5:
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]   // a
+
+  vpsrlq     ymm1, ymm0, 18  
+  vpsllq     ymm2, ymm0, 4
+  vpand      ymm0, ymm0, ymm6
+  vpsrldq    ymm2, ymm2, 5   
+  vpsrlq     ymm3, ymm1, 4
+  vpand      ymm1, ymm1, ymm6
+  vpand      ymm2, ymm2, ymm7
+  vpsrldq    ymm3, ymm3, 4 
+  vpor       ymm0, ymm0, ymm1
+  vpor       ymm0, ymm0, ymm2 
+  vpor       ymm0, ymm0, ymm3                   // merge the shifted pieces
+  vpermq     ymm1, ymm0, 0x0e                   // copy the upper 128-bit half into xmm1
+
+  vmovdqu    XMMWORD PTR [reg_p2+r10], xmm0     // 16-byte store; bytes beyond the packed data are overwritten by later stores
+  vmovdqu    XMMWORD PTR [reg_p2+r10+7], xmm1   // second half starts 7 bytes later and overlaps the first store
+
+  add        r10, r11                           // advance output by 14 bytes
+  add        rax, rcx        // j+8
+  cmp        rax, r9
+  jl         lazo5
+  ret
+
+
+//***********************************************************************
+//  Decoding: unpacks a fixed 1024 output dwords, 14 input bytes -> 8 dwords per iteration
+//  Operation: c [reg_p2] <- a [reg_p1]
+//  NOTE(review): the last 16-byte load reads input bytes 1785..1800, past the 128*14 = 1792 bytes consumed - confirm the source buffer is that long
+//*********************************************************************** 
+.global decode_asm
+decode_asm: 
+  vmovdqu    ymm6, MASK14_1                     // four bit-field masks (defined elsewhere)
+  vmovdqu    ymm7, MASK14_2
+  vmovdqu    ymm8, MASK14_3
+  vmovdqu    ymm9, MASK14_4
+  mov        r9, 1024                           // number of output dwords (hard-coded)
+  xor        rax, rax                           // j = 0 (output index, in dwords)
+  xor        r10, r10                           // input offset, in bytes
+  mov        r11, 14                            // input bytes per iteration
+  mov        rcx, 8                             // output dwords per iteration
+lazo6:
+  vmovdqu    xmm0, XMMWORD PTR [reg_p1+r10]     // 16 bytes starting at the current input offset
+  vmovdqu    xmm1, XMMWORD PTR [reg_p1+r10+7]   // 16 bytes starting 7 bytes later (overlapping load)
+  vinserti128 ymm0, ymm0, xmm1, 1               // low half = first load, high half = second load
+
+  vpand      ymm1, ymm0, ymm6                   // isolate the four bit fields
+  vpand      ymm2, ymm0, ymm7
+  vpand      ymm3, ymm0, ymm8
+  vpand      ymm4, ymm0, ymm9
+   
+  vpsllq     ymm2, ymm2, 18                     // move each field to its position in the output
+  vpsllq     ymm3, ymm3, 4
+  vpslldq    ymm3, ymm3, 4 
+  vpsrlq     ymm4, ymm4, 2
+  vpslldq    ymm4, ymm4, 7
+
+  vpor       ymm1, ymm1, ymm2                   // merge the fields
+  vpor       ymm1, ymm1, ymm3 
+  vpor       ymm1, ymm1, ymm4 
+  
+  vmovdqu    YMMWORD PTR [reg_p2+4*rax], ymm1   // c[j] -> c[j+7]
+
+  add        r10, r11                           // advance input by 14 bytes
+  add        rax, rcx            // j+8
+  cmp        rax, r9
+  jl         lazo6
   ret
\ No newline at end of file
diff --git a/dap-sdk/crypto/src/msrln/kex.c b/dap-sdk/crypto/src/msrln/kex.c
index e2c6b317ecd19cf9acb0f7ed66be56826b10d9a2..99a8db39624b285185b2af30be2605c682b9359a 100755
--- a/dap-sdk/crypto/src/msrln/kex.c
+++ b/dap-sdk/crypto/src/msrln/kex.c
@@ -1,645 +1,642 @@
-#include "msrln_priv.h"
-#if (OS_TARGET == OS_MACOS)
-    #include <stdio.h>
-#else
-    #include <malloc.h>
-#endif
-
-#include "KeccakHash.h"
-#include "SimpleFIPS202.h"
-
-
-// N^-1 * prime_scale^-8
-const int32_t MSRLN_Ninv8_ntt1024_12289 = 8350;
-// N^-1 * prime_scale^-7 * omegainv_rev_ntt1024_12289[1]
-const int32_t MSRLN_omegainv7N_rev_ntt1024_12289 = 795;
-// N^-1 * prime_scale^-11
-const int32_t MSRLN_Ninv11_ntt1024_12289 = 2585;
-// N^-1 * prime_scale^-10 * omegainv_rev_ntt1024_12289[1]
-const int32_t MSRLN_omegainv10N_rev_ntt1024_12289 = 10953;
-
-
-// Index-reversed matrices containing powers of psi (psi_rev_nttxxx_yyy) and inverse powers of omega (omegainv_rev_nttxxx_yyy),
-// where xxx is parameter N and yyy is the prime q.
-
-const int32_t MSRLN_psi_rev_ntt1024_12289[1024] = {
-8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201,
-875, 3780, 1607, 4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859,
-7188, 1067, 2401, 11847, 390, 11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626,
-3636, 7351, 9585, 6998, 160, 3149, 4437, 12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042,
-3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178, 1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563,
-7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790, 2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266,
-5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810, 1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934,
-8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863, 10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842,
-11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893, 7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541,
-11336, 3434, 3529, 2908, 12071, 2361, 1843, 3030, 8174, 6147, 9842, 8326, 576, 10335, 10238, 10484, 9407, 11836, 5908, 418, 3772, 7515, 5429, 7552, 10996, 12133, 2767, 3969,
-8298, 6413, 10008, 2031, 5333, 10800, 9789, 10706, 5942, 1263, 49, 5915, 10806, 11939, 10777, 1815, 5383, 3202, 4493, 6920, 10232, 1975, 8532, 2925, 347, 4754, 1858, 11863,
-8974, 9551, 5868, 9634, 5735,11566, 12115, 10596, 3009, 6190, 11994, 6523, 652, 3762, 9370, 4016, 4077, 8561, 4049, 5990, 11130, 11143, 948, 325, 1404, 6992, 6119, 8333,
-10929, 1200, 5184, 2555, 6122, 1594, 10327, 7183, 5961, 2692, 12121, 4298, 3329, 5919, 4433, 8455,7032, 1747, 3123, 3054, 6803, 5782, 10723, 9341, 2503, 683, 2459, 3656,
-64, 4240, 3570, 835, 6065, 4046, 11580, 10970, 3150, 10331, 4322, 2078, 1112, 4079, 11231, 441, 922, 1050, 4536, 6844, 8429, 2683, 11099, 3818, 6171, 8500, 12142, 6833, 4449,
-4749, 6752, 7500, 7822, 8214, 6974, 7965, 7373, 2169, 522, 5079, 3262, 10316, 6715, 1278, 9945, 3514, 11248, 11271, 5925, 468, 3988, 382, 11973, 5339, 6843, 6196, 8579, 2033,
-8291, 1922, 3879, 11035, 973, 6854, 10930, 5206, 6760, 3199, 56, 3565, 654, 1702, 10302, 5862, 6153, 5415, 8646, 11889, 10561, 7341, 6152, 7232, 4698, 8844, 4780, 10240, 4912,
-1321, 12097, 7048, 2920, 3127, 4169, 11502, 3482, 11279, 5468, 5874, 11612, 6055, 8953, 52, 3174, 10966, 9523, 151, 2127, 3957, 2839, 9784, 6383, 1579, 431, 7507, 5886, 3029,
-6695, 4213, 504, 11684, 2302, 8689, 9026, 4624, 6212, 11868, 4080, 6221, 8687, 1003, 8757, 241, 58, 5009, 10333, 885, 6281, 3438, 9445, 11314, 8077, 6608, 3477, 142, 1105,
-8841, 343, 4538, 1908, 1208, 4727, 7078, 10423, 10125, 6873, 11573, 10179, 416, 814, 1705, 2450, 8700, 717, 9307, 1373, 8186, 2429, 10568, 10753, 7228, 11071, 438, 8774, 5993,
-3278, 4209, 6877, 3449, 1136, 3708, 3238, 2926, 1826, 4489, 3171, 8024, 8611, 1928, 464, 3205, 8930, 7080, 1092, 10900, 10221, 11943, 4404, 9126, 4032, 7449, 6127, 8067, 10763,
-125, 540, 8921, 8062, 612, 8051, 12229, 9572, 9089, 10754, 10029, 68, 6453, 7723, 4781, 4924, 1014, 448, 3942, 5232, 1327, 8682, 3744, 7326, 3056, 9761, 5845, 5588, 412, 7187,
-3975, 4883, 3087, 6454, 2257, 7784, 5676, 1417, 8400, 11710, 5596, 5987, 9175, 2769, 5966, 212, 6555, 11113, 5508, 11014, 1125, 4860, 10844, 1131, 4267, 6636, 2275, 9828, 5063,
-4176, 3765, 1518, 8794, 4564, 10224, 5826, 3534, 3961, 4145, 10533, 506, 11034, 6505, 10897, 2674, 10077, 3338, 9013, 3511, 6811, 11111, 2776, 1165, 2575, 8881, 10347, 377,
-4578, 11914, 10669, 10104, 392, 10453, 425, 9489, 193, 2231, 6197, 1038, 11366, 6204, 8122, 2894, 3654, 10975, 10545, 6599, 2455, 11951, 3947, 20, 5002, 5163, 4608, 8946, 8170,
-10138, 1522, 8665, 10397, 3344, 5598, 10964, 6565, 11260, 1945, 11041, 9847, 7174, 4939, 2148, 6330, 3959, 5797, 4913, 3528, 8054, 3825, 8914, 9998, 4335, 8896, 9342, 3982,
-6680, 11653, 7790, 6617, 1737, 622, 10485, 10886, 6195, 7100, 1687, 406, 12143, 5268, 9389, 12050, 994, 7735, 5464, 7383, 4670, 512, 364, 9929, 3028, 5216, 5518, 1226, 7550,
-8038, 7043, 7814, 11053, 3017, 3121, 7584, 2600, 11232, 6780, 12085, 5219, 1409, 9600, 4605, 8151, 12109, 463, 8882, 8308, 10821, 9247, 10945, 9806, 2054, 6203, 6643, 3120,
-6105, 8348, 8536, 6919, 8753, 11007, 8717, 9457, 2021, 9060, 4730, 3929, 10583, 3723, 845, 1936, 7, 5054, 3154, 3285, 4360, 3805, 11522, 2213, 4153, 12239, 12073, 5526, 769,
-4099, 3944, 5604, 5530, 11024, 9282, 2171, 3480, 7434, 8520, 3232, 11996, 9656, 1406, 2945, 5349, 7207, 4590, 11607, 11309, 5202, 844, 7082, 4050, 8016, 9068, 9694, 8452, 7000,
-5662, 567, 2941, 8619, 3808, 4987, 2373, 5135, 63, 7605, 3360, 11839, 10345, 578, 6921, 7628, 510, 5386, 2622, 7806, 5703, 10783, 9224, 11379, 5900, 4719, 11538, 3502, 5789,
-10631, 5618, 826, 5043, 3090, 10891, 9951, 7596, 2293, 11872, 6151, 3469, 4443, 8871, 1555, 1802, 5103, 1891, 1223, 2334, 7878, 1590, 881, 365, 1927, 11274, 4510, 9652, 2946,
-6828, 1280, 614, 10918, 12265, 7250, 6742, 9804, 11385, 2276, 11307, 2593, 879, 7899, 8071, 3454, 8531, 3795, 9021, 5776, 1849, 7766, 7988, 457, 8, 530, 9663, 7785, 11511, 3578,
-7592, 10588, 3466, 8972, 9757, 3332, 139, 2046, 2940, 10808, 9332, 874, 2301, 5650, 12119, 150, 648, 8000, 9982, 9416, 2827, 2434, 11498, 6481, 12268, 9754, 11169, 11823, 11259,
-3821, 10608, 2929, 6263, 4649, 6320, 9687, 10388, 502, 5118, 8496, 6226, 10716, 8443, 7624, 6883, 9269, 6616, 8620, 5287, 944, 7519, 6125, 1882, 11249, 10254, 5410, 1251, 1790,
-5275, 8449, 10447, 4113, 72, 2828, 4352, 7455, 2712, 11048, 7911, 3451, 4094, 6508, 3045, 11194, 2643, 1783, 7211, 4974, 7724, 9811, 9449, 3019, 4194, 2730, 6878, 10421, 2253,
-4518, 9195, 7469, 11129, 9173, 12100, 1763, 2209, 9617, 5170, 865, 1279, 1694, 10759, 8420, 4423, 10555, 3815, 5832, 10939
-};
-
-
-const int32_t MSRLN_omegainv_rev_ntt1024_12289[1024] = {
-8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422,
-6267, 9302, 8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270,
-2678, 8585, 10752, 12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957,
-8779, 1630, 10163, 5407, 3186, 11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372,
-10115, 2847, 4414, 9644, 4053, 7247, 9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300,
-5331, 8705, 4177, 9764, 10908, 11950, 9821, 11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534,
-145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548, 4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567,
-6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023, 2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170,
-10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255, 11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 11184, 12147, 8812, 5681, 4212, 975, 2844, 8851, 6008, 11404,
-1956, 7280, 12231, 12048, 3532, 11286, 3602, 6068, 8209, 421, 6077, 7665, 3263, 3600, 9987, 605, 11785, 8076, 5594, 9260, 6403, 4782, 11858, 10710, 5906, 2505, 9450, 8332, 10162,
-12138, 2766, 1323, 9115, 12237, 3336, 6234, 677, 6415, 6821, 1010, 8807, 787, 8120, 9162, 9369, 5241, 192, 10968, 7377, 2049, 7509, 3445, 7591, 5057, 6137, 4948, 1728, 400, 3643,
-6874, 6136, 6427, 1987, 10587, 11635, 8724, 12233, 9090, 5529, 7083, 1359, 5435, 11316, 1254, 8410, 10367, 3998, 10256, 3710, 6093, 5446, 6950, 316, 11907, 8301, 11821, 6364, 1018,
-1041, 8775, 2344, 11011, 5574, 1973, 9027, 7210, 11767, 10120, 4916, 4324, 5315, 4075, 4467, 4789, 5537, 7540, 7840, 5456, 147, 3789, 6118, 8471, 1190, 9606, 3860, 5445, 7753, 11239,
-11367, 11848, 1058, 8210, 11177, 10211, 7967, 1958, 9139, 1319, 709, 8243, 6224, 11454, 8719, 8049, 12225, 8633, 9830, 11606, 9786, 2948, 1566, 6507, 5486, 9235, 9166, 10542, 5257,
-3834, 7856, 6370, 8960, 7991, 168, 9597, 6328, 5106, 1962, 10695, 6167, 9734, 7105, 11089, 1360, 3956, 6170, 5297, 10885, 11964, 11341, 1146, 1159, 6299, 8240, 3728, 8212, 8273, 2919,
-8527, 11637, 5766, 295, 6099, 9280, 1693, 174, 723, 6554, 2655, 6421, 2738, 3315, 426, 10431, 7535, 11942, 9364, 3757, 10314, 2057, 5369, 7796, 9087, 6906, 10474, 1512, 350, 1483,
-6374, 12240, 11026, 6347, 1583, 2500, 1489, 6956, 10258, 2281, 5876, 3991, 8320, 9522, 156, 1293, 4737, 6860, 4774, 8517, 11871, 6381, 453, 2882, 1805, 2051, 1954, 11713, 3963, 2447,
-6142, 4115, 9259, 10446, 9928, 218, 9381, 8760, 8855, 1350, 6457, 8474, 1734, 7866, 3869, 1530, 10595, 11010, 11424, 7119, 2672, 10080, 10526, 189, 3116, 1160, 4820, 3094, 7771, 10036,
-1868, 5411, 9559, 8095, 9270, 2840, 2478, 4565, 7315, 5078, 10506, 9646, 1095, 9244, 5781, 8195, 8838, 4378, 1241, 9577, 4834, 7937, 9461, 12217, 8176, 1842, 3840, 7014, 10499, 11038,
-6879, 2035, 1040, 10407, 6164, 4770, 11345, 7002, 3669, 5673, 3020, 5406, 4665, 3846, 1573, 6063, 3793, 7171, 11787, 1901, 2602, 5969, 7640, 6026, 9360, 1681, 8468, 1030, 466, 1120,
-2535, 21, 5808, 791, 9855, 9462, 2873, 2307, 4289, 11641, 12139, 170, 6639, 9988, 11415, 2957, 1481, 9349, 10243, 12150, 8957, 2532, 3317, 8823, 1701, 4697, 8711, 778, 4504, 2626,
-11759, 12281, 11832, 4301, 4523, 10440, 6513, 3268, 8494, 3758, 8835, 4218, 4390, 11410, 9696, 982, 10013, 904, 2485, 5547, 5039, 24, 1371, 11675, 11009, 5461, 9343, 2637, 7779, 1015,
-10362, 11924, 11408, 10699, 4411, 9955, 11066, 10398, 7186, 10487, 10734, 3418, 7846, 8820, 6138, 417, 9996, 4693, 2338, 1398, 9199, 7246, 11463, 6671, 1658, 6500, 8787, 751, 7570,
-6389, 910, 3065, 1506, 6586, 4483, 9667, 6903, 11779, 4661, 5368, 11711, 1944, 450, 8929, 4684, 12226, 7154, 9916, 7302, 8481, 3670, 9348, 11722, 6627, 5289, 3837, 2595, 3221, 4273,
-8239, 5207, 11445, 7087, 980, 682, 7699, 5082, 6940, 9344, 10883, 2633, 293, 9057, 3769, 4855, 8809, 10118, 3007, 1265, 6759, 6685, 8345, 8190, 11520, 6763, 216, 50, 8136, 10076, 767,
-8484, 7929, 9004, 9135, 7235, 12282, 10353, 11444, 8566, 1706, 8360, 7559, 3229, 10268, 2832, 3572, 1282, 3536, 5370, 3753, 3941, 6184, 9169, 5646, 6086, 10235, 2483, 1344, 3042, 1468,
-3981, 3407, 11826, 180, 4138, 7684, 2689, 10880, 7070, 204, 5509, 1057, 9689, 4705, 9168, 9272, 1236, 4475, 5246, 4251, 4739, 11063, 6771, 7073, 9261, 2360, 11925, 11777, 7619, 4906,
-6825, 4554, 11295, 239, 2900, 7021, 146, 11883, 10602, 5189, 6094, 1403, 1804, 11667, 10552, 5672, 4499, 636, 5609, 8307, 2947, 3393, 7954, 2291, 3375, 8464, 4235, 8761, 7376, 6492,
-8330, 5959, 10141, 7350, 5115, 2442, 1248, 10344, 1029, 5724, 1325, 6691, 8945, 1892, 3624, 10767, 2151, 4119, 3343, 7681, 7126, 7287, 12269, 8342, 338, 9834, 5690, 1744, 1314, 8635,
-9395, 4167, 6085, 923, 11251, 6092, 10058, 12096, 2800, 11864, 1836, 11897, 2185, 1620, 375, 7711, 11912, 1942, 3408, 9714, 11124, 9513, 1178, 5478, 8778, 3276, 8951, 2212, 9615, 1392,
-5784, 1255, 11783, 1756, 8144, 8328, 8755, 6463, 2065, 7725, 3495, 10771, 8524, 8113, 7226, 2461, 10014, 5653, 8022, 11158, 1445, 7429, 11164, 1275, 6781, 1176, 5734, 12077, 6323, 9520,
-3114, 6302, 6693, 579, 3889, 10872, 6613, 4505, 10032, 5835, 9202, 7406, 8314, 5102, 11877, 6701, 6444, 2528, 9233, 4963, 8545, 3607, 10962, 7057, 8347, 11841, 11275, 7365, 7508, 4566,
-5836, 12221, 2260, 1535, 3200, 2717, 60, 4238, 11677, 4227, 3368, 11749, 12164, 1526, 4222, 6162, 4840, 8257, 3163, 7885, 346, 2068, 1389, 11197, 5209, 3359, 9084, 11825, 10361, 3678,
-4265, 9118, 7800, 10463, 9363, 9051, 8581, 11153, 8840, 5412, 8080, 9011, 6296, 3515, 11851, 1218, 5061, 1536, 1721, 9860, 4103, 10916, 2982, 11572, 3589, 9839, 10584, 11475, 11873,
-2110, 716, 5416, 2164, 1866, 5211, 7562, 11081, 10381, 7751, 11946, 3448
-};
-
-
-const int32_t MSRLN_psi_rev_ntt512_12289[512] = {
-8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201, 875, 3780, 1607,
-4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859, 7188, 1067, 2401, 11847, 390,
-11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626, 3636, 7351, 9585, 6998, 160, 3149, 4437,
-12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042, 3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178,
-1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563, 7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790,
-2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266, 5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810,
-1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934, 8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863,
-10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842, 11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893,
-7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541, 11336, 8855, 8760, 9381, 218, 9928, 10446, 9259, 4115, 6142, 2447, 3963, 11713, 1954, 2051, 1805, 2882, 453, 6381, 11871, 8517,
-4774, 6860, 4737, 1293, 156, 9522, 8320, 3991, 5876, 2281, 10258, 6956, 1489, 2500, 1583, 6347, 11026, 12240, 6374, 1483, 350, 1512, 10474, 6906, 9087, 7796, 5369, 2057, 10314, 3757,
-9364, 11942, 7535, 10431, 426, 3315, 2738, 6421, 2655, 6554, 723, 174, 1693, 9280, 6099, 295, 5766, 11637, 8527, 2919, 8273, 8212, 3728, 8240, 6299, 1159, 1146, 11341, 11964, 10885, 5297,
-6170, 3956, 1360, 11089, 7105, 9734, 6167, 10695, 1962, 5106, 6328, 9597, 168, 7991, 8960, 6370, 7856, 3834, 5257, 10542, 9166, 9235, 5486, 6507, 1566, 2948, 9786, 11606, 9830, 8633,
-12225, 8049, 8719, 11454, 6224, 8243, 709, 1319, 9139, 1958, 7967, 10211, 11177, 8210, 1058, 11848, 11367, 11239, 7753, 5445, 3860, 9606, 1190, 8471, 6118, 3789, 147, 5456, 7840, 7540,
-5537, 4789, 4467, 4075, 5315, 4324, 4916, 10120, 11767, 7210, 9027, 1973, 5574, 11011, 2344, 8775, 1041, 1018, 6364, 11821, 8301, 11907, 316, 6950, 5446, 6093, 3710, 10256, 3998, 10367,
-8410, 1254, 11316, 5435, 1359, 7083, 5529, 9090, 12233, 8724, 11635, 10587, 1987, 6427, 6136, 6874, 3643, 400, 1728, 4948, 6137, 5057, 7591, 3445, 7509, 2049, 7377, 10968, 192, 5241, 9369,
-9162, 8120, 787, 8807, 1010, 6821, 6415, 677, 6234, 3336, 12237, 9115, 1323, 2766, 12138, 10162, 8332, 9450, 2505, 5906, 10710, 11858, 4782, 6403, 9260, 5594, 8076, 11785, 605, 9987, 3600,
-3263, 7665, 6077, 421, 8209, 6068, 3602, 11286, 3532, 12048, 12231, 7280, 1956, 11404, 6008, 8851, 2844, 975, 4212, 5681, 8812, 12147, 11184
-};
-
-
-const int32_t MSRLN_omegainv_rev_ntt512_12289[512] = {
-8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422, 6267, 9302,
-8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270, 2678, 8585, 10752,
-12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957, 8779, 1630, 10163, 5407, 3186,
-11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372, 10115, 2847, 4414, 9644, 4053, 7247,
-9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300, 5331, 8705, 4177, 9764, 10908, 11950, 9821,
-11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534, 145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548,
-4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567, 6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023,
-2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170, 10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255,
-11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 1105, 142, 3477, 6608, 8077, 11314, 9445, 3438, 6281, 885, 10333, 5009, 58, 241, 8757, 1003, 8687, 6221, 4080, 11868, 6212, 4624,
-9026, 8689, 2302, 11684, 504, 4213, 6695, 3029, 5886, 7507, 431, 1579, 6383, 9784, 2839, 3957, 2127, 151, 9523, 10966, 3174, 52, 8953, 6055, 11612, 5874, 5468, 11279, 3482, 11502, 4169,
-3127, 2920, 7048, 12097, 1321, 4912, 10240, 4780, 8844, 4698, 7232, 6152, 7341, 10561, 11889, 8646, 5415, 6153, 5862, 10302, 1702, 654, 3565, 56, 3199, 6760, 5206, 10930, 6854, 973, 11035,
-3879, 1922, 8291, 2033, 8579, 6196, 6843, 5339, 11973, 382, 3988, 468, 5925, 11271, 11248, 3514, 9945, 1278, 6715, 10316, 3262, 5079, 522, 2169, 7373, 7965, 6974, 8214, 7822, 7500, 6752,
-4749, 4449, 6833, 12142, 8500, 6171, 3818, 11099, 2683, 8429, 6844, 4536, 1050, 922, 441, 11231, 4079, 1112, 2078, 4322, 10331, 3150, 10970, 11580, 4046, 6065, 835, 3570, 4240, 64, 3656,
-2459, 683, 2503, 9341, 10723, 5782, 6803, 3054, 3123, 1747, 7032, 8455, 4433, 5919, 3329, 4298, 12121, 2692, 5961, 7183, 10327, 1594, 6122, 2555, 5184, 1200, 10929, 8333, 6119, 6992, 1404,
-325, 948, 11143, 11130, 5990, 4049, 8561, 4077, 4016, 9370, 3762, 652, 6523, 11994, 6190, 3009, 10596, 12115, 11566, 5735, 9634, 5868, 9551, 8974, 11863, 1858, 4754, 347, 2925, 8532, 1975,
-10232, 6920, 4493, 3202, 5383, 1815, 10777, 11939, 10806, 5915, 49, 1263, 5942, 10706, 9789, 10800, 5333, 2031, 10008, 6413, 8298, 3969, 2767, 12133, 10996, 7552, 5429, 7515, 3772, 418, 5908,
-11836, 9407, 10484, 10238, 10335, 576, 8326, 9842, 6147, 8174, 3030, 1843, 2361, 12071, 2908, 3529, 3434
-};
-
-// import external code
-#ifdef RLWE_ASM_AVX2
-    #include "AMD64/consts.c"
-    #include "AMD64/ntt_x64.c"
-#else
-    #include "generic/ntt.c"
-#endif
-
-__inline void clear_words(void* mem, digit_t nwords)
-{ // Clear digits from memory. "nwords" indicates the number of digits to be zeroed.
-  // This function uses the volatile type qualifier to inform the compiler not to optimize out the memory clearing.
-    unsigned int i;
-    volatile digit_t *v = mem; 
-
-    for (i = 0; i < nwords; i++) {
-        v[i] = 0;
-    }
-}
-
-
-CRYPTO_MSRLN_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction)
-{ // Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction.
-
-    pLatticeCrypto->RandomBytesFunction = RandomBytesFunction;
-    pLatticeCrypto->ExtendableOutputFunction = ExtendableOutputFunction;
-    pLatticeCrypto->StreamOutputFunction = StreamOutputFunction;
-
-    return CRYPTO_MSRLN_SUCCESS;
-}
-
-
-PLatticeCryptoStruct LatticeCrypto_allocate()
-{ // Dynamic allocation of memory for LatticeCrypto structure. It should be called before initialization with LatticeCrypto_initialize(). 
-  // Returns NULL on error.
-    PLatticeCryptoStruct LatticeCrypto = NULL;
-
-    LatticeCrypto = (PLatticeCryptoStruct)calloc(1, sizeof(LatticeCryptoStruct));
-
-    if (LatticeCrypto == NULL) {
-        return NULL;
-    }
-    return LatticeCrypto;
-}
-
-
-const char* LatticeCrypto_get_error_message(CRYPTO_MSRLN_STATUS Status)
-{ // Output error/success message for a given CRYPTO_STATUS
-    struct error_mapping {
-        unsigned int index;
-        char*        string;
-    } mapping[CRYPTO_STATUS_TYPE_SIZE] = {
-        {CRYPTO_MSRLN_SUCCESS, CRYPTO_MSG_SUCCESS},
-        {CRYPTO_MSRLN_ERROR, CRYPTO_MSG_ERROR},
-        {CRYPTO_MSRLN_ERROR_DURING_TEST, CRYPTO_MSG_ERROR_DURING_TEST},
-        {CRYPTO_MSRLN_ERROR_UNKNOWN, CRYPTO_MSG_ERROR_UNKNOWN},
-        {CRYPTO_MSRLN_ERROR_NOT_IMPLEMENTED, CRYPTO_MSG_ERROR_NOT_IMPLEMENTED},
-        {CRYPTO_MSRLN_ERROR_NO_MEMORY, CRYPTO_MSG_ERROR_NO_MEMORY},
-        {CRYPTO_MSRLN_ERROR_INVALID_PARAMETER, CRYPTO_MSG_ERROR_INVALID_PARAMETER},
-        {CRYPTO_MSRLN_ERROR_SHARED_KEY, CRYPTO_MSG_ERROR_SHARED_KEY},
-        {CRYPTO_MSRLN_ERROR_TOO_MANY_ITERATIONS, CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS}
-    };
-
-    if (Status >= CRYPTO_STATUS_TYPE_SIZE || mapping[Status].string == NULL) {
-        return "Unrecognized CRYPTO_STATUS";
-    } else {
-        return mapping[Status].string;
-    }
-};
-
-
-void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m)
-{ // Alice's message encoding
-    unsigned int i = 0, j;
-        
-#if defined(GENERIC_IMPLEMENTATION)
-    for (j = 0; j < 1024; j += 4) {        
-        m[i]   = (unsigned char)(pk[j] & 0xFF);
-        m[i+1] = (unsigned char)((pk[j] >> 8) | ((pk[j+1] & 0x03) << 6));
-        m[i+2] = (unsigned char)((pk[j+1] >> 2) & 0xFF);
-        m[i+3] = (unsigned char)((pk[j+1] >> 10) | ((pk[j+2] & 0x0F) << 4));
-        m[i+4] = (unsigned char)((pk[j+2] >> 4) & 0xFF);
-        m[i+5] = (unsigned char)((pk[j+2] >> 12) | ((pk[j+3] & 0x3F) << 2));
-        m[i+6] = (unsigned char)(pk[j+3] >> 6);
-        i += 7;
-    }
-    
-#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
-    encode_asm(pk, m);
-    i = 1792;
-#endif
-
-    for (j = 0; j < 32; j++) {
-        m[i+j] = seed[j];
-    }
-}
-
-
-void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed)
-{ // Alice's message decoding 
-    unsigned int i = 0, j;
-    
-#if defined(GENERIC_IMPLEMENTATION)
-    for (j = 0; j < 1024; j += 4) {        
-        pk[j]   = ((uint32_t)m[i] | (((uint32_t)m[i+1] & 0x3F) << 8));
-        pk[j+1] = (((uint32_t)m[i+1] >> 6) | ((uint32_t)m[i+2] << 2) | (((uint32_t)m[i+3] & 0x0F) << 10));
-        pk[j+2] = (((uint32_t)m[i+3] >> 4) | ((uint32_t)m[i+4] << 4) | (((uint32_t)m[i+5] & 0x03) << 12));
-        pk[j+3] = (((uint32_t)m[i+5] >> 2) | ((uint32_t)m[i+6] << 6));
-        i += 7;
-    }
-    
-#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
-    decode_asm(m, pk);
-    i = 1792;
-#endif
-
-    for (j = 0; j < 32; j++) {
-        seed[j] = m[i+j];
-    }
-}
-
-
-void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m)
-{ // Bob's message encoding
-    unsigned int i = 0, j;
-    
-#if defined(GENERIC_IMPLEMENTATION) 
-    for (j = 0; j < 1024; j += 4) {        
-        m[i]   = (unsigned char)(pk[j] & 0xFF);
-        m[i+1] = (unsigned char)((pk[j] >> 8) | ((pk[j+1] & 0x03) << 6));
-        m[i+2] = (unsigned char)((pk[j+1] >> 2) & 0xFF);
-        m[i+3] = (unsigned char)((pk[j+1] >> 10) | ((pk[j+2] & 0x0F) << 4));
-        m[i+4] = (unsigned char)((pk[j+2] >> 4) & 0xFF);
-        m[i+5] = (unsigned char)((pk[j+2] >> 12) | ((pk[j+3] & 0x3F) << 2));
-        m[i+6] = (unsigned char)(pk[j+3] >> 6);
-        i += 7;
-    }
-    
-#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
-    encode_asm(pk, m);
-#endif
-
-    i = 0;
-    for (j = 0; j < 1024/4; j++) {
-        m[1792+j] = (unsigned char)(rvec[i] | (rvec[i+1] << 2) | (rvec[i+2] << 4) | (rvec[i+3] << 6));
-        i += 4;
-    }
-}
-
-
-void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec)
-{ // Bob's message decoding
-    unsigned int i = 0, j;
-    
-#if defined(GENERIC_IMPLEMENTATION) 
-    for (j = 0; j < 1024; j += 4) {        
-        pk[j]   = ((uint32_t)m[i] | (((uint32_t)m[i+1] & 0x3F) << 8));
-        pk[j+1] = (((uint32_t)m[i+1] >> 6) | ((uint32_t)m[i+2] << 2) | (((uint32_t)m[i+3] & 0x0F) << 10));
-        pk[j+2] = (((uint32_t)m[i+3] >> 4) | ((uint32_t)m[i+4] << 4) | (((uint32_t)m[i+5] & 0x03) << 12));
-        pk[j+3] = (((uint32_t)m[i+5] >> 2) | ((uint32_t)m[i+6] << 6));
-        i += 7;
-    }
-    
-#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
-    decode_asm(m, pk);
-    i = 1792;
-#endif
-    
-    i = 0;
-    for (j = 0; j < 1024/4; j++) {
-        rvec[i]   = (uint32_t)(m[1792+j] & 0x03);
-        rvec[i+1] = (uint32_t)((m[1792+j] >> 2) & 0x03);
-        rvec[i+2] = (uint32_t)((m[1792+j] >> 4) & 0x03);
-        rvec[i+3] = (uint32_t)(m[1792+j] >> 6);
-        i += 4;
-    }
-}
-
-
-static __inline uint32_t Abs(int32_t value)
-{ // Compute absolute value
-    uint32_t mask;
-
-    mask = (uint32_t)(value >> 31);
-    return ((mask ^ value) - mask);
-}
-
-
-CRYPTO_MSRLN_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction)
-{ // Reconciliation helper
-    (void)seed; (void)nonce; (void)StreamOutputFunction;
-    unsigned int i, j, norm;
-    unsigned char bit, random_bits[32];
-    uint32_t v0[4], v1[4];
-
-    randombytes( random_bits, 32);
-    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_SUCCESS;
-
-#if defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT)         
-    helprec_asm(x, rvec, random_bits);
-#else   
-
-    for (i = 0; i < 256; i++) {
-        bit = 1 & (random_bits[i >> 3] >> (i & 0x07));
-        rvec[i]     = (x[i]     << 1) - bit;  
-        rvec[i+256] = (x[i+256] << 1) - bit;
-        rvec[i+512] = (x[i+512] << 1) - bit;
-        rvec[i+768] = (x[i+768] << 1) - bit; 
-
-        norm = 0;
-        v0[0] = 4; v0[1] = 4; v0[2] = 4; v0[3] = 4;
-        v1[0] = 3; v1[1] = 3; v1[2] = 3; v1[3] = 3; 
-        for (j = 0; j < 4; j++) {
-            v0[j] -= (rvec[i+256*j] - PARAMETER_Q4 ) >> 31;
-            v0[j] -= (rvec[i+256*j] - PARAMETER_3Q4) >> 31;
-            v0[j] -= (rvec[i+256*j] - PARAMETER_5Q4) >> 31;
-            v0[j] -= (rvec[i+256*j] - PARAMETER_7Q4) >> 31;
-            v1[j] -= (rvec[i+256*j] - PARAMETER_Q2 ) >> 31;
-            v1[j] -= (rvec[i+256*j] - PARAMETER_Q  ) >> 31;
-            v1[j] -= (rvec[i+256*j] - PARAMETER_3Q2) >> 31;
-            norm += Abs(2*rvec[i+256*j] - PARAMETER_Q*v0[j]);
-        }
-
-        norm = (uint32_t)((int32_t)(norm - PARAMETER_Q) >> 31);    // If norm < q then norm = 0xff...ff, else norm = 0
-        v0[0] = (norm & (v0[0] ^ v1[0])) ^ v1[0];
-        v0[1] = (norm & (v0[1] ^ v1[1])) ^ v1[1];
-        v0[2] = (norm & (v0[2] ^ v1[2])) ^ v1[2];
-        v0[3] = (norm & (v0[3] ^ v1[3])) ^ v1[3];
-        rvec[i]     = (v0[0] - v0[3]) & 0x03;
-        rvec[i+256] = (v0[1] - v0[3]) & 0x03;
-        rvec[i+512] = (v0[2] - v0[3]) & 0x03;
-        rvec[i+768] = ((v0[3] << 1) + (1 & ~norm)) & 0x03;
-    }
-#endif
-
-    return Status;
-}
-
-
-static __inline uint32_t LDDecode(int32_t* t)
-{ // Low-density decoding
-    unsigned int i, norm = 0;
-    uint32_t mask1, mask2, value;
-    int32_t cneg = -8*PARAMETER_Q;
-    
-    for (i = 0; i < 4; i++) { 
-        mask1 = t[i] >> 31;                                    // If t[i] < 0 then mask2 = 0xff...ff, else mask2 = 0
-        mask2 = (4*PARAMETER_Q - (int32_t)Abs(t[i])) >> 31;    // If 4*PARAMETER_Q > Abs(t[i]) then mask2 = 0, else mask2 = 0xff...ff
-
-        value = ((mask1 & (8*PARAMETER_Q ^ cneg)) ^ cneg);
-        norm += Abs(t[i] + (mask2 & value));
-    }
-
-    return ((8*PARAMETER_Q - norm) >> 31) ^ 1;                 // If norm < PARAMETER_Q then return 1, else return 0
-}
-
-
-void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key)               
-{ // Reconciliation
-
-#if defined(GENERIC_IMPLEMENTATION)
-    unsigned int i;
-    uint32_t t[4];
-
-    for (i = 0; i < 32; i++) {
-        key[i] = 0;
-    }
-    for (i = 0; i < 256; i++) {        
-        t[0] = 8*x[i]     - (2*rvec[i] + rvec[i+768]) * PARAMETER_Q;
-        t[1] = 8*x[i+256] - (2*rvec[i+256] + rvec[i+768]) * PARAMETER_Q;
-        t[2] = 8*x[i+512] - (2*rvec[i+512] + rvec[i+768]) * PARAMETER_Q;
-        t[3] = 8*x[i+768] - (rvec[i+768]) * PARAMETER_Q;
-      
-        key[i >> 3] |= (unsigned char)LDDecode((int32_t*)t) << (i & 0x07);
-    }
-    
-#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
-    rec_asm(x, rvec, key);
-#endif
-}
-
-
-CRYPTO_MSRLN_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction)
-{ // Error sampling
-    (void) seed; (void) nonce; (void) StreamOutputFunction;
-    unsigned char stream[3 * PARAMETER_N];
-    uint32_t *pstream = (uint32_t *) &stream;
-    uint32_t acc1, acc2, temp;
-    uint8_t *pacc1 = (uint8_t *) &acc1, *pacc2 = (uint8_t *) &acc2;
-    unsigned int i, j;
-
-    randombytes( stream, 3 * PARAMETER_N);
-
-#if defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT)
-    error_sampling_asm(stream, e);
-#else
-    for (i = 0; i < PARAMETER_N / 4; i++) {
-        acc1 = 0;
-        acc2 = 0;
-        for (j = 0; j < 8; j++) {
-            acc1 += (pstream[i] >> j) & 0x01010101;
-            acc2 += (pstream[i + PARAMETER_N / 4] >> j) & 0x01010101;
-        }
-        for (j = 0; j < 4; j++) {
-            temp = pstream[i + 2 * PARAMETER_N / 4] >> j;
-            acc1 += temp & 0x01010101;
-            acc2 += (temp >> 4) & 0x01010101;
-        }
-        e[2 * i] = pacc1[0] - pacc1[1];
-        e[2 * i + 1] = pacc1[2] - pacc1[3];
-        e[2 * i + PARAMETER_N / 2] = pacc2[0] - pacc2[1];
-        e[2 * i + PARAMETER_N / 2 + 1] = pacc2[2] - pacc2[3];
-    }
-#endif
-
-    return CRYPTO_MSRLN_SUCCESS;    
-}
-
-
-CRYPTO_MSRLN_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction)
-{ // Generation of parameter a
-    (void)ExtendableOutputFunction;
-    unsigned int pos = 0, ctr = 0;
-    uint16_t val;
-    unsigned int nblocks = 16;
-    uint8_t buf[SHAKE128_RATE * 16]; // was * nblocks, but VS doesn't like this buf init
-    //Keccak_HashInstance ks;
-
-    uint64_t state[SHA3_STATESIZE] = {0};
-    shake128_absorb(state, seed, SEED_BYTES);
-    shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
-
-    /*#ifdef _WIN32
-        SHAKE128_InitAbsorb( &ks, seed, SEED_BYTES );
-        KECCAK_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 );
-    #else 
-        Keccak_HashInitialize_SHAKE128(&ks);
-        Keccak_HashUpdate( &ks, seed, SEED_BYTES * 8 );
-        Keccak_HashFinal( &ks, seed );
-        Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );
-    //#endif
-    */
-    while (ctr < PARAMETER_N) {
-        val = (buf[pos] | ((uint16_t) buf[pos + 1] << 8)) & 0x3fff;
-        if (val < PARAMETER_Q) {
-            a[ctr++] = val;
-        }
-        pos += 2;
-        if (pos > SHAKE128_RATE * nblocks - 2) {
-            nblocks = 1;
-          shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
-//            Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );
-            pos = 0;
-        }
-    }
-
-    return CRYPTO_MSRLN_SUCCESS;    
-}
-
-
-CRYPTO_MSRLN_STATUS MSRLN_KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto)
-{ // Alice's key generation  
-  // It produces a private key SecretKeyA and computes the public key PublicKeyA.
-  // Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
-  //          the public key PublicKeyA that occupies 1824 bytes
-  // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
-    uint32_t a[PARAMETER_N];
-    int32_t e[PARAMETER_N];
-    unsigned char seed[SEED_BYTES];
-    unsigned char error_seed[ERROR_SEED_BYTES];
-    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_ERROR_UNKNOWN;
-
-    Status = randombytes( seed, SEED_BYTES);
-
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        return Status;
-    }   
-
-    Status = generate_a(a, seed, pLatticeCrypto->ExtendableOutputFunction);
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }
-
-    Status = get_error(SecretKeyA, error_seed, 0, pLatticeCrypto->StreamOutputFunction);  
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }
-    Status = get_error(e, error_seed, 1, pLatticeCrypto->StreamOutputFunction);   
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }
-    NTT_CT_std2rev_12289(SecretKeyA, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
-    NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
-    smul(e, 3, PARAMETER_N);
-
-    pmuladd((int32_t*)a, SecretKeyA, e, (int32_t*)a, PARAMETER_N); 
-    correction((int32_t*)a, PARAMETER_Q, PARAMETER_N);
-    encode_A(a, seed, PublicKeyA);
-    
-cleanup:
-    clear_words((void*)e, NBYTES_TO_NWORDS(4*PARAMETER_N));
-    clear_words((void*)error_seed, NBYTES_TO_NWORDS(ERROR_SEED_BYTES));
-
-    return Status;
-}
-
-
-CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto)
-{ // Bob's key generation and shared secret computation  
-  // It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes 
-  // the shared secret SharedSecretB.
-  // Input:   Alice's public key PublicKeyA that consists of 1824 bytes
-  // Outputs: the public key PublicKeyB that occupies 2048 bytes.
-  //          the 256-bit shared secret SharedSecretB.
-  // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
-    uint32_t pk_A[PARAMETER_N], a[PARAMETER_N], v[PARAMETER_N], r[PARAMETER_N];
-    int32_t sk_B[PARAMETER_N], e[PARAMETER_N];
-    unsigned char seed[SEED_BYTES], error_seed[ERROR_SEED_BYTES];
-    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_ERROR_UNKNOWN;
-
-    decode_A(PublicKeyA, pk_A, seed);
-
-    Status = generate_a(a, seed, pLatticeCrypto->ExtendableOutputFunction);
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }
-
-    Status = get_error(sk_B, error_seed, 0, pLatticeCrypto->StreamOutputFunction);  
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }
-    Status = get_error(e, error_seed, 1, pLatticeCrypto->StreamOutputFunction);
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }   
-    NTT_CT_std2rev_12289(sk_B, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
-    NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
-    smul(e, 3, PARAMETER_N);
-
-    pmuladd((int32_t*)a, sk_B, e, (int32_t*)a, PARAMETER_N); 
-    correction((int32_t*)a, PARAMETER_Q, PARAMETER_N);
-     
-    Status = get_error(e, error_seed, 2, pLatticeCrypto->StreamOutputFunction);  
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }   
-    NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
-    smul(e, 81, PARAMETER_N);
-    
-    pmuladd((int32_t*)pk_A, sk_B, e, (int32_t*)v, PARAMETER_N);    
-    INTT_GS_rev2std_12289((int32_t*)v, MSRLN_omegainv_rev_ntt1024_12289, MSRLN_omegainv10N_rev_ntt1024_12289, MSRLN_Ninv11_ntt1024_12289, PARAMETER_N);
-    two_reduce12289((int32_t*)v, PARAMETER_N);
-#if defined(GENERIC_IMPLEMENTATION)
-    correction((int32_t*)v, PARAMETER_Q, PARAMETER_N); 
-#endif
-
-    Status = HelpRec(v, r, error_seed, 3, pLatticeCrypto->StreamOutputFunction); 
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }   
-    Rec(v, r, SharedSecretB);
-    encode_B(a, r, PublicKeyB);
-    
-cleanup:
-    clear_words((void*)sk_B, NBYTES_TO_NWORDS(4*PARAMETER_N));
-    clear_words((void*)e, NBYTES_TO_NWORDS(4*PARAMETER_N));
-    clear_words((void*)error_seed, NBYTES_TO_NWORDS(ERROR_SEED_BYTES));
-    clear_words((void*)a, NBYTES_TO_NWORDS(4*PARAMETER_N));
-    clear_words((void*)v, NBYTES_TO_NWORDS(4*PARAMETER_N));
-    clear_words((void*)r, NBYTES_TO_NWORDS(4*PARAMETER_N));
-
-    return Status;
-}
-
-
-CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA)
-{ // Alice's shared secret computation  
-  // It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA.
-  // Inputs: Bob's public key PublicKeyB that consists of 2048 bytes
-  //         the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
-  // Output: the 256-bit shared secret SharedSecretA.
-  // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
-    uint32_t u[PARAMETER_N], r[PARAMETER_N];
-    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_SUCCESS;
-
-    decode_B(PublicKeyB, u, r);
-    
-    pmul(SecretKeyA, (int32_t*)u, (int32_t*)u, PARAMETER_N);       
-    INTT_GS_rev2std_12289((int32_t*)u, MSRLN_omegainv_rev_ntt1024_12289, MSRLN_omegainv10N_rev_ntt1024_12289, MSRLN_Ninv11_ntt1024_12289, PARAMETER_N);
-    two_reduce12289((int32_t*)u, PARAMETER_N);
-#if defined(GENERIC_IMPLEMENTATION)
-    correction((int32_t*)u, PARAMETER_Q, PARAMETER_N); 
-#endif
-
-    Rec(u, r, SharedSecretA);
-    
-// Cleanup
-    clear_words((void*)u, NBYTES_TO_NWORDS(4*PARAMETER_N));
-    clear_words((void*)r, NBYTES_TO_NWORDS(4*PARAMETER_N));
-
-    return Status;
-}
+#include <stdio.h>
+#include <stdlib.h>
+#include "msrln_priv.h"
+
+#include "KeccakHash.h"
+#include "SimpleFIPS202.h"
+
+
+// N^-1 * prime_scale^-8
+const int32_t MSRLN_Ninv8_ntt1024_12289 = 8350;
+// N^-1 * prime_scale^-7 * omegainv_rev_ntt1024_12289[1]
+const int32_t MSRLN_omegainv7N_rev_ntt1024_12289 = 795;
+// N^-1 * prime_scale^-11
+const int32_t MSRLN_Ninv11_ntt1024_12289 = 2585;
+// N^-1 * prime_scale^-10 * omegainv_rev_ntt1024_12289[1]
+const int32_t MSRLN_omegainv10N_rev_ntt1024_12289 = 10953;
+
+
+// Index-reversed lookup tables (flat arrays) holding powers of psi (psi_rev_nttxxx_yyy) and inverse powers of omega (omegainv_rev_nttxxx_yyy),
+// where xxx is the parameter N and yyy is the prime q.
+
+const int32_t MSRLN_psi_rev_ntt1024_12289[1024] = {
+8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201,
+875, 3780, 1607, 4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859,
+7188, 1067, 2401, 11847, 390, 11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626,
+3636, 7351, 9585, 6998, 160, 3149, 4437, 12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042,
+3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178, 1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563,
+7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790, 2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266,
+5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810, 1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934,
+8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863, 10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842,
+11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893, 7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541,
+11336, 3434, 3529, 2908, 12071, 2361, 1843, 3030, 8174, 6147, 9842, 8326, 576, 10335, 10238, 10484, 9407, 11836, 5908, 418, 3772, 7515, 5429, 7552, 10996, 12133, 2767, 3969,
+8298, 6413, 10008, 2031, 5333, 10800, 9789, 10706, 5942, 1263, 49, 5915, 10806, 11939, 10777, 1815, 5383, 3202, 4493, 6920, 10232, 1975, 8532, 2925, 347, 4754, 1858, 11863,
+8974, 9551, 5868, 9634, 5735,11566, 12115, 10596, 3009, 6190, 11994, 6523, 652, 3762, 9370, 4016, 4077, 8561, 4049, 5990, 11130, 11143, 948, 325, 1404, 6992, 6119, 8333,
+10929, 1200, 5184, 2555, 6122, 1594, 10327, 7183, 5961, 2692, 12121, 4298, 3329, 5919, 4433, 8455,7032, 1747, 3123, 3054, 6803, 5782, 10723, 9341, 2503, 683, 2459, 3656,
+64, 4240, 3570, 835, 6065, 4046, 11580, 10970, 3150, 10331, 4322, 2078, 1112, 4079, 11231, 441, 922, 1050, 4536, 6844, 8429, 2683, 11099, 3818, 6171, 8500, 12142, 6833, 4449,
+4749, 6752, 7500, 7822, 8214, 6974, 7965, 7373, 2169, 522, 5079, 3262, 10316, 6715, 1278, 9945, 3514, 11248, 11271, 5925, 468, 3988, 382, 11973, 5339, 6843, 6196, 8579, 2033,
+8291, 1922, 3879, 11035, 973, 6854, 10930, 5206, 6760, 3199, 56, 3565, 654, 1702, 10302, 5862, 6153, 5415, 8646, 11889, 10561, 7341, 6152, 7232, 4698, 8844, 4780, 10240, 4912,
+1321, 12097, 7048, 2920, 3127, 4169, 11502, 3482, 11279, 5468, 5874, 11612, 6055, 8953, 52, 3174, 10966, 9523, 151, 2127, 3957, 2839, 9784, 6383, 1579, 431, 7507, 5886, 3029,
+6695, 4213, 504, 11684, 2302, 8689, 9026, 4624, 6212, 11868, 4080, 6221, 8687, 1003, 8757, 241, 58, 5009, 10333, 885, 6281, 3438, 9445, 11314, 8077, 6608, 3477, 142, 1105,
+8841, 343, 4538, 1908, 1208, 4727, 7078, 10423, 10125, 6873, 11573, 10179, 416, 814, 1705, 2450, 8700, 717, 9307, 1373, 8186, 2429, 10568, 10753, 7228, 11071, 438, 8774, 5993,
+3278, 4209, 6877, 3449, 1136, 3708, 3238, 2926, 1826, 4489, 3171, 8024, 8611, 1928, 464, 3205, 8930, 7080, 1092, 10900, 10221, 11943, 4404, 9126, 4032, 7449, 6127, 8067, 10763,
+125, 540, 8921, 8062, 612, 8051, 12229, 9572, 9089, 10754, 10029, 68, 6453, 7723, 4781, 4924, 1014, 448, 3942, 5232, 1327, 8682, 3744, 7326, 3056, 9761, 5845, 5588, 412, 7187,
+3975, 4883, 3087, 6454, 2257, 7784, 5676, 1417, 8400, 11710, 5596, 5987, 9175, 2769, 5966, 212, 6555, 11113, 5508, 11014, 1125, 4860, 10844, 1131, 4267, 6636, 2275, 9828, 5063,
+4176, 3765, 1518, 8794, 4564, 10224, 5826, 3534, 3961, 4145, 10533, 506, 11034, 6505, 10897, 2674, 10077, 3338, 9013, 3511, 6811, 11111, 2776, 1165, 2575, 8881, 10347, 377,
+4578, 11914, 10669, 10104, 392, 10453, 425, 9489, 193, 2231, 6197, 1038, 11366, 6204, 8122, 2894, 3654, 10975, 10545, 6599, 2455, 11951, 3947, 20, 5002, 5163, 4608, 8946, 8170,
+10138, 1522, 8665, 10397, 3344, 5598, 10964, 6565, 11260, 1945, 11041, 9847, 7174, 4939, 2148, 6330, 3959, 5797, 4913, 3528, 8054, 3825, 8914, 9998, 4335, 8896, 9342, 3982,
+6680, 11653, 7790, 6617, 1737, 622, 10485, 10886, 6195, 7100, 1687, 406, 12143, 5268, 9389, 12050, 994, 7735, 5464, 7383, 4670, 512, 364, 9929, 3028, 5216, 5518, 1226, 7550,
+8038, 7043, 7814, 11053, 3017, 3121, 7584, 2600, 11232, 6780, 12085, 5219, 1409, 9600, 4605, 8151, 12109, 463, 8882, 8308, 10821, 9247, 10945, 9806, 2054, 6203, 6643, 3120,
+6105, 8348, 8536, 6919, 8753, 11007, 8717, 9457, 2021, 9060, 4730, 3929, 10583, 3723, 845, 1936, 7, 5054, 3154, 3285, 4360, 3805, 11522, 2213, 4153, 12239, 12073, 5526, 769,
+4099, 3944, 5604, 5530, 11024, 9282, 2171, 3480, 7434, 8520, 3232, 11996, 9656, 1406, 2945, 5349, 7207, 4590, 11607, 11309, 5202, 844, 7082, 4050, 8016, 9068, 9694, 8452, 7000,
+5662, 567, 2941, 8619, 3808, 4987, 2373, 5135, 63, 7605, 3360, 11839, 10345, 578, 6921, 7628, 510, 5386, 2622, 7806, 5703, 10783, 9224, 11379, 5900, 4719, 11538, 3502, 5789,
+10631, 5618, 826, 5043, 3090, 10891, 9951, 7596, 2293, 11872, 6151, 3469, 4443, 8871, 1555, 1802, 5103, 1891, 1223, 2334, 7878, 1590, 881, 365, 1927, 11274, 4510, 9652, 2946,
+6828, 1280, 614, 10918, 12265, 7250, 6742, 9804, 11385, 2276, 11307, 2593, 879, 7899, 8071, 3454, 8531, 3795, 9021, 5776, 1849, 7766, 7988, 457, 8, 530, 9663, 7785, 11511, 3578,
+7592, 10588, 3466, 8972, 9757, 3332, 139, 2046, 2940, 10808, 9332, 874, 2301, 5650, 12119, 150, 648, 8000, 9982, 9416, 2827, 2434, 11498, 6481, 12268, 9754, 11169, 11823, 11259,
+3821, 10608, 2929, 6263, 4649, 6320, 9687, 10388, 502, 5118, 8496, 6226, 10716, 8443, 7624, 6883, 9269, 6616, 8620, 5287, 944, 7519, 6125, 1882, 11249, 10254, 5410, 1251, 1790,
+5275, 8449, 10447, 4113, 72, 2828, 4352, 7455, 2712, 11048, 7911, 3451, 4094, 6508, 3045, 11194, 2643, 1783, 7211, 4974, 7724, 9811, 9449, 3019, 4194, 2730, 6878, 10421, 2253,
+4518, 9195, 7469, 11129, 9173, 12100, 1763, 2209, 9617, 5170, 865, 1279, 1694, 10759, 8420, 4423, 10555, 3815, 5832, 10939
+};
+
+
+const int32_t MSRLN_omegainv_rev_ntt1024_12289[1024] = {
+8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422,
+6267, 9302, 8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270,
+2678, 8585, 10752, 12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957,
+8779, 1630, 10163, 5407, 3186, 11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372,
+10115, 2847, 4414, 9644, 4053, 7247, 9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300,
+5331, 8705, 4177, 9764, 10908, 11950, 9821, 11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534,
+145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548, 4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567,
+6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023, 2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170,
+10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255, 11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 11184, 12147, 8812, 5681, 4212, 975, 2844, 8851, 6008, 11404,
+1956, 7280, 12231, 12048, 3532, 11286, 3602, 6068, 8209, 421, 6077, 7665, 3263, 3600, 9987, 605, 11785, 8076, 5594, 9260, 6403, 4782, 11858, 10710, 5906, 2505, 9450, 8332, 10162,
+12138, 2766, 1323, 9115, 12237, 3336, 6234, 677, 6415, 6821, 1010, 8807, 787, 8120, 9162, 9369, 5241, 192, 10968, 7377, 2049, 7509, 3445, 7591, 5057, 6137, 4948, 1728, 400, 3643,
+6874, 6136, 6427, 1987, 10587, 11635, 8724, 12233, 9090, 5529, 7083, 1359, 5435, 11316, 1254, 8410, 10367, 3998, 10256, 3710, 6093, 5446, 6950, 316, 11907, 8301, 11821, 6364, 1018,
+1041, 8775, 2344, 11011, 5574, 1973, 9027, 7210, 11767, 10120, 4916, 4324, 5315, 4075, 4467, 4789, 5537, 7540, 7840, 5456, 147, 3789, 6118, 8471, 1190, 9606, 3860, 5445, 7753, 11239,
+11367, 11848, 1058, 8210, 11177, 10211, 7967, 1958, 9139, 1319, 709, 8243, 6224, 11454, 8719, 8049, 12225, 8633, 9830, 11606, 9786, 2948, 1566, 6507, 5486, 9235, 9166, 10542, 5257,
+3834, 7856, 6370, 8960, 7991, 168, 9597, 6328, 5106, 1962, 10695, 6167, 9734, 7105, 11089, 1360, 3956, 6170, 5297, 10885, 11964, 11341, 1146, 1159, 6299, 8240, 3728, 8212, 8273, 2919,
+8527, 11637, 5766, 295, 6099, 9280, 1693, 174, 723, 6554, 2655, 6421, 2738, 3315, 426, 10431, 7535, 11942, 9364, 3757, 10314, 2057, 5369, 7796, 9087, 6906, 10474, 1512, 350, 1483,
+6374, 12240, 11026, 6347, 1583, 2500, 1489, 6956, 10258, 2281, 5876, 3991, 8320, 9522, 156, 1293, 4737, 6860, 4774, 8517, 11871, 6381, 453, 2882, 1805, 2051, 1954, 11713, 3963, 2447,
+6142, 4115, 9259, 10446, 9928, 218, 9381, 8760, 8855, 1350, 6457, 8474, 1734, 7866, 3869, 1530, 10595, 11010, 11424, 7119, 2672, 10080, 10526, 189, 3116, 1160, 4820, 3094, 7771, 10036,
+1868, 5411, 9559, 8095, 9270, 2840, 2478, 4565, 7315, 5078, 10506, 9646, 1095, 9244, 5781, 8195, 8838, 4378, 1241, 9577, 4834, 7937, 9461, 12217, 8176, 1842, 3840, 7014, 10499, 11038,
+6879, 2035, 1040, 10407, 6164, 4770, 11345, 7002, 3669, 5673, 3020, 5406, 4665, 3846, 1573, 6063, 3793, 7171, 11787, 1901, 2602, 5969, 7640, 6026, 9360, 1681, 8468, 1030, 466, 1120,
+2535, 21, 5808, 791, 9855, 9462, 2873, 2307, 4289, 11641, 12139, 170, 6639, 9988, 11415, 2957, 1481, 9349, 10243, 12150, 8957, 2532, 3317, 8823, 1701, 4697, 8711, 778, 4504, 2626,
+11759, 12281, 11832, 4301, 4523, 10440, 6513, 3268, 8494, 3758, 8835, 4218, 4390, 11410, 9696, 982, 10013, 904, 2485, 5547, 5039, 24, 1371, 11675, 11009, 5461, 9343, 2637, 7779, 1015,
+10362, 11924, 11408, 10699, 4411, 9955, 11066, 10398, 7186, 10487, 10734, 3418, 7846, 8820, 6138, 417, 9996, 4693, 2338, 1398, 9199, 7246, 11463, 6671, 1658, 6500, 8787, 751, 7570,
+6389, 910, 3065, 1506, 6586, 4483, 9667, 6903, 11779, 4661, 5368, 11711, 1944, 450, 8929, 4684, 12226, 7154, 9916, 7302, 8481, 3670, 9348, 11722, 6627, 5289, 3837, 2595, 3221, 4273,
+8239, 5207, 11445, 7087, 980, 682, 7699, 5082, 6940, 9344, 10883, 2633, 293, 9057, 3769, 4855, 8809, 10118, 3007, 1265, 6759, 6685, 8345, 8190, 11520, 6763, 216, 50, 8136, 10076, 767,
+8484, 7929, 9004, 9135, 7235, 12282, 10353, 11444, 8566, 1706, 8360, 7559, 3229, 10268, 2832, 3572, 1282, 3536, 5370, 3753, 3941, 6184, 9169, 5646, 6086, 10235, 2483, 1344, 3042, 1468,
+3981, 3407, 11826, 180, 4138, 7684, 2689, 10880, 7070, 204, 5509, 1057, 9689, 4705, 9168, 9272, 1236, 4475, 5246, 4251, 4739, 11063, 6771, 7073, 9261, 2360, 11925, 11777, 7619, 4906,
+6825, 4554, 11295, 239, 2900, 7021, 146, 11883, 10602, 5189, 6094, 1403, 1804, 11667, 10552, 5672, 4499, 636, 5609, 8307, 2947, 3393, 7954, 2291, 3375, 8464, 4235, 8761, 7376, 6492,
+8330, 5959, 10141, 7350, 5115, 2442, 1248, 10344, 1029, 5724, 1325, 6691, 8945, 1892, 3624, 10767, 2151, 4119, 3343, 7681, 7126, 7287, 12269, 8342, 338, 9834, 5690, 1744, 1314, 8635,
+9395, 4167, 6085, 923, 11251, 6092, 10058, 12096, 2800, 11864, 1836, 11897, 2185, 1620, 375, 7711, 11912, 1942, 3408, 9714, 11124, 9513, 1178, 5478, 8778, 3276, 8951, 2212, 9615, 1392,
+5784, 1255, 11783, 1756, 8144, 8328, 8755, 6463, 2065, 7725, 3495, 10771, 8524, 8113, 7226, 2461, 10014, 5653, 8022, 11158, 1445, 7429, 11164, 1275, 6781, 1176, 5734, 12077, 6323, 9520,
+3114, 6302, 6693, 579, 3889, 10872, 6613, 4505, 10032, 5835, 9202, 7406, 8314, 5102, 11877, 6701, 6444, 2528, 9233, 4963, 8545, 3607, 10962, 7057, 8347, 11841, 11275, 7365, 7508, 4566,
+5836, 12221, 2260, 1535, 3200, 2717, 60, 4238, 11677, 4227, 3368, 11749, 12164, 1526, 4222, 6162, 4840, 8257, 3163, 7885, 346, 2068, 1389, 11197, 5209, 3359, 9084, 11825, 10361, 3678,
+4265, 9118, 7800, 10463, 9363, 9051, 8581, 11153, 8840, 5412, 8080, 9011, 6296, 3515, 11851, 1218, 5061, 1536, 1721, 9860, 4103, 10916, 2982, 11572, 3589, 9839, 10584, 11475, 11873,
+2110, 716, 5416, 2164, 1866, 5211, 7562, 11081, 10381, 7751, 11946, 3448
+};
+
+
+const int32_t MSRLN_psi_rev_ntt512_12289[512] = {
+8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201, 875, 3780, 1607,
+4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859, 7188, 1067, 2401, 11847, 390,
+11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626, 3636, 7351, 9585, 6998, 160, 3149, 4437,
+12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042, 3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178,
+1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563, 7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790,
+2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266, 5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810,
+1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934, 8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863,
+10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842, 11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893,
+7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541, 11336, 8855, 8760, 9381, 218, 9928, 10446, 9259, 4115, 6142, 2447, 3963, 11713, 1954, 2051, 1805, 2882, 453, 6381, 11871, 8517,
+4774, 6860, 4737, 1293, 156, 9522, 8320, 3991, 5876, 2281, 10258, 6956, 1489, 2500, 1583, 6347, 11026, 12240, 6374, 1483, 350, 1512, 10474, 6906, 9087, 7796, 5369, 2057, 10314, 3757,
+9364, 11942, 7535, 10431, 426, 3315, 2738, 6421, 2655, 6554, 723, 174, 1693, 9280, 6099, 295, 5766, 11637, 8527, 2919, 8273, 8212, 3728, 8240, 6299, 1159, 1146, 11341, 11964, 10885, 5297,
+6170, 3956, 1360, 11089, 7105, 9734, 6167, 10695, 1962, 5106, 6328, 9597, 168, 7991, 8960, 6370, 7856, 3834, 5257, 10542, 9166, 9235, 5486, 6507, 1566, 2948, 9786, 11606, 9830, 8633,
+12225, 8049, 8719, 11454, 6224, 8243, 709, 1319, 9139, 1958, 7967, 10211, 11177, 8210, 1058, 11848, 11367, 11239, 7753, 5445, 3860, 9606, 1190, 8471, 6118, 3789, 147, 5456, 7840, 7540,
+5537, 4789, 4467, 4075, 5315, 4324, 4916, 10120, 11767, 7210, 9027, 1973, 5574, 11011, 2344, 8775, 1041, 1018, 6364, 11821, 8301, 11907, 316, 6950, 5446, 6093, 3710, 10256, 3998, 10367,
+8410, 1254, 11316, 5435, 1359, 7083, 5529, 9090, 12233, 8724, 11635, 10587, 1987, 6427, 6136, 6874, 3643, 400, 1728, 4948, 6137, 5057, 7591, 3445, 7509, 2049, 7377, 10968, 192, 5241, 9369,
+9162, 8120, 787, 8807, 1010, 6821, 6415, 677, 6234, 3336, 12237, 9115, 1323, 2766, 12138, 10162, 8332, 9450, 2505, 5906, 10710, 11858, 4782, 6403, 9260, 5594, 8076, 11785, 605, 9987, 3600,
+3263, 7665, 6077, 421, 8209, 6068, 3602, 11286, 3532, 12048, 12231, 7280, 1956, 11404, 6008, 8851, 2844, 975, 4212, 5681, 8812, 12147, 11184
+};
+
+
+const int32_t MSRLN_omegainv_rev_ntt512_12289[512] = {
+8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422, 6267, 9302,
+8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270, 2678, 8585, 10752,
+12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957, 8779, 1630, 10163, 5407, 3186,
+11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372, 10115, 2847, 4414, 9644, 4053, 7247,
+9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300, 5331, 8705, 4177, 9764, 10908, 11950, 9821,
+11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534, 145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548,
+4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567, 6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023,
+2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170, 10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255,
+11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 1105, 142, 3477, 6608, 8077, 11314, 9445, 3438, 6281, 885, 10333, 5009, 58, 241, 8757, 1003, 8687, 6221, 4080, 11868, 6212, 4624,
+9026, 8689, 2302, 11684, 504, 4213, 6695, 3029, 5886, 7507, 431, 1579, 6383, 9784, 2839, 3957, 2127, 151, 9523, 10966, 3174, 52, 8953, 6055, 11612, 5874, 5468, 11279, 3482, 11502, 4169,
+3127, 2920, 7048, 12097, 1321, 4912, 10240, 4780, 8844, 4698, 7232, 6152, 7341, 10561, 11889, 8646, 5415, 6153, 5862, 10302, 1702, 654, 3565, 56, 3199, 6760, 5206, 10930, 6854, 973, 11035,
+3879, 1922, 8291, 2033, 8579, 6196, 6843, 5339, 11973, 382, 3988, 468, 5925, 11271, 11248, 3514, 9945, 1278, 6715, 10316, 3262, 5079, 522, 2169, 7373, 7965, 6974, 8214, 7822, 7500, 6752,
+4749, 4449, 6833, 12142, 8500, 6171, 3818, 11099, 2683, 8429, 6844, 4536, 1050, 922, 441, 11231, 4079, 1112, 2078, 4322, 10331, 3150, 10970, 11580, 4046, 6065, 835, 3570, 4240, 64, 3656,
+2459, 683, 2503, 9341, 10723, 5782, 6803, 3054, 3123, 1747, 7032, 8455, 4433, 5919, 3329, 4298, 12121, 2692, 5961, 7183, 10327, 1594, 6122, 2555, 5184, 1200, 10929, 8333, 6119, 6992, 1404,
+325, 948, 11143, 11130, 5990, 4049, 8561, 4077, 4016, 9370, 3762, 652, 6523, 11994, 6190, 3009, 10596, 12115, 11566, 5735, 9634, 5868, 9551, 8974, 11863, 1858, 4754, 347, 2925, 8532, 1975,
+10232, 6920, 4493, 3202, 5383, 1815, 10777, 11939, 10806, 5915, 49, 1263, 5942, 10706, 9789, 10800, 5333, 2031, 10008, 6413, 8298, 3969, 2767, 12133, 10996, 7552, 5429, 7515, 3772, 418, 5908,
+11836, 9407, 10484, 10238, 10335, 576, 8326, 9842, 6147, 8174, 3030, 1843, 2361, 12071, 2908, 3529, 3434
+};
+
+// import external code
+#ifdef RLWE_ASM_AVX2
+    #include "AMD64/consts.c"
+    #include "AMD64/ntt_x64.c"
+#else
+    #include "generic/ntt.c"
+#endif
+
+__inline void clear_words(void* mem, digit_t nwords)
+{ // Zero "nwords" digits (digit_t-sized words) starting at "mem".
+  // The stores go through a volatile-qualified pointer so the compiler does not optimize the memory clearing away.
+    digit_t i;   // same type as nwords, so the loop bound is never compared against a narrower counter that could wrap
+    volatile digit_t *v = mem; 
+
+    for (i = 0; i < nwords; i++) {
+        v[i] = 0;
+    }
+}
+
+
+CRYPTO_MSRLN_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction)
+{ // Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction.
+  // Returns CRYPTO_MSRLN_ERROR_INVALID_PARAMETER when pLatticeCrypto is NULL (e.g. the result of a failed LatticeCrypto_allocate()), CRYPTO_MSRLN_SUCCESS otherwise.
+    if (pLatticeCrypto == NULL) { return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER; }   // reject instead of dereferencing NULL below
+    pLatticeCrypto->RandomBytesFunction = RandomBytesFunction;
+    pLatticeCrypto->ExtendableOutputFunction = ExtendableOutputFunction;
+    pLatticeCrypto->StreamOutputFunction = StreamOutputFunction;
+    return CRYPTO_MSRLN_SUCCESS;
+}
+
+
+PLatticeCryptoStruct LatticeCrypto_allocate()
+{ // Heap-allocate one zero-filled LatticeCryptoStruct.
+  // Call this before initializing the structure with LatticeCrypto_initialize().
+  // Returns the new structure, or NULL when the allocation fails.
+    PLatticeCryptoStruct fresh;
+
+    // calloc() already reports failure as NULL, which is also this function's
+    // error value, so its result is handed back to the caller unchanged.
+    fresh = (PLatticeCryptoStruct)calloc(1, sizeof(LatticeCryptoStruct));
+
+    return fresh;
+}
+
+
+const char* LatticeCrypto_get_error_message(CRYPTO_MSRLN_STATUS Status)
+{ // Return the message string for Status, or "Unrecognized CRYPTO_STATUS" when Status is out of range or has no table entry.
+    struct error_mapping {
+        unsigned int index;     // status code of the entry; the lookup below uses the array position, not this field
+        const char*  string;    // message text; const because it is only ever read (entries left unset stay NULL)
+    } mapping[CRYPTO_STATUS_TYPE_SIZE] = {
+        {CRYPTO_MSRLN_SUCCESS, CRYPTO_MSG_SUCCESS},
+        {CRYPTO_MSRLN_ERROR, CRYPTO_MSG_ERROR},
+        {CRYPTO_MSRLN_ERROR_DURING_TEST, CRYPTO_MSG_ERROR_DURING_TEST},
+        {CRYPTO_MSRLN_ERROR_UNKNOWN, CRYPTO_MSG_ERROR_UNKNOWN},
+        {CRYPTO_MSRLN_ERROR_NOT_IMPLEMENTED, CRYPTO_MSG_ERROR_NOT_IMPLEMENTED},
+        {CRYPTO_MSRLN_ERROR_NO_MEMORY, CRYPTO_MSG_ERROR_NO_MEMORY},
+        {CRYPTO_MSRLN_ERROR_INVALID_PARAMETER, CRYPTO_MSG_ERROR_INVALID_PARAMETER},
+        {CRYPTO_MSRLN_ERROR_SHARED_KEY, CRYPTO_MSG_ERROR_SHARED_KEY},
+        {CRYPTO_MSRLN_ERROR_TOO_MANY_ITERATIONS, CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS}
+    };
+    const unsigned int code = (unsigned int)Status;   // a negative Status converts to a huge value and is rejected by the range check
+    if (code >= CRYPTO_STATUS_TYPE_SIZE || mapping[code].string == NULL) {
+        return "Unrecognized CRYPTO_STATUS";
+    } else {
+        return mapping[code].string;
+    }
+}
+
+
+void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m)
+{ // Alice's message encoding: packs the 1024 values of pk into m, then appends 32 bytes of seed right after them.
+    unsigned int i = 0, j;   // i: byte offset into m, j: loop counter
+        
+#if defined(GENERIC_IMPLEMENTATION)
+    for (j = 0; j < 1024; j += 4) {        // 4 values -> 7 bytes per pass (256 passes, 1792 bytes); assumes every pk value fits in 14 bits - TODO confirm
+        m[i]   = (unsigned char)(pk[j] & 0xFF);                                  // pk[j] bits 0-7
+        m[i+1] = (unsigned char)((pk[j] >> 8) | ((pk[j+1] & 0x03) << 6));        // pk[j] from bit 8 upward | low 2 bits of pk[j+1] at bits 6-7
+        m[i+2] = (unsigned char)((pk[j+1] >> 2) & 0xFF);                         // pk[j+1] bits 2-9
+        m[i+3] = (unsigned char)((pk[j+1] >> 10) | ((pk[j+2] & 0x0F) << 4));     // pk[j+1] from bit 10 upward | low 4 bits of pk[j+2] at bits 4-7
+        m[i+4] = (unsigned char)((pk[j+2] >> 4) & 0xFF);                         // pk[j+2] bits 4-11
+        m[i+5] = (unsigned char)((pk[j+2] >> 12) | ((pk[j+3] & 0x3F) << 2));     // pk[j+2] from bit 12 upward | low 6 bits of pk[j+3] at bits 2-7
+        m[i+6] = (unsigned char)(pk[j+3] >> 6);                                  // pk[j+3] bits 6-13 (the cast keeps 8 bits)
+        i += 7;   // i ends at 1792 after the last pass
+    }
+    
+#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
+    encode_asm(pk, m);   // presumably fills m[0..1791] like the generic loop - verify against the assembly
+    i = 1792;            // offset at which the seed is appended below
+#endif
+
+    for (j = 0; j < 32; j++) {   // NOTE(review): if neither branch above is compiled in, i is still 0 here and only the seed is written
+        m[i+j] = seed[j];
+    }
+}
+
+
+void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed)
+{ // Alice's message decoding: unpacks 1024 14-bit values from m into pk, then copies the 32 bytes that follow them into seed.
+    unsigned int i = 0, j;   // i: byte offset into m, j: loop counter
+    
+#if defined(GENERIC_IMPLEMENTATION)
+    for (j = 0; j < 1024; j += 4) {        // 7 bytes -> 4 values per pass (256 passes, 1792 bytes read)
+        pk[j]   = ((uint32_t)m[i] | (((uint32_t)m[i+1] & 0x3F) << 8));                                          // byte 0 | low 6 bits of byte 1
+        pk[j+1] = (((uint32_t)m[i+1] >> 6) | ((uint32_t)m[i+2] << 2) | (((uint32_t)m[i+3] & 0x0F) << 10));      // top 2 bits of byte 1 | byte 2 | low 4 bits of byte 3
+        pk[j+2] = (((uint32_t)m[i+3] >> 4) | ((uint32_t)m[i+4] << 4) | (((uint32_t)m[i+5] & 0x03) << 12));      // top 4 bits of byte 3 | byte 4 | low 2 bits of byte 5
+        pk[j+3] = (((uint32_t)m[i+5] >> 2) | ((uint32_t)m[i+6] << 6));                                          // top 6 bits of byte 5 | byte 6
+        i += 7;   // i ends at 1792 after the last pass
+    }
+    
+#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
+    decode_asm(m, pk);   // presumably consumes m[0..1791] like the generic loop - verify against the assembly
+    i = 1792;            // offset from which the seed is read below
+#endif
+
+    for (j = 0; j < 32; j++) {   // NOTE(review): if neither branch above is compiled in, i is still 0 here and pk is left unwritten
+        seed[j] = m[i+j];
+    }
+}
+
+
+void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m)
+{ // Bob's message encoding: packs the 1024 values of pk into m, then stores rvec in the 256 bytes starting at m[1792].
+    unsigned int i = 0, j;   // i: byte offset into m (first loop) / index into rvec (second loop), j: loop counter
+    
+#if defined(GENERIC_IMPLEMENTATION) 
+    for (j = 0; j < 1024; j += 4) {        // 4 values -> 7 bytes per pass (256 passes, 1792 bytes); assumes every pk value fits in 14 bits - TODO confirm
+        m[i]   = (unsigned char)(pk[j] & 0xFF);                                  // pk[j] bits 0-7
+        m[i+1] = (unsigned char)((pk[j] >> 8) | ((pk[j+1] & 0x03) << 6));        // pk[j] from bit 8 upward | low 2 bits of pk[j+1] at bits 6-7
+        m[i+2] = (unsigned char)((pk[j+1] >> 2) & 0xFF);                         // pk[j+1] bits 2-9
+        m[i+3] = (unsigned char)((pk[j+1] >> 10) | ((pk[j+2] & 0x0F) << 4));     // pk[j+1] from bit 10 upward | low 4 bits of pk[j+2] at bits 4-7
+        m[i+4] = (unsigned char)((pk[j+2] >> 4) & 0xFF);                         // pk[j+2] bits 4-11
+        m[i+5] = (unsigned char)((pk[j+2] >> 12) | ((pk[j+3] & 0x3F) << 2));     // pk[j+2] from bit 12 upward | low 6 bits of pk[j+3] at bits 2-7
+        m[i+6] = (unsigned char)(pk[j+3] >> 6);                                  // pk[j+3] bits 6-13 (the cast keeps 8 bits)
+        i += 7;
+    }
+    
+#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
+    encode_asm(pk, m);   // presumably fills m[0..1791] like the generic loop - verify against the assembly
+#endif
+
+    i = 0;   // from here on i indexes rvec; the output position is the fixed offset 1792 + j
+    for (j = 0; j < 1024/4; j++) {   // four rvec entries per output byte, 2 bits each, rvec[i] in the lowest bits
+        m[1792+j] = (unsigned char)(rvec[i] | (rvec[i+1] << 2) | (rvec[i+2] << 4) | (rvec[i+3] << 6));   // assumes every rvec entry is below 4 - TODO confirm
+        i += 4;
+    }
+}
+
+
+void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec)
+{ // Bob's message decoding: unpacks 1024 14-bit values from m into pk, then expands the 256 bytes starting at m[1792] into 1024 two-bit rvec entries.
+    unsigned int i = 0, j;   // i: byte offset into m (first loop) / index into rvec (second loop), j: loop counter
+    
+#if defined(GENERIC_IMPLEMENTATION) 
+    for (j = 0; j < 1024; j += 4) {        // 7 bytes -> 4 values per pass (256 passes, 1792 bytes read)
+        pk[j]   = ((uint32_t)m[i] | (((uint32_t)m[i+1] & 0x3F) << 8));                                          // byte 0 | low 6 bits of byte 1
+        pk[j+1] = (((uint32_t)m[i+1] >> 6) | ((uint32_t)m[i+2] << 2) | (((uint32_t)m[i+3] & 0x0F) << 10));      // top 2 bits of byte 1 | byte 2 | low 4 bits of byte 3
+        pk[j+2] = (((uint32_t)m[i+3] >> 4) | ((uint32_t)m[i+4] << 4) | (((uint32_t)m[i+5] & 0x03) << 12));      // top 4 bits of byte 3 | byte 4 | low 2 bits of byte 5
+        pk[j+3] = (((uint32_t)m[i+5] >> 2) | ((uint32_t)m[i+6] << 6));                                          // top 6 bits of byte 5 | byte 6
+        i += 7;
+    }
+    
+#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
+    decode_asm(m, pk);   // presumably consumes m[0..1791] like the generic loop - verify against the assembly
+    // No offset bookkeeping is needed here: i is reset below and the rvec loop reads m at the fixed offset 1792.
+#endif
+    
+    i = 0;   // from here on i indexes rvec
+    for (j = 0; j < 1024/4; j++) {   // each input byte holds four 2-bit entries, lowest bits first
+        rvec[i]   = (uint32_t)(m[1792+j] & 0x03);          // bits 0-1
+        rvec[i+1] = (uint32_t)((m[1792+j] >> 2) & 0x03);   // bits 2-3
+        rvec[i+2] = (uint32_t)((m[1792+j] >> 4) & 0x03);   // bits 4-5
+        rvec[i+3] = (uint32_t)(m[1792+j] >> 6);            // bits 6-7
+        i += 4;
+    }
+}
+
+
+static __inline uint32_t Abs(int32_t value)
+{ // Compute absolute value without branching
+    uint32_t mask;                                // all ones when value is negative, zero otherwise
+
+    mask = (uint32_t)(value >> 31);               // relies on >> of a negative int32_t replicating the sign bit (implementation-defined in C)
+    return ((mask ^ value) - mask);               // negative: (~value) + 1 == -value; non-negative: value unchanged
+}
+
+
+CRYPTO_MSRLN_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction)
+{ // Reconciliation helper: writes 1024 two-bit reconciliation values for x into rvec; seed, nonce and StreamOutputFunction are unused
+    (void)seed; (void)nonce; (void)StreamOutputFunction;
+    unsigned int i, j, norm;
+    unsigned char bit, random_bits[32];
+    uint32_t v0[4], v1[4];
+    CRYPTO_MSRLN_STATUS Status = randombytes(random_bits, 32);    // 256 random bits, one per index i below
+    if (Status != CRYPTO_MSRLN_SUCCESS) {                          // never compute on an unfilled buffer; the status is returned to the caller
+        goto cleanup;
+    }
+#if defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT)         
+    helprec_asm(x, rvec, random_bits);
+#else   
+    for (i = 0; i < 256; i++) {
+        bit = 1 & (random_bits[i >> 3] >> (i & 0x07));             // bit i of the random buffer
+        rvec[i]     = (x[i]     << 1) - bit;  
+        rvec[i+256] = (x[i+256] << 1) - bit;
+        rvec[i+512] = (x[i+512] << 1) - bit;
+        rvec[i+768] = (x[i+768] << 1) - bit; 
+        // v0[j] / v1[j] count how many of their thresholds rvec[i+256*j] has reached: each ">> 31" yields 1 when the unsigned subtraction wrapped.
+        norm = 0;
+        v0[0] = 4; v0[1] = 4; v0[2] = 4; v0[3] = 4;
+        v1[0] = 3; v1[1] = 3; v1[2] = 3; v1[3] = 3; 
+        for (j = 0; j < 4; j++) {
+            v0[j] -= (rvec[i+256*j] - PARAMETER_Q4 ) >> 31;
+            v0[j] -= (rvec[i+256*j] - PARAMETER_3Q4) >> 31;
+            v0[j] -= (rvec[i+256*j] - PARAMETER_5Q4) >> 31;
+            v0[j] -= (rvec[i+256*j] - PARAMETER_7Q4) >> 31;
+            v1[j] -= (rvec[i+256*j] - PARAMETER_Q2 ) >> 31;
+            v1[j] -= (rvec[i+256*j] - PARAMETER_Q  ) >> 31;
+            v1[j] -= (rvec[i+256*j] - PARAMETER_3Q2) >> 31;
+            norm += Abs(2*rvec[i+256*j] - PARAMETER_Q*v0[j]);
+        }
+        // Branch-free select: keep v0 when norm < q, otherwise take v1.
+        norm = (uint32_t)((int32_t)(norm - PARAMETER_Q) >> 31);    // If norm < q then norm = 0xff...ff, else norm = 0
+        v0[0] = (norm & (v0[0] ^ v1[0])) ^ v1[0];
+        v0[1] = (norm & (v0[1] ^ v1[1])) ^ v1[1];
+        v0[2] = (norm & (v0[2] ^ v1[2])) ^ v1[2];
+        v0[3] = (norm & (v0[3] ^ v1[3])) ^ v1[3];
+        rvec[i]     = (v0[0] - v0[3]) & 0x03;
+        rvec[i+256] = (v0[1] - v0[3]) & 0x03;
+        rvec[i+512] = (v0[2] - v0[3]) & 0x03;
+        rvec[i+768] = ((v0[3] << 1) + (1 & ~norm)) & 0x03;         // low bit records which of v0 / v1 was selected
+    }
+#endif
+cleanup:
+    clear_words((void*)random_bits, NBYTES_TO_NWORDS(32));         // do not leave the random bits on the stack
+    return Status;
+}
+
+
+static __inline uint32_t LDDecode(int32_t* t)
+{ // Low-density decoding: maps the four values t[0..3] to a single bit (returns 0 or 1)
+    unsigned int i, norm = 0;
+    uint32_t mask1, mask2, value;
+    int32_t cneg = -8*PARAMETER_Q;
+    // norm sums |t[i]| after every t[i] with magnitude above 4*PARAMETER_Q has been moved by 8*PARAMETER_Q towards zero.
+    for (i = 0; i < 4; i++) { 
+        mask1 = t[i] >> 31;                                    // If t[i] < 0 then mask1 = 0xff...ff, else mask1 = 0
+        mask2 = (4*PARAMETER_Q - (int32_t)Abs(t[i])) >> 31;    // If Abs(t[i]) <= 4*PARAMETER_Q then mask2 = 0, else mask2 = 0xff...ff
+        // value is +8*PARAMETER_Q for negative t[i] and -8*PARAMETER_Q otherwise; it is only added when mask2 is set.
+        value = ((mask1 & (8*PARAMETER_Q ^ cneg)) ^ cneg);
+        norm += Abs(t[i] + (mask2 & value));
+    }
+    // The subtraction below is unsigned: bit 31 of the difference is set when norm exceeds 8*PARAMETER_Q.
+    return ((8*PARAMETER_Q - norm) >> 31) ^ 1;                 // If norm <= 8*PARAMETER_Q then return 1, else return 0
+}
+
+
+void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key)               
+{ // Reconciliation: derives the 256-bit key (32 bytes) from x and the reconciliation vector rvec
+  // NOTE(review): if neither implementation macro below is defined, the body is empty and key is left unwritten - confirm one is always defined.
+#if defined(GENERIC_IMPLEMENTATION)
+    unsigned int i;
+    uint32_t t[4];
+
+    for (i = 0; i < 32; i++) {
+        key[i] = 0;                                                // key bits are OR-ed in below, so start from all zeros
+    }
+    for (i = 0; i < 256; i++) {        
+        t[0] = 8*x[i]     - (2*rvec[i] + rvec[i+768]) * PARAMETER_Q;
+        t[1] = 8*x[i+256] - (2*rvec[i+256] + rvec[i+768]) * PARAMETER_Q;
+        t[2] = 8*x[i+512] - (2*rvec[i+512] + rvec[i+768]) * PARAMETER_Q;
+        t[3] = 8*x[i+768] - (rvec[i+768]) * PARAMETER_Q;
+        // t holds unsigned wrap-around results that LDDecode reinterprets as signed; bit i goes to key[i/8], position i%8.
+        key[i >> 3] |= (unsigned char)LDDecode((int32_t*)t) << (i & 0x07);
+    }
+    
+#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
+    rec_asm(x, rvec, key);
+#endif
+}
+
+
+CRYPTO_MSRLN_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction)
+{ // Error sampling: fills e[0..PARAMETER_N-1] from 3*PARAMETER_N random bytes; seed, nonce and StreamOutputFunction are unused
+    (void) seed; (void) nonce; (void) StreamOutputFunction;
+    unsigned char stream[3 * PARAMETER_N];
+    uint32_t *pstream = (uint32_t *) &stream;                      // NOTE(review): reads the byte buffer through uint32_t* - relies on suitable alignment of stream
+    uint32_t acc1, acc2, temp;
+    uint8_t *pacc1 = (uint8_t *) &acc1, *pacc2 = (uint8_t *) &acc2;    // byte views of the four 8-bit counters packed in acc1 / acc2
+    unsigned int i, j;
+    CRYPTO_MSRLN_STATUS Status = randombytes(stream, 3 * PARAMETER_N);
+    if (Status != CRYPTO_MSRLN_SUCCESS) { goto cleanup; }          // never sample from an unfilled buffer; callers check the returned status
+#if defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT)
+    error_sampling_asm(stream, e);
+#else
+    for (i = 0; i < PARAMETER_N / 4; i++) {
+        acc1 = 0;
+        acc2 = 0;
+        for (j = 0; j < 8; j++) {                                  // per byte lane: add up all 8 bits of one random byte
+            acc1 += (pstream[i] >> j) & 0x01010101;
+            acc2 += (pstream[i + PARAMETER_N / 4] >> j) & 0x01010101;
+        }
+        for (j = 0; j < 4; j++) {                                  // third stream word: low nibbles go to acc1, high nibbles to acc2
+            temp = pstream[i + 2 * PARAMETER_N / 4] >> j;
+            acc1 += temp & 0x01010101;
+            acc2 += (temp >> 4) & 0x01010101;
+        }
+        e[2 * i] = pacc1[0] - pacc1[1];                            // each lane counts 12 bits, so every difference lies in [-12, 12]
+        e[2 * i + 1] = pacc1[2] - pacc1[3];
+        e[2 * i + PARAMETER_N / 2] = pacc2[0] - pacc2[1];
+        e[2 * i + PARAMETER_N / 2 + 1] = pacc2[2] - pacc2[3];
+    }
+#endif
+cleanup:
+    clear_words((void*)stream, NBYTES_TO_NWORDS(3 * PARAMETER_N)); // the stream fully determines e, so wipe it before returning
+    return Status;
+}
+
+
+CRYPTO_MSRLN_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction)
+{ // Generation of parameter a: fills a[0..PARAMETER_N-1] with values below PARAMETER_Q derived from seed; always returns CRYPTO_MSRLN_SUCCESS
+    (void)ExtendableOutputFunction;                                // the callback is ignored; the shake128_* helpers are called directly
+    unsigned int pos = 0, ctr = 0;                                 // pos: read offset in buf, ctr: number of entries of a written so far
+    uint16_t val;
+    unsigned int nblocks = 16;
+    uint8_t buf[SHAKE128_RATE * 16]; // was * nblocks, but VS doesn't like this buf init
+    // Initial fill is 16 blocks; presumably one block is SHAKE128_RATE bytes - confirm against shake128_squeezeblocks.
+
+    uint64_t state[SHA3_STATESIZE] = {0};
+    shake128_absorb(state, seed, SEED_BYTES);
+    shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
+
+    /*
+     * Rejection sampling of the PARAMETER_N entries of a:
+     *   - each candidate is two consecutive bytes of buf, combined low byte first
+     *     and masked to 14 bits (0x3fff);
+     *   - a candidate is stored only if it is below PARAMETER_Q, otherwise it is
+     *     dropped and the next byte pair is tried;
+     *   - once the valid part of buf is used up, one further block is squeezed into
+     *     the start of buf and reading restarts at offset 0 (nblocks becomes 1, so
+     *     only that single block counts as valid from then on).
+     */
+    while (ctr < PARAMETER_N) {
+        val = (buf[pos] | ((uint16_t) buf[pos + 1] << 8)) & 0x3fff;
+        if (val < PARAMETER_Q) {
+            a[ctr++] = val;
+        }
+        pos += 2;
+        if (pos > SHAKE128_RATE * nblocks - 2) {
+            nblocks = 1;
+          shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
+            // state is carried over between calls, so this continues the same output stream (presumably - confirm in the shake128 helpers)
+            pos = 0;
+        }
+    }
+
+    return CRYPTO_MSRLN_SUCCESS;    
+}
+
+
+CRYPTO_MSRLN_STATUS MSRLN_KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto)
+{ // Alice's key generation  
+  // It produces a private key SecretKeyA and computes the public key PublicKeyA.
+  // Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
+  //          the public key PublicKeyA that occupies 1824 bytes
+  // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). Returns the first non-success status encountered, else CRYPTO_MSRLN_SUCCESS.
+    uint32_t a[PARAMETER_N];
+    int32_t e[PARAMETER_N];
+    unsigned char seed[SEED_BYTES];
+    unsigned char error_seed[ERROR_SEED_BYTES];                    // never filled in here; only handed to get_error() and wiped at cleanup
+    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_ERROR_UNKNOWN;
+    // Draw the seed that generate_a() expands into a and that encode_A() appends to the public key.
+    Status = randombytes( seed, SEED_BYTES);
+    // This failure path returns directly: e and error_seed have not been written yet.
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        return Status;
+    }   
+    // Only the ExtendableOutputFunction and StreamOutputFunction members of pLatticeCrypto are read below.
+    Status = generate_a(a, seed, pLatticeCrypto->ExtendableOutputFunction);
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }
+    // Sample the secret key and the error term.
+    Status = get_error(SecretKeyA, error_seed, 0, pLatticeCrypto->StreamOutputFunction);  
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }
+    Status = get_error(e, error_seed, 1, pLatticeCrypto->StreamOutputFunction);   
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }
+    NTT_CT_std2rev_12289(SecretKeyA, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);     // forward NTT, in place
+    NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
+    smul(e, 3, PARAMETER_N);
+    // a is overwritten in place; presumably a = a*SecretKeyA + e element-wise - confirm against pmuladd.
+    pmuladd((int32_t*)a, SecretKeyA, e, (int32_t*)a, PARAMETER_N); 
+    correction((int32_t*)a, PARAMETER_Q, PARAMETER_N);
+    encode_A(a, seed, PublicKeyA);
+    // NOTE(review): SecretKeyA is not cleared on the error paths above, although it may already hold sampled data.
+cleanup:
+    clear_words((void*)e, NBYTES_TO_NWORDS(4*PARAMETER_N));
+    clear_words((void*)error_seed, NBYTES_TO_NWORDS(ERROR_SEED_BYTES));
+
+    return Status;
+}
+
+
+CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto)
+{ // Bob's key generation and shared secret computation  
+  // It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes 
+  // the shared secret SharedSecretB.
+  // Input:   Alice's public key PublicKeyA that consists of 1824 bytes
+  // Outputs: the public key PublicKeyB that occupies 2048 bytes.
+  //          the 256-bit shared secret SharedSecretB.
+  // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). Returns the first non-success status encountered, else CRYPTO_MSRLN_SUCCESS.
+    uint32_t pk_A[PARAMETER_N], a[PARAMETER_N], v[PARAMETER_N], r[PARAMETER_N];
+    int32_t sk_B[PARAMETER_N], e[PARAMETER_N];
+    unsigned char seed[SEED_BYTES], error_seed[ERROR_SEED_BYTES];  // error_seed is never filled in here; only handed to get_error()/HelpRec()
+    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_ERROR_UNKNOWN;
+    // Split Alice's message into her public values (pk_A) and the seed from which a is regenerated.
+    decode_A(PublicKeyA, pk_A, seed);
+
+    Status = generate_a(a, seed, pLatticeCrypto->ExtendableOutputFunction);
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }
+    // Sample Bob's secret sk_B and a first error term e.
+    Status = get_error(sk_B, error_seed, 0, pLatticeCrypto->StreamOutputFunction);  
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }
+    Status = get_error(e, error_seed, 1, pLatticeCrypto->StreamOutputFunction);
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }   
+    NTT_CT_std2rev_12289(sk_B, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);           // forward NTT, in place
+    NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
+    smul(e, 3, PARAMETER_N);
+    // a is overwritten in place and is what encode_B() later packs into PublicKeyB.
+    pmuladd((int32_t*)a, sk_B, e, (int32_t*)a, PARAMETER_N); 
+    correction((int32_t*)a, PARAMETER_Q, PARAMETER_N);
+    // A second, fresh error sample (scaled by 81) enters v, which is computed from pk_A and sk_B.
+    Status = get_error(e, error_seed, 2, pLatticeCrypto->StreamOutputFunction);  
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }   
+    NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
+    smul(e, 81, PARAMETER_N);
+    
+    pmuladd((int32_t*)pk_A, sk_B, e, (int32_t*)v, PARAMETER_N);    
+    INTT_GS_rev2std_12289((int32_t*)v, MSRLN_omegainv_rev_ntt1024_12289, MSRLN_omegainv10N_rev_ntt1024_12289, MSRLN_Ninv11_ntt1024_12289, PARAMETER_N);
+    two_reduce12289((int32_t*)v, PARAMETER_N);
+#if defined(GENERIC_IMPLEMENTATION)
+    correction((int32_t*)v, PARAMETER_Q, PARAMETER_N); 
+#endif
+    // r is the reconciliation vector computed from v; the shared secret is then derived from v and r.
+    Status = HelpRec(v, r, error_seed, 3, pLatticeCrypto->StreamOutputFunction); 
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }   
+    Rec(v, r, SharedSecretB);
+    encode_B(a, r, PublicKeyB);
+    // Error paths land here too, possibly before e, v or r were written; pk_A and seed (copied from PublicKeyA) are not wiped.
+cleanup:
+    clear_words((void*)sk_B, NBYTES_TO_NWORDS(4*PARAMETER_N));
+    clear_words((void*)e, NBYTES_TO_NWORDS(4*PARAMETER_N));
+    clear_words((void*)error_seed, NBYTES_TO_NWORDS(ERROR_SEED_BYTES));
+    clear_words((void*)a, NBYTES_TO_NWORDS(4*PARAMETER_N));
+    clear_words((void*)v, NBYTES_TO_NWORDS(4*PARAMETER_N));
+    clear_words((void*)r, NBYTES_TO_NWORDS(4*PARAMETER_N));
+
+    return Status;
+}
+
+
+CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA)
+{ // Alice's shared secret computation  
+  // It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA.
+  // Inputs: Bob's public key PublicKeyB that consists of 2048 bytes
+  //         the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
+  // Output: the 256-bit shared secret SharedSecretA.
+  // This function takes no pLatticeCrypto argument and always returns CRYPTO_MSRLN_SUCCESS.
+    uint32_t u[PARAMETER_N], r[PARAMETER_N];
+    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_SUCCESS;
+    // Unpack Bob's message: u receives the 1024 packed values, r the 1024 two-bit reconciliation values.
+    decode_B(PublicKeyB, u, r);
+    // u is overwritten in place (presumably the element-wise product with SecretKeyA - confirm against pmul), then taken through the inverse NTT.
+    pmul(SecretKeyA, (int32_t*)u, (int32_t*)u, PARAMETER_N);       
+    INTT_GS_rev2std_12289((int32_t*)u, MSRLN_omegainv_rev_ntt1024_12289, MSRLN_omegainv10N_rev_ntt1024_12289, MSRLN_Ninv11_ntt1024_12289, PARAMETER_N);
+    two_reduce12289((int32_t*)u, PARAMETER_N);
+#if defined(GENERIC_IMPLEMENTATION)
+    correction((int32_t*)u, PARAMETER_Q, PARAMETER_N); 
+#endif
+    // Derive the 32-byte shared secret from u and the reconciliation vector r.
+    Rec(u, r, SharedSecretA);
+    
+// Cleanup: wipe the intermediate values from the stack
+    clear_words((void*)u, NBYTES_TO_NWORDS(4*PARAMETER_N));
+    clear_words((void*)r, NBYTES_TO_NWORDS(4*PARAMETER_N));
+
+    return Status;
+}
diff --git a/dap-sdk/crypto/src/msrln/makefile b/dap-sdk/crypto/src/msrln/makefile
index ab4cb800cc87091f5b3e39da35b22fca349bbffb..d017a0e2bb8c6dbcecaf109864aa3870fa6cdaea 100755
--- a/dap-sdk/crypto/src/msrln/makefile
+++ b/dap-sdk/crypto/src/msrln/makefile
@@ -1,94 +1,94 @@
-####  Makefile for compilation on Linux  ####
-
-OPT=-O3     # Optimization option by default
-
-ifeq "$(CC)" "gcc"
-    COMPILER=gcc
-else ifeq "$(CC)" "clang"
-    COMPILER=clang
-endif
-
-ifeq "$(ARCH)" "x64"
-    ARCHITECTURE=_AMD64_
-else ifeq "$(ARCH)" "x86"
-    ARCHITECTURE=_X86_
-else ifeq "$(ARCH)" "ARM"
-    ARCHITECTURE=_ARM_
-endif
-
-ADDITIONAL_SETTINGS=
-ifeq "$(SET)" "EXTENDED"
-    ADDITIONAL_SETTINGS=-fwrapv -fomit-frame-pointer -march=native
-endif
-
-ifeq "$(ASM)" "TRUE"
-    USE_ASM=-D _ASM_
-endif
-
-ifeq "$(GENERIC)" "TRUE"
-    USE_GENERIC=-D _GENERIC_
-endif
-
-ifeq "$(AVX2)" "TRUE"
-    USE_AVX2=-D _AVX2_
-    SIMD=-mavx2
-endif
-
-ifeq "$(ARCH)" "ARM"
-    ARM_SETTING=-lrt
-endif
-
-cc=$(COMPILER)
-CFLAGS=-c $(OPT) $(ADDITIONAL_SETTINGS) $(SIMD) -D $(ARCHITECTURE) -D __LINUX__ $(USE_AVX2) $(USE_ASM) $(USE_GENERIC)
-LDFLAGS=
-ifeq "$(GENERIC)" "TRUE"
-    OTHER_OBJECTS=ntt.o
-else
-ifeq "$(ASM)" "TRUE"
-    OTHER_OBJECTS=ntt_x64.o consts.o
-    ASM_OBJECTS=ntt_x64_asm.o error_asm.o
-endif 
-endif
-OBJECTS=kex.o random.o ntt_constants.o $(ASM_OBJECTS) $(OTHER_OBJECTS)
-OBJECTS_TEST=tests.o test_extras.o $(OBJECTS)
-OBJECTS_ALL=$(OBJECTS) $(OBJECTS_TEST)
-
-test: $(OBJECTS_TEST)
-	$(CC) -o test $(OBJECTS_TEST) $(ARM_SETTING)
-
-kex.o: kex.c LatticeCrypto_priv.h
-	$(CC) $(CFLAGS) kex.c
-
-random.o: random.c LatticeCrypto_priv.h
-	$(CC) $(CFLAGS) random.c
-
-ntt_constants.o: ntt_constants.c LatticeCrypto_priv.h
-	$(CC) $(CFLAGS) ntt_constants.c
-    
-ifeq "$(GENERIC)" "TRUE"
-    ntt.o: generic/ntt.c LatticeCrypto_priv.h
-	    $(CC) $(CFLAGS) generic/ntt.c 
-else   
-ifeq "$(ASM)" "TRUE"
-    ntt_x64.o: AMD64/ntt_x64.c
-	    $(CC) $(CFLAGS) AMD64/ntt_x64.c
-    ntt_x64_asm.o: AMD64/ntt_x64_asm.S
-	    $(CC) $(CFLAGS) AMD64/ntt_x64_asm.S
-    error_asm.o: AMD64/error_asm.S
-	    $(CC) $(CFLAGS) AMD64/error_asm.S
-    consts.o: AMD64/consts.c
-	    $(CC) $(CFLAGS) AMD64/consts.c
-endif
-endif
-
-test_extras.o: tests/test_extras.c tests/test_extras.h LatticeCrypto_priv.h
-	$(CC) $(CFLAGS) tests/test_extras.c
-
-tests.o: tests/tests.c LatticeCrypto_priv.h
-	$(CC) $(CFLAGS) tests/tests.c
-
-.PHONY: clean
-
-clean:
-	rm -f test ntt.o ntt_x64.o ntt_x64_asm.o error_asm.o consts.o $(OBJECTS_ALL)
-
+####  Makefile for compilation on Linux  ####
+# Command-line knobs read below: CC, ARCH, SET, ASM, GENERIC and AVX2.
+OPT=-O3     # Optimization option by default
+# COMPILER is only assigned when CC is exactly "gcc" or "clang".
+ifeq "$(CC)" "gcc"
+    COMPILER=gcc
+else ifeq "$(CC)" "clang"
+    COMPILER=clang
+endif
+# ARCH=x64, x86 or ARM picks the architecture macro that CFLAGS passes with -D.
+ifeq "$(ARCH)" "x64"
+    ARCHITECTURE=_AMD64_
+else ifeq "$(ARCH)" "x86"
+    ARCHITECTURE=_X86_
+else ifeq "$(ARCH)" "ARM"
+    ARCHITECTURE=_ARM_
+endif
+# NOTE(review): for any other ARCH value ARCHITECTURE stays empty and CFLAGS below contains a bare "-D" - confirm ARCH is always passed.
+ADDITIONAL_SETTINGS=
+ifeq "$(SET)" "EXTENDED"
+    ADDITIONAL_SETTINGS=-fwrapv -fomit-frame-pointer -march=native
+endif
+# ASM, GENERIC and AVX2 each add one preprocessor define when set to TRUE; AVX2 also adds -mavx2.
+ifeq "$(ASM)" "TRUE"
+    USE_ASM=-D _ASM_
+endif
+
+ifeq "$(GENERIC)" "TRUE"
+    USE_GENERIC=-D _GENERIC_
+endif
+
+ifeq "$(AVX2)" "TRUE"
+    USE_AVX2=-D _AVX2_
+    SIMD=-mavx2
+endif
+# ARM builds add -lrt to the link line of the test program.
+ifeq "$(ARCH)" "ARM"
+    ARM_SETTING=-lrt
+endif
+# NOTE(review): make variables are case-sensitive, so lowercase "cc" is distinct from CC; every rule below invokes $(CC).
+cc=$(COMPILER)
+CFLAGS=-c $(OPT) $(ADDITIONAL_SETTINGS) $(SIMD) -D $(ARCHITECTURE) -D __LINUX__ $(USE_AVX2) $(USE_ASM) $(USE_GENERIC)
+LDFLAGS=
+ifeq "$(GENERIC)" "TRUE"
+    OTHER_OBJECTS=ntt.o
+else
+ifeq "$(ASM)" "TRUE"
+    OTHER_OBJECTS=ntt_x64.o consts.o
+    ASM_OBJECTS=ntt_x64_asm.o error_asm.o
+endif 
+endif
+OBJECTS=kex.o random.o ntt_constants.o $(ASM_OBJECTS) $(OTHER_OBJECTS)
+OBJECTS_TEST=tests.o test_extras.o $(OBJECTS)
+OBJECTS_ALL=$(OBJECTS) $(OBJECTS_TEST)
+# Default goal: link the test program from the test objects plus the library objects.
+test: $(OBJECTS_TEST)
+	$(CC) -o test $(OBJECTS_TEST) $(ARM_SETTING)
+# Each object rule below compiles one source file; -c comes from CFLAGS.
+kex.o: kex.c LatticeCrypto_priv.h
+	$(CC) $(CFLAGS) kex.c
+
+random.o: random.c LatticeCrypto_priv.h
+	$(CC) $(CFLAGS) random.c
+
+ntt_constants.o: ntt_constants.c LatticeCrypto_priv.h
+	$(CC) $(CFLAGS) ntt_constants.c
+# ntt.o is built for GENERIC=TRUE; otherwise the AMD64 sources are built when ASM=TRUE.
+ifeq "$(GENERIC)" "TRUE"
+    ntt.o: generic/ntt.c LatticeCrypto_priv.h
+	    $(CC) $(CFLAGS) generic/ntt.c 
+else   
+ifeq "$(ASM)" "TRUE"
+    ntt_x64.o: AMD64/ntt_x64.c
+	    $(CC) $(CFLAGS) AMD64/ntt_x64.c
+    ntt_x64_asm.o: AMD64/ntt_x64_asm.S
+	    $(CC) $(CFLAGS) AMD64/ntt_x64_asm.S
+    error_asm.o: AMD64/error_asm.S
+	    $(CC) $(CFLAGS) AMD64/error_asm.S
+    consts.o: AMD64/consts.c
+	    $(CC) $(CFLAGS) AMD64/consts.c
+endif
+endif
+
+test_extras.o: tests/test_extras.c tests/test_extras.h LatticeCrypto_priv.h
+	$(CC) $(CFLAGS) tests/test_extras.c
+
+tests.o: tests/tests.c LatticeCrypto_priv.h
+	$(CC) $(CFLAGS) tests/tests.c
+
+.PHONY: clean
+
+clean:
+	rm -f test ntt.o ntt_x64.o ntt_x64_asm.o error_asm.o consts.o $(OBJECTS_ALL)
+
diff --git a/dap-sdk/crypto/src/msrln/msrln.h b/dap-sdk/crypto/src/msrln/msrln.h
index 5b54822603c037965a6d0f48455fe0cf72d19f29..b789d0209abf55f92350734dd41fd0f75f7a9316 100755
--- a/dap-sdk/crypto/src/msrln/msrln.h
+++ b/dap-sdk/crypto/src/msrln/msrln.h
@@ -1,136 +1,136 @@
-#ifndef __MSRLN_H__
-#define __MSRLN_H__
-
-
-// For C++
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stdint.h>
-#include <stdbool.h>
-#include <stddef.h>
-#include "dap_crypto_common.h"
-
-// Definitions of the error-handling type and error codes
-
-typedef enum {
-    CRYPTO_MSRLN_SUCCESS,                          // 0x00
-    CRYPTO_MSRLN_ERROR,                            // 0x01
-    CRYPTO_MSRLN_ERROR_DURING_TEST,                // 0x02
-    CRYPTO_MSRLN_ERROR_UNKNOWN,                    // 0x03
-    CRYPTO_MSRLN_ERROR_NOT_IMPLEMENTED,            // 0x04
-    CRYPTO_MSRLN_ERROR_NO_MEMORY,                  // 0x05
-    CRYPTO_MSRLN_ERROR_INVALID_PARAMETER,          // 0x06
-    CRYPTO_MSRLN_ERROR_SHARED_KEY,                 // 0x07
-    CRYPTO_MSRLN_ERROR_TOO_MANY_ITERATIONS,        // 0x08
-    CRYPTO_MSRLN_ERROR_END_OF_LIST
-} CRYPTO_MSRLN_STATUS;
-
-#define CRYPTO_STATUS_TYPE_SIZE (CRYPTO_MSRLN_ERROR_END_OF_LIST)
-
-
-// Definitions of the error messages
-// NOTE: they must match the error codes above
-
-#define CRYPTO_MSG_SUCCESS                                "CRYPTO_SUCCESS"
-#define CRYPTO_MSG_ERROR                                  "CRYPTO_ERROR"
-#define CRYPTO_MSG_ERROR_DURING_TEST                      "CRYPTO_ERROR_DURING_TEST"
-#define CRYPTO_MSG_ERROR_UNKNOWN                          "CRYPTO_ERROR_UNKNOWN"
-#define CRYPTO_MSG_ERROR_NOT_IMPLEMENTED                  "CRYPTO_ERROR_NOT_IMPLEMENTED"
-#define CRYPTO_MSG_ERROR_NO_MEMORY                        "CRYPTO_ERROR_NO_MEMORY"
-#define CRYPTO_MSG_ERROR_INVALID_PARAMETER                "CRYPTO_ERROR_INVALID_PARAMETER"
-#define CRYPTO_MSG_ERROR_SHARED_KEY                       "CRYPTO_ERROR_SHARED_KEY"
-#define CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS              "CRYPTO_ERROR_TOO_MANY_ITERATIONS"                                                            
-
-
-// Definition of type "RandomBytes" to implement callback function outputting "nbytes" of random values to "random_array"
-typedef CRYPTO_MSRLN_STATUS (*RandomBytes)(unsigned char* random_array, unsigned int nbytes);
-
-// Definition of type "ExtendableOutput" to implement callback function outputting 32-bit "array_ndigits" of values to "extended_array"
-typedef CRYPTO_MSRLN_STATUS (*ExtendableOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array);
-
-// Definition of type "StreamOutput" to implement callback function outputting 32-bit "array_ndigits" of values to "stream_array"
-typedef CRYPTO_MSRLN_STATUS (*StreamOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array);
-
-
-// Basic key-exchange constants  
-#define MSRLN_PKA_BYTES           1824      // Alice's public key size
-#define MSRLN_PKB_BYTES           2048      // Bob's public key size
-#define MSRLN_SHAREDKEY_BYTES     32        // Shared key size
-
-
-// This data struct is initialized during setup with user-provided functions
-typedef struct
-{
-    RandomBytes      RandomBytesFunction;               // Function providing random bytes
-    ExtendableOutput ExtendableOutputFunction;          // Extendable output function
-    StreamOutput     StreamOutputFunction;              // Stream cipher function
-} LatticeCryptoStruct, *PLatticeCryptoStruct;
-
-
-/******************** Function prototypes *******************/
-/*********************** Auxiliary API **********************/ 
-
-// Clear digits from memory. "nwords" indicates the number of digits to be zeroed.
-extern void clear_words(void* mem, digit_t nwords);
-CRYPTO_MSRLN_STATUS MSRLN_get_error(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array);
-CRYPTO_MSRLN_STATUS MSRLN_generate_a(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* a);
-
-// Output "nbytes" of random values.
-// It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array".
-// The caller is responsible for providing the "RandomBytesFunction" function passing random value as octets.
-CRYPTO_MSRLN_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction);
-
-// Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes".   
-// It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array".
-// The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits. 
-CRYPTO_MSRLN_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction);
-
-// Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes".  
-// It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array".
-// The caller is responsible for providing the "StreamOutputFunction" function passing values as octets.  
-CRYPTO_MSRLN_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction);
-
-// Dynamic allocation of memory for LatticeCrypto structure. It should be called before initialization with LatticeCrypto_initialize(). Returns NULL on error.
-PLatticeCryptoStruct LatticeCrypto_allocate(void); 
-
-// Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction.
-CRYPTO_MSRLN_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction);
-
-// Output error/success message for a given CRYPTO_STATUS
-const char* LatticeCrypto_get_error_message(CRYPTO_MSRLN_STATUS Status);
-
-/*********************** Key exchange API ***********************/ 
-
-// Alice's key generation 
-// It produces a private key SecretKeyA and computes the public key PublicKeyA.
-// Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
-//          the public key PublicKeyA that occupies 1824 bytes
-// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
-CRYPTO_MSRLN_STATUS MSRLN_KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto);
-
-// Bob's key generation and shared secret computation
-// It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes 
-// the shared secret SharedSecretB.
-// Input:   Alice's public key PublicKeyA that consists of 1824 bytes
-// Outputs: the public key PublicKeyB that occupies 2048 bytes.
-//          the 256-bit shared secret SharedSecretB.
-// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
-CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto);
-
-// Alice's shared secret computation 
-// It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA.
-// Inputs: Bob's public key PublicKeyB that consists of 2048 bytes
-//         the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
-// Output: the 256-bit shared secret SharedSecretA.
-// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
-CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-
-#endif
+#ifndef __MSRLN_H__
+#define __MSRLN_H__
+
+
+// For C++
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include "dap_crypto_common.h"
+
+// Definitions of the error-handling type and error codes
+
+typedef enum {                                     // Status codes of the MSRLN API; values are sequential starting at 0
+    CRYPTO_MSRLN_SUCCESS,                          // 0x00
+    CRYPTO_MSRLN_ERROR,                            // 0x01
+    CRYPTO_MSRLN_ERROR_DURING_TEST,                // 0x02
+    CRYPTO_MSRLN_ERROR_UNKNOWN,                    // 0x03
+    CRYPTO_MSRLN_ERROR_NOT_IMPLEMENTED,            // 0x04
+    CRYPTO_MSRLN_ERROR_NO_MEMORY,                  // 0x05
+    CRYPTO_MSRLN_ERROR_INVALID_PARAMETER,          // 0x06
+    CRYPTO_MSRLN_ERROR_SHARED_KEY,                 // 0x07
+    CRYPTO_MSRLN_ERROR_TOO_MANY_ITERATIONS,        // 0x08
+    CRYPTO_MSRLN_ERROR_END_OF_LIST                 // 0x09 - sentinel, not a real status; equals the number of codes above
+} CRYPTO_MSRLN_STATUS;
+
+#define CRYPTO_STATUS_TYPE_SIZE (CRYPTO_MSRLN_ERROR_END_OF_LIST)
+
+
+// Definitions of the error messages
+// NOTE: they must match the error codes above
+
+#define CRYPTO_MSG_SUCCESS                                "CRYPTO_SUCCESS"
+#define CRYPTO_MSG_ERROR                                  "CRYPTO_ERROR"
+#define CRYPTO_MSG_ERROR_DURING_TEST                      "CRYPTO_ERROR_DURING_TEST"
+#define CRYPTO_MSG_ERROR_UNKNOWN                          "CRYPTO_ERROR_UNKNOWN"
+#define CRYPTO_MSG_ERROR_NOT_IMPLEMENTED                  "CRYPTO_ERROR_NOT_IMPLEMENTED"
+#define CRYPTO_MSG_ERROR_NO_MEMORY                        "CRYPTO_ERROR_NO_MEMORY"
+#define CRYPTO_MSG_ERROR_INVALID_PARAMETER                "CRYPTO_ERROR_INVALID_PARAMETER"
+#define CRYPTO_MSG_ERROR_SHARED_KEY                       "CRYPTO_ERROR_SHARED_KEY"
+#define CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS              "CRYPTO_ERROR_TOO_MANY_ITERATIONS"                                                            
+
+
+// Definition of type "RandomBytes" to implement callback function outputting "nbytes" of random values to "random_array"
+typedef CRYPTO_MSRLN_STATUS (*RandomBytes)(unsigned char* random_array, unsigned int nbytes);
+
+// Definition of type "ExtendableOutput" to implement callback function outputting "array_ndigits" 32-bit values to "extended_array"
+typedef CRYPTO_MSRLN_STATUS (*ExtendableOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array);
+
+// Definition of type "StreamOutput" to implement callback function outputting "array_nbytes" bytes to "stream_array"
+typedef CRYPTO_MSRLN_STATUS (*StreamOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array);
+
+
+// Basic key-exchange constants  
+#define MSRLN_PKA_BYTES           1824      // Alice's public key size
+#define MSRLN_PKB_BYTES           2048      // Bob's public key size
+#define MSRLN_SHAREDKEY_BYTES     32        // Shared key size
+
+
+// This data struct is initialized during setup with user-provided functions
+typedef struct
+{                                                       // Bundle of the three caller-supplied callbacks
+    RandomBytes      RandomBytesFunction;               // Function providing random bytes
+    ExtendableOutput ExtendableOutputFunction;          // Extendable output function
+    StreamOutput     StreamOutputFunction;              // Stream cipher function
+} LatticeCryptoStruct, *PLatticeCryptoStruct;           // PLatticeCryptoStruct: pointer form taken by the key-exchange prototypes below
+
+
+/******************** Function prototypes *******************/
+/*********************** Auxiliary API **********************/ 
+
+// Clear digits from memory. "nwords" indicates the number of digits to be zeroed.
+extern void clear_words(void* mem, digit_t nwords);
+CRYPTO_MSRLN_STATUS MSRLN_get_error(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array);
+CRYPTO_MSRLN_STATUS MSRLN_generate_a(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* a);
+
+// Output "nbytes" of random values.
+// It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array".
+// The caller is responsible for providing the "RandomBytesFunction" function passing random value as octets.
+CRYPTO_MSRLN_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction);
+
+// Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes".   
+// It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array".
+// The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits. 
+CRYPTO_MSRLN_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction);
+
+// Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes".  
+// It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array".
+// The caller is responsible for providing the "StreamOutputFunction" function passing values as octets.  
+CRYPTO_MSRLN_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction);
+
+// Dynamic allocation of memory for LatticeCrypto structure. It should be called before initialization with LatticeCrypto_initialize(). Returns NULL on error.
+PLatticeCryptoStruct LatticeCrypto_allocate(void); 
+
+// Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction.
+CRYPTO_MSRLN_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction);
+
+// Output error/success message for a given CRYPTO_MSRLN_STATUS
+const char* LatticeCrypto_get_error_message(CRYPTO_MSRLN_STATUS Status);
+
+/*********************** Key exchange API ***********************/ 
+
+// Alice's key generation 
+// It produces a private key SecretKeyA and computes the public key PublicKeyA.
+// Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
+//          the public key PublicKeyA that occupies 1824 bytes
+// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
+CRYPTO_MSRLN_STATUS MSRLN_KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto);
+
+// Bob's key generation and shared secret computation
+// It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes 
+// the shared secret SharedSecretB.
+// Input:   Alice's public key PublicKeyA that consists of 1824 bytes
+// Outputs: the public key PublicKeyB that occupies 2048 bytes.
+//          the 256-bit shared secret SharedSecretB.
+// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
+CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto);
+
+// Alice's shared secret computation 
+// It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA.
+// Inputs: Bob's public key PublicKeyB that consists of 2048 bytes
+//         the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
+// Output: the 256-bit shared secret SharedSecretA.
+// Note: this function takes no pLatticeCrypto argument.
+CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/dap-sdk/crypto/src/msrln/msrln.pri b/dap-sdk/crypto/src/msrln/msrln.pri
index cd4600ef3ddb8f7a762dfb08774f631967d8282a..f42be38c96799ba08b86fc19a10e391489597fe1 100755
--- a/dap-sdk/crypto/src/msrln/msrln.pri
+++ b/dap-sdk/crypto/src/msrln/msrln.pri
@@ -1,6 +1,6 @@
-INCLUDEPATH += $$PWD
-
-HEADERS += $$PWD/msrln.h \
-
-SOURCES += $$PWD/kex.c \
-           $$PWD/random.c \
+INCLUDEPATH += $$PWD
+
+HEADERS += $$PWD/msrln.h \
+
+SOURCES += $$PWD/kex.c \
+           $$PWD/random.c \
diff --git a/dap-sdk/crypto/src/msrln/msrln_priv.h b/dap-sdk/crypto/src/msrln/msrln_priv.h
index fdaae50ad30677ee0f51f9c3044c5e63f27ce430..cc1f19801010e65a41ef323914f82db9d337fce3 100755
--- a/dap-sdk/crypto/src/msrln/msrln_priv.h
+++ b/dap-sdk/crypto/src/msrln/msrln_priv.h
@@ -1,114 +1,114 @@
-#ifndef __MSRLN_priv_H__
-#define __MSRLN_priv_H__
-
-// For C++
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "msrln.h"
-
-// Basic constants            
-#define PARAMETER_N         1024 
-#define PARAMETER_Q         12289 
-#define SEED_BYTES          256/8
-#define ERROR_SEED_BYTES    256/8
-#define NONCE_SEED_BYTES    256/8
-#define PARAMETER_Q4        3073 
-#define PARAMETER_3Q4       9217 
-#define PARAMETER_5Q4       15362 
-#define PARAMETER_7Q4       21506 
-#define PARAMETER_Q2        6145 
-#define PARAMETER_3Q2       18434
-    
-
-// Macro definitions
-
-#define NBITS_TO_NWORDS(nbits)      (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8))    // Conversion macro from number of bits to number of computer words
-#define NBYTES_TO_NWORDS(nbytes)    (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t))           // Conversion macro from number of bytes to number of computer words
-
-// Macro to avoid compiler warnings when detecting unreferenced parameters
-#ifndef UNREFERENCED_PARAMETER
-#define UNREFERENCED_PARAMETER(PAR) ((void)PAR)
-#endif
-
-
-/******************** Function prototypes *******************/
-/******************* Polynomial functions *******************/
-
-// Forward NTT
-void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N);
-void NTT_CT_std2rev_12289_asm(int32_t* a, const int32_t* psi_rev, unsigned int N);
-
-// Inverse NTT
-void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N);
-void INTT_GS_rev2std_12289_asm(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N);
-
-// Reduction modulo q
-int32_t reduce12289(int64_t a);
-
-// Two merged reductions modulo q
-int32_t reduce12289_2x(int64_t a);
-
-// Two consecutive reductions modulo q
-void two_reduce12289(int32_t* a, unsigned int N);
-void two_reduce12289_asm(int32_t* a, unsigned int N);
-
-// Correction modulo q
-void correction(int32_t* a, int32_t p, unsigned int N);
-
-// Component-wise multiplication
-void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N);
-void pmul_asm(int32_t* a, int32_t* b, int32_t* c, unsigned int N);
-
-// Component-wise multiplication and addition
-void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N);
-void pmuladd_asm(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N);
-
-// Component-wise multiplication with scalar
-void smul(int32_t* a, int32_t scalar, unsigned int N);
-
-/******************* Key exchange functions *******************/
-
-// Alice's message encoding
-void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m);
-
-// Alice's message decoding
-void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed); 
-    
-// Bob's message encoding
-void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m);
-    
-// Bob's message decoding
-void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec);
-
-// Partial message encoding/decoding (assembly optimized) 
-void encode_asm(const uint32_t* pk, unsigned char* m);
-void decode_asm(const unsigned char* m, uint32_t *pk);
-
-// Reconciliation helper
-CRYPTO_MSRLN_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction);
-
-// Partial reconciliation helper (assembly optimized)        
-void helprec_asm(const uint32_t* x, uint32_t* rvec, unsigned char* random_bits);
-
-// Reconciliation
-void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key);
-void rec_asm(const uint32_t *x, const uint32_t* rvec, unsigned char *key);
-
-// Error sampling
-CRYPTO_MSRLN_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction);
-
-// Partial error sampling (assembly optimized)        
-void error_sampling_asm(unsigned char* stream, int32_t* e);
-
-// Generation of parameter a
-CRYPTO_MSRLN_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-
-#endif
+#ifndef __MSRLN_priv_H__
+#define __MSRLN_priv_H__
+
+// For C++
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "msrln.h"
+
+// Basic constants            
+#define PARAMETER_N         1024 
+#define PARAMETER_Q         12289 
+#define SEED_BYTES          (256/8)    // 32 bytes; parenthesized so the macro expands safely inside larger expressions
+#define ERROR_SEED_BYTES    (256/8)    // 32 bytes
+#define NONCE_SEED_BYTES    (256/8)    // 32 bytes
+#define PARAMETER_Q4        3073 
+#define PARAMETER_3Q4       9217 
+#define PARAMETER_5Q4       15362 
+#define PARAMETER_7Q4       21506 
+#define PARAMETER_Q2        6145 
+#define PARAMETER_3Q2       18434
+    
+
+// Macro definitions
+
+#define NBITS_TO_NWORDS(nbits)      (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8))    // Conversion macro from number of bits to number of computer words
+#define NBYTES_TO_NWORDS(nbytes)    (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t))           // Conversion macro from number of bytes to number of computer words
+
+// Macro to avoid compiler warnings when detecting unreferenced parameters
+#ifndef UNREFERENCED_PARAMETER
+#define UNREFERENCED_PARAMETER(PAR) ((void)PAR)
+#endif
+
+
+/******************** Function prototypes *******************/
+/******************* Polynomial functions *******************/
+
+// Forward NTT
+void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N);
+void NTT_CT_std2rev_12289_asm(int32_t* a, const int32_t* psi_rev, unsigned int N);
+
+// Inverse NTT
+void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N);
+void INTT_GS_rev2std_12289_asm(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N);
+
+// Reduction modulo q
+int32_t reduce12289(int64_t a);
+
+// Two merged reductions modulo q
+int32_t reduce12289_2x(int64_t a);
+
+// Two consecutive reductions modulo q
+void two_reduce12289(int32_t* a, unsigned int N);
+void two_reduce12289_asm(int32_t* a, unsigned int N);
+
+// Correction modulo q
+void correction(int32_t* a, int32_t p, unsigned int N);
+
+// Component-wise multiplication
+void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N);
+void pmul_asm(int32_t* a, int32_t* b, int32_t* c, unsigned int N);
+
+// Component-wise multiplication and addition
+void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N);
+void pmuladd_asm(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N);
+
+// Component-wise multiplication with scalar
+void smul(int32_t* a, int32_t scalar, unsigned int N);
+
+/******************* Key exchange functions *******************/
+
+// Alice's message encoding
+void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m);
+
+// Alice's message decoding
+void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed); 
+    
+// Bob's message encoding
+void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m);
+    
+// Bob's message decoding
+void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec);
+
+// Partial message encoding/decoding (assembly optimized) 
+void encode_asm(const uint32_t* pk, unsigned char* m);
+void decode_asm(const unsigned char* m, uint32_t *pk);
+
+// Reconciliation helper
+CRYPTO_MSRLN_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction);
+
+// Partial reconciliation helper (assembly optimized)        
+void helprec_asm(const uint32_t* x, uint32_t* rvec, unsigned char* random_bits);
+
+// Reconciliation
+void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key);
+void rec_asm(const uint32_t *x, const uint32_t* rvec, unsigned char *key);
+
+// Error sampling
+CRYPTO_MSRLN_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction);
+
+// Partial error sampling (assembly optimized)        
+void error_sampling_asm(unsigned char* stream, int32_t* e);
+
+// Generation of parameter a
+CRYPTO_MSRLN_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/dap-sdk/crypto/src/msrln/random.c b/dap-sdk/crypto/src/msrln/random.c
index ab2b129f84160d3d334f2eb88da03b5b0c96e948..eaea6a1170d0203c6169621296fbe290e4a4011e 100755
--- a/dap-sdk/crypto/src/msrln/random.c
+++ b/dap-sdk/crypto/src/msrln/random.c
@@ -1,90 +1,90 @@
-#include "msrln_priv.h"
-
-//#include "KeccakHash.h"
-//#include "SimpleFIPS202.h"
-
-#define LOG_TAG "RANDOM"
-
-CRYPTO_MSRLN_STATUS MSRLN_generate_a(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* a)
-{
-    // Generation of parameter a
-    unsigned int pos = 0, ctr = 0;
-    uint16_t val;
-    unsigned int nblocks = 16;
-    uint8_t buf[SHAKE128_RATE * 16]; // was * nblocks, but VS doesn't like this buf init
-    //Keccak_HashInstance ks;
-
-    uint64_t state[SHA3_STATESIZE];
-    shake128_absorb(state, seed, seed_nbytes);
-    shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
-
-    /*Keccak_HashInitialize_SHAKE128(&ks);
-    Keccak_HashUpdate( &ks, seed, seed_nbytes * 8 );
-    Keccak_HashFinal( &ks, seed );
-    Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );*/
-
-    while (ctr < array_ndigits) {
-        val = (buf[pos] | ((uint16_t) buf[pos + 1] << 8)) & 0x3fff;
-        if (val < PARAMETER_Q) {
-            a[ctr++] = val;
-        }
-        pos += 2;
-        if (pos > SHAKE128_RATE * nblocks - 2) {
-            nblocks = 1;
-          shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
-//            Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );
-            pos = 0;
-        }
-    }
-    return CRYPTO_MSRLN_SUCCESS;
-}
-
-CRYPTO_MSRLN_STATUS MSRLN_get_error(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array)
-{
-    UNREFERENCED_PARAMETER(seed);
-    UNREFERENCED_PARAMETER(seed_nbytes);
-    UNREFERENCED_PARAMETER(nonce);
-    UNREFERENCED_PARAMETER(nonce_nbytes);
-
-    randombytes( stream_array, array_nbytes);
-
-    return CRYPTO_MSRLN_SUCCESS;
-}
-
-CRYPTO_MSRLN_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction)
-{ // Output "nbytes" of random values.
-  // It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array".
-  // The caller is responsible for providing the "RandomBytesFunction" function passing random values as octets.
-
-    if (random_array == NULL || RandomBytesFunction == NULL || nbytes == 0) {
-        return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER;
-    }    
-    
-    return (RandomBytesFunction)(random_array, nbytes);
-}
-
-
-CRYPTO_MSRLN_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction)
-{ // Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes".
-  // It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array".
-  // The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits.
-
-    if (seed == NULL || extended_array == NULL || ExtendableOutputFunction == NULL || seed_nbytes == 0 || array_ndigits == 0) {
-        return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER;
-    }    
-    
-    return (ExtendableOutputFunction)(seed, seed_nbytes, array_ndigits, extended_array);
-}
-
-
-CRYPTO_MSRLN_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction)
-{ // Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes".  
-  // It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array".
-  // The caller is responsible for providing the "StreamOutputFunction" function passing values as octets.
-
-    if (seed == NULL || stream_array == NULL || StreamOutputFunction == NULL || seed_nbytes == 0 || nonce_nbytes == 0 || array_nbytes == 0) {
-        return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER;
-    }    
-    
-    return (StreamOutputFunction)(seed, seed_nbytes, nonce, nonce_nbytes, array_nbytes, stream_array);
-}
+#include "msrln_priv.h"
+
+//#include "KeccakHash.h"
+//#include "SimpleFIPS202.h"
+
+#define LOG_TAG "RANDOM"
+
+CRYPTO_MSRLN_STATUS MSRLN_generate_a(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* a)
+{
+    // Generation of parameter a: stores array_ndigits values, each below PARAMETER_Q, into a[]; always returns CRYPTO_MSRLN_SUCCESS.
+    unsigned int pos = 0, ctr = 0; // pos: read offset into buf; ctr: number of values stored in a[] so far
+    uint16_t val;
+    unsigned int nblocks = 16; // block count passed to the first squeeze; set to 1 for every refill
+    uint8_t buf[SHAKE128_RATE * 16]; // was * nblocks, but VS doesn't like this buf init
+    //Keccak_HashInstance ks;
+    // Feed seed_nbytes bytes of seed through shake128_absorb(), then squeeze the initial nblocks blocks into buf.
+    uint64_t state[SHA3_STATESIZE];
+    shake128_absorb(state, seed, seed_nbytes);
+    shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
+    // The commented-out Keccak_Hash* calls below are a disabled alternative to the shake128_* calls above.
+    /*Keccak_HashInitialize_SHAKE128(&ks);
+    Keccak_HashUpdate( &ks, seed, seed_nbytes * 8 );
+    Keccak_HashFinal( &ks, seed );
+    Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );*/
+    // Rejection sampling: read 14-bit candidates from buf and keep only those below PARAMETER_Q.
+    while (ctr < array_ndigits) {
+        val = (buf[pos] | ((uint16_t) buf[pos + 1] << 8)) & 0x3fff; // two bytes, low byte first, masked to 14 bits
+        if (val < PARAMETER_Q) {
+            a[ctr++] = val;
+        }
+        pos += 2;
+        if (pos > SHAKE128_RATE * nblocks - 2) { // pos is past the last byte pair of the SHAKE128_RATE * nblocks bytes (presumably what the last squeeze wrote)
+            nblocks = 1;
+          shake128_squeezeblocks((unsigned char *) buf, nblocks, state); // refill from the start of buf with a single block
+//            Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );
+            pos = 0;
+        }
+    }
+    return CRYPTO_MSRLN_SUCCESS;
+}
+
+CRYPTO_MSRLN_STATUS MSRLN_get_error(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array)
+{
+    UNREFERENCED_PARAMETER(seed);
+    UNREFERENCED_PARAMETER(seed_nbytes);
+    UNREFERENCED_PARAMETER(nonce);
+    UNREFERENCED_PARAMETER(nonce_nbytes);
+    // seed/nonce are accepted but unused: stream_array is filled by randombytes(). NOTE(review): if randombytes() can report failure, it is not checked here.
+    randombytes( stream_array, array_nbytes);
+
+    return CRYPTO_MSRLN_SUCCESS;
+}
+
+CRYPTO_MSRLN_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction)
+{ // Output "nbytes" of random values.
+  // It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array".
+  // The caller is responsible for providing the "RandomBytesFunction" function passing random values as octets.
+
+    if (random_array == NULL || RandomBytesFunction == NULL || nbytes == 0) {
+        return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER;
+    }    
+    
+    return (RandomBytesFunction)(random_array, nbytes);
+}
+
+
+CRYPTO_MSRLN_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction)
+{ // Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes".
+  // It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array".
+  // The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits.
+
+    if (seed == NULL || extended_array == NULL || ExtendableOutputFunction == NULL || seed_nbytes == 0 || array_ndigits == 0) {
+        return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER;
+    }    
+    
+    return (ExtendableOutputFunction)(seed, seed_nbytes, array_ndigits, extended_array);
+}
+
+
+CRYPTO_MSRLN_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction)
+{ // Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes".  
+  // It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array".
+  // The caller is responsible for providing the "StreamOutputFunction" function passing values as octets.
+
+    if (seed == NULL || stream_array == NULL || StreamOutputFunction == NULL || seed_nbytes == 0 || nonce_nbytes == 0 || array_nbytes == 0) {
+        return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER;
+    }    
+    
+    return (StreamOutputFunction)(seed, seed_nbytes, nonce, nonce_nbytes, array_nbytes, stream_array);
+}
diff --git a/dap-sdk/crypto/src/sig_picnic/picnic_impl.c b/dap-sdk/crypto/src/sig_picnic/picnic_impl.c
index 2a0e25ffe8d35f7479ed5b1a189514bfbad34f7d..4e9b1a329b3baba18cef0c7bd180685ccb2977e3 100755
--- a/dap-sdk/crypto/src/sig_picnic/picnic_impl.c
+++ b/dap-sdk/crypto/src/sig_picnic/picnic_impl.c
@@ -1,998 +1,998 @@
-/*! @file picnic_impl.c
- *  @brief This is the main file of the signature scheme. All of the LowMC MPC
- *  code is here as well as lower-level versions of sign and verify that are
- *  called by the signature API.
- *
- *  This file is part of the reference implementation of the Picnic signature scheme.
- *  See the accompanying documentation for complete details.
- *
- *  The code is provided under the MIT license, see LICENSE for
- *  more details.
- *  SPDX-License-Identifier: MIT
- */
-
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#if defined (__WIN32)
-	#include <windows.h>
-	#include <bcrypt.h>
-#elif defined (__APPLE__)
-    #include "macos_specific_endian.h"
-#else
-    #include <endian.h>
-#endif
-
-#include "picnic_impl.h"
-#include "picnic.h"
-#include "platform.h"
-#include "lowmc_constants.h"
-#include "hash.h"
-#include "picnic_types.h"
-#include "dap_common.h"
-
-
-#define VIEW_OUTPUTS(i, j) viewOutputs[(i) * 3 + (j)]
-
-
-/* Helper functions */
-uint16_t toLittleEndian(uint16_t x)
-{
-#if defined(__WIN32)
-    #if BYTE_ORDER == LITTLE_ENDIAN
-		return x;
-	#else
-		return __builtin_bswap16(x);
-    #endif
-#else
-    return htole16(x);
-#endif
-}
-
-/* Get one bit from a byte array */
-uint8_t getBit(const uint8_t* array, uint32_t bitNumber)
-{
-    return (array[bitNumber / 8] >> (7 - (bitNumber % 8))) & 0x01;
-}
-
-/* Get one bit from a 32-bit int array */
-uint8_t getBitFromWordArray(const uint32_t* array, uint32_t bitNumber)
-{
-    return getBit((uint8_t*)array, bitNumber);
-}
-
-/* Set a specific bit in a byte array to a given value */
-void setBit(uint8_t* bytes, uint32_t bitNumber, uint8_t val)
-{
-    bytes[bitNumber / 8] = (bytes[bitNumber >> 3]
-                            & ~(1 << (7 - (bitNumber % 8)))) | (val << (7 - (bitNumber % 8)));
-}
-
-/* Set a specific bit in a byte array to a given value */
-void setBitInWordArray(uint32_t* array, uint32_t bitNumber, uint8_t val)
-{
-    setBit((uint8_t*)array, bitNumber, val);
-}
-
-static uint8_t parity(uint32_t* data, size_t len)
-{
-    uint32_t x = data[0];
-    size_t i;
-    for (i = 1; i < len; i++) {
-        x ^= data[i];
-    }
-
-    /* Compute parity of x using code from Section 5-2 of
-     * H.S. Warren, *Hacker's Delight*, Pearson Education, 2003.
-     * http://www.hackersdelight.org/hdcodetxt/parity.c.txt
-     */
-    uint32_t y = x ^ (x >> 1);
-    y ^= (y >> 2);
-    y ^= (y >> 4);
-    y ^= (y >> 8);
-    y ^= (y >> 16);
-    return y & 1;
-}
-
-uint32_t numBytes(uint32_t numBits)
-{
-    return (numBits == 0) ? 0 : ((numBits - 1) / 8 + 1);
-}
-
-static void xor_array(const uint32_t * in1, const uint32_t * in2, uint32_t * out, uint32_t numBytes)
-{
-    uint32_t i;
-    for (i = 0; i < numBytes; i++) {
-        out[i] = in1[i] ^ in2[i];
-    }
-}
-
-static void matrix_mul(
-    uint32_t* state,
-    const uint32_t* matrix,
-    uint32_t* output,
-    paramset_t* params)
-{
-    // Use temp to correctly handle the case when state = output
-    uint32_t prod[LOWMC_MAX_STATE_SIZE];
-    uint32_t temp[LOWMC_MAX_STATE_SIZE];
-
-    uint32_t i, j;
-    for (i = 0; i < params->stateSizeBits; i++) {
-        for (j = 0; j < params->stateSizeWords; j++) {
-            size_t index = i * params->stateSizeWords + j;
-            prod[j] = (state[j] & matrix[index]);
-        }
-        setBit((uint8_t*)temp, i, parity(&prod[0], params->stateSizeWords));
-
-    }
-    memcpy(output, &temp, params->stateSizeWords * sizeof(uint32_t));
-}
-
-static void substitution(uint32_t* state, paramset_t* params)
-{
-    uint32_t i;
-    for (i = 0; i < params->numSboxes * 3; i += 3) {
-        uint8_t a = getBitFromWordArray(state, i + 2);
-        uint8_t b = getBitFromWordArray(state, i + 1);
-        uint8_t c = getBitFromWordArray(state, i);
-
-        setBitInWordArray(state, i + 2, a ^ (b & c));
-        setBitInWordArray(state, i + 1, a ^ b ^ (a & c));
-        setBitInWordArray(state, i, a ^ b ^ c ^ (a & b));
-    }
-}
-
-void LowMCEnc(const uint32_t* plaintext, uint32_t* output, uint32_t* key, paramset_t* params)
-{
-    uint32_t roundKey[LOWMC_MAX_STATE_SIZE / sizeof(uint32_t)];
-
-    if (plaintext != output) {
-        /* output will hold the intermediate state */
-        memcpy(output, plaintext, params->stateSizeBytes);
-    }
-
-    matrix_mul(key, KMatrix(0, params), roundKey, params);
-    xor_array(output, roundKey, output, params->stateSizeWords);
-
-    uint32_t r;
-    for (r = 1; r <= params->numRounds; r++) {
-        matrix_mul(key, KMatrix(r, params), roundKey, params);
-        substitution(output, params);
-        matrix_mul(output, LMatrix(r - 1, params), output, params);
-        xor_array(output, RConstant(r - 1, params), output, params->stateSizeWords);
-        xor_array(output, roundKey, output, params->stateSizeWords);
-    }
-
-}
-
-bool createRandomTape(const uint8_t* seed, uint8_t* tape,
-                      uint32_t tapeLengthBytes, paramset_t* params)
-{
-    HashInstance ctx;
-
-    if (tapeLengthBytes < params->digestSizeBytes) {
-        return false;
-    }
-
-    /* Hash the seed and a constant, store the result in tape. */
-    HashInit(&ctx, params, HASH_PREFIX_2);
-    HashUpdate(&ctx, seed, params->seedSizeBytes);
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, tape, params->digestSizeBytes);
-
-    /* Expand the hashed seed and output length to create the tape. */
-    HashInit(&ctx, params, HASH_PREFIX_NONE);
-    HashUpdate(&ctx, tape, params->digestSizeBytes);
-    uint16_t outputBytesLE = toLittleEndian(tapeLengthBytes);
-    HashUpdate(&ctx, (uint8_t*)&outputBytesLE, sizeof(uint16_t));
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, tape, tapeLengthBytes);
-
-    return true;
-}
-
-void mpc_xor(uint32_t* state[3], uint32_t* in[3], uint32_t len, int players)
-{
-    uint8_t i;
-    for (i = 0; i < players; i++) {
-        xor_array(state[i], in[i], state[i], len);
-    }
-}
-
-/* Compute the XOR of in with the first state vectors. */
-void mpc_xor_constant(uint32_t* state[3], const uint32_t* in, uint32_t len)
-{
-    xor_array(state[0], in, state[0], len);
-}
-
-void mpc_xor_constant_verify(uint32_t* state[2], const uint32_t* in, uint32_t len, uint8_t challenge)
-{
-    /* During verify, where the first share is stored in state depends on the challenge */
-    if (challenge == 0) {
-        xor_array(state[0], in, state[0], len);
-    }
-    else if (challenge == 2) {
-        xor_array(state[1], in, state[1], len);
-    }
-}
-
-
-void Commit(const uint8_t* seed, const view_t view,
-            uint8_t* hash, paramset_t* params)
-{
-    HashInstance ctx;
-
-    /* Hash the seed, store result in `hash` */
-    HashInit(&ctx, params, HASH_PREFIX_4);
-    HashUpdate(&ctx, seed, params->seedSizeBytes);
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, hash, params->digestSizeBytes);
-
-    /* Compute H_0(H_4(seed), view) */
-    HashInit(&ctx, params, HASH_PREFIX_0);
-    HashUpdate(&ctx, hash, params->digestSizeBytes);
-    HashUpdate(&ctx, (uint8_t*)view.inputShare, params->stateSizeBytes);
-    HashUpdate(&ctx, (uint8_t*)view.communicatedBits, params->andSizeBytes);
-    HashUpdate(&ctx, (uint8_t*)view.outputShare, params->stateSizeBytes);
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, hash, params->digestSizeBytes);
-}
-
-/* This is the random "permuatation" function G for Unruh's transform */
-void G(uint8_t viewNumber, const uint8_t* seed, view_t* view, uint8_t* output, paramset_t* params)
-{
-    HashInstance ctx;
-    uint16_t outputBytes = params->seedSizeBytes + params->andSizeBytes;
-
-    /* Hash the seed with H_5, store digest in output */
-    HashInit(&ctx, params, HASH_PREFIX_5);
-    HashUpdate(&ctx, seed, params->seedSizeBytes);
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, output, params->digestSizeBytes);
-
-    /* Hash H_5(seed), the view, and the length */
-    HashInit(&ctx, params, HASH_PREFIX_NONE);
-    HashUpdate(&ctx, output, params->digestSizeBytes);
-    if (viewNumber == 2) {
-        HashUpdate(&ctx, (uint8_t*)view->inputShare, params->stateSizeBytes);
-        outputBytes += (uint16_t)params->stateSizeBytes;
-    }
-    HashUpdate(&ctx, view->communicatedBits, params->andSizeBytes);
-
-    uint16_t outputBytesLE = toLittleEndian(outputBytes);
-    HashUpdate(&ctx, (uint8_t*)&outputBytesLE, sizeof(uint16_t));
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, output, outputBytes);
-}
-
-void setChallenge(uint8_t* challenge, size_t round, uint8_t trit)
-{
-    /* challenge must have length numBytes(numZKBRounds*2)
-     * 0 <= index < numZKBRounds
-     * trit must be in {0,1,2} */
-    uint32_t roundU32 = (uint32_t)round;
-
-    setBit(challenge, 2 * roundU32, trit & 1);
-    setBit(challenge, 2 * roundU32 + 1, (trit >> 1) & 1);
-}
-
-uint8_t getChallenge(const uint8_t* challenge, size_t round)
-{
-    uint32_t roundU32 = (uint32_t)round;
-
-    return (getBit(challenge, 2 * roundU32 + 1) << 1) | getBit(challenge, 2 * roundU32);
-}
-
-void H3(const uint32_t* circuitOutput, const uint32_t* plaintext, uint32_t** viewOutputs,
-        commitments_t* as,
-        uint8_t* challengeBits, const uint8_t* message, size_t messageByteLength,
-        g_commitments_t* gs, paramset_t* params)
-{
-    uint8_t* hash = malloc(params->digestSizeBytes);
-
-    HashInstance ctx;
-
-    /* Depending on the number of rounds, we might not set part of the last
-     * byte, make sure it's always zero. */
-    challengeBits[numBytes(params->numZKBRounds * 2) - 1] = 0;
-
-    /* Hash input data */
-    HashInit(&ctx, params, HASH_PREFIX_1);
-
-    /* Hash the output share from each view */
-    uint32_t i;
-    int j;
-    for (i = 0; i < params->numZKBRounds; i++) {
-        for (j = 0; j < 3; j++) {
-            HashUpdate(&ctx, (uint8_t*)VIEW_OUTPUTS(i, j), params->stateSizeBytes);
-        }
-    }
-
-    /* Hash all the commitments C */
-    for (i = 0; i < params->numZKBRounds; i++) {
-        for (j = 0; j < 3; j++) {
-            HashUpdate(&ctx, as[i].hashes[j], params->digestSizeBytes);
-        }
-    }
-
-    /* Hash all the commitments G */
-    if (params->transform == TRANSFORM_UR) {
-        for (i = 0; i < params->numZKBRounds; i++) {
-            for (j = 0; j < 3; j++) {
-                size_t view3UnruhLength = (j == 2) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
-                HashUpdate(&ctx, gs[i].G[j], view3UnruhLength);
-            }
-        }
-    }
-
-    HashUpdate(&ctx, (uint8_t*)circuitOutput, params->stateSizeBytes);
-    HashUpdate(&ctx, (uint8_t*)plaintext, params->stateSizeBytes);
-    HashUpdate(&ctx, message, messageByteLength);
-
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, hash, params->digestSizeBytes);
-
-    /* Convert hash to a packed string of values in {0,1,2} */
-    size_t byte_count, round = 0;
-    while (1) {
-        for (byte_count = 0; byte_count < params->digestSizeBytes; byte_count++) {
-            uint8_t byte = hash[byte_count];
-            /* iterate over each pair of bits in the byte */
-            for (j = 0; j < 8; j += 2) {
-                uint8_t bitPair = ((byte >> (6 - j)) & 0x03);
-                if (bitPair < 3) {
-                    setChallenge(challengeBits, round, bitPair);
-                    round++;
-                    if (round == params->numZKBRounds) {
-                        goto done;
-                    }
-                }
-            }
-        }
-
-        /* We need more bits; hash set hash = H_1(hash) */
-        HashInit(&ctx, params, HASH_PREFIX_1);
-        HashUpdate(&ctx, hash, params->digestSizeBytes);
-        HashFinal(&ctx);
-        HashSqueeze(&ctx, hash, params->digestSizeBytes);
-    }
-
-done:
-
-    free(hash);
-    return;
-}
-
-/* Caller must allocate the first parameter */
-void prove(proof_t* proof, uint8_t challenge, seeds_t* seeds,
-           view_t views[3], commitments_t* commitments, g_commitments_t* gs, paramset_t* params)
-{
-    if (challenge == 0) {
-        memcpy(proof->seed1, seeds->seed0, params->seedSizeBytes);
-        memcpy(proof->seed2, seeds->seed1, params->seedSizeBytes);
-    }
-    else if (challenge == 1) {
-        memcpy(proof->seed1, seeds->seed1, params->seedSizeBytes);
-        memcpy(proof->seed2, seeds->seed2, params->seedSizeBytes);
-    }
-    else if (challenge == 2) {
-        memcpy(proof->seed1, seeds->seed2, params->seedSizeBytes);
-        memcpy(proof->seed2, seeds->seed0, params->seedSizeBytes);
-    }
-    else {
-        assert(!"Invalid challenge");
-    }
-
-    if (challenge == 1 || challenge == 2) {
-        memcpy(proof->inputShare, views[2].inputShare, params->stateSizeBytes);
-    }
-    memcpy(proof->communicatedBits, views[(challenge + 1) % 3].communicatedBits, params->andSizeBytes);
-
-    memcpy(proof->view3Commitment, commitments->hashes[(challenge + 2) % 3], params->digestSizeBytes);
-    if (params->transform == TRANSFORM_UR) {
-        size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
-        memcpy(proof->view3UnruhG, gs->G[(challenge + 2) % 3], view3UnruhLength);
-    }
-}
-
-void mpc_AND_verify(uint8_t in1[2], uint8_t in2[2], uint8_t out[2],
-                    randomTape_t* rand, view_t* view1, view_t* view2)
-{
-    uint8_t r[2] = { getBit(rand->tape[0], rand->pos), getBit(rand->tape[1], rand->pos) };
-
-    out[0] = (in1[0] & in2[1]) ^ (in1[1] & in2[0]) ^ (in1[0] & in2[0]) ^ r[0] ^ r[1];
-    setBit(view1->communicatedBits, rand->pos, out[0]);
-    out[1] = getBit(view2->communicatedBits, rand->pos);
-
-    (rand->pos)++;
-}
-
-void mpc_substitution_verify(uint32_t* state[2], randomTape_t* rand, view_t* view1,
-                             view_t* view2, paramset_t* params)
-{
-    uint32_t i;
-    for (i = 0; i < params->numSboxes * 3; i += 3) {
-
-        uint8_t a[2];
-        uint8_t b[2];
-        uint8_t c[2];
-
-        uint8_t j;
-        for (j = 0; j < 2; j++) {
-            a[j] = getBitFromWordArray(state[j], i + 2);
-            b[j] = getBitFromWordArray(state[j], i + 1);
-            c[j] = getBitFromWordArray(state[j], i);
-        }
-
-        uint8_t ab[2];
-        uint8_t bc[2];
-        uint8_t ca[2];
-
-        mpc_AND_verify(a, b, ab, rand, view1, view2);
-        mpc_AND_verify(b, c, bc, rand, view1, view2);
-        mpc_AND_verify(c, a, ca, rand, view1, view2);
-
-        for (j = 0; j < 2; j++) {
-            setBitInWordArray(state[j], i + 2, a[j] ^ (bc[j]));
-            setBitInWordArray(state[j], i + 1, a[j] ^ b[j] ^ (ca[j]));
-            setBitInWordArray(state[j], i, a[j] ^ b[j] ^ c[j] ^ (ab[j]));
-        }
-    }
-}
-
-void mpc_matrix_mul(uint32_t* state[3], const uint32_t* matrix,
-                    uint32_t* output[3], paramset_t* params, size_t players)
-{
-    uint32_t player;
-    for (player = 0; player < players; player++) {
-        matrix_mul(state[player], matrix, output[player], params);
-    }
-}
-
-void mpc_LowMC_verify(view_t* view1, view_t* view2,
-                      randomTape_t* tapes, uint32_t* tmp,
-                      const uint32_t* plaintext, paramset_t* params, uint8_t challenge)
-{
-    uint32_t* state[2];
-    uint32_t* keyShares[2];
-    uint32_t* roundKey[2];
-
-    roundKey[0] = tmp;
-    roundKey[1] = roundKey[0] + params->stateSizeWords;
-    state[0] = roundKey[1] + params->stateSizeWords;
-    state[1] = state[0] + params->stateSizeWords;
-
-    // initialize both roundkeys to 0. they are contingent
-    memset(roundKey[0], 0, 2 * params->stateSizeBytes);
-
-    uint32_t i, r;
-    for (i = 0; i < 2; i++) {
-        memset(state[i], 0x00, params->stateSizeBytes);
-    }
-    mpc_xor_constant_verify(state, plaintext, params->stateSizeWords, challenge);
-
-    keyShares[0] = view1->inputShare;
-    keyShares[1] = view2->inputShare;
-
-    mpc_matrix_mul(keyShares, KMatrix(0, params), roundKey, params, 2);
-    mpc_xor(state, roundKey, params->stateSizeWords, 2);
-
-    for (r = 1; r <= params->numRounds; ++r) {
-        mpc_matrix_mul(keyShares, KMatrix(r, params), roundKey, params, 2);
-        mpc_substitution_verify(state, tapes, view1, view2, params);
-        mpc_matrix_mul(state, LMatrix(r - 1, params), state, params, 2);
-        mpc_xor_constant_verify(state, RConstant(r - 1, params), params->stateSizeWords, challenge);
-        mpc_xor(state, roundKey, params->stateSizeWords, 2);
-    }
-
-    memcpy(view1->outputShare, state[0], params->stateSizeBytes);
-    memcpy(view2->outputShare, state[1], params->stateSizeBytes);
-}
-
-void verifyProof(const proof_t* proof, view_t* view1, view_t* view2,
-                 uint8_t challenge, uint8_t* tmp,
-                 const uint32_t* plaintext, randomTape_t* tape, paramset_t* params)
-{
-    memcpy(view2->communicatedBits, proof->communicatedBits, params->andSizeBytes);
-    tape->pos = 0;
-
-    bool status = false;
-    switch (challenge) {
-    case 0:
-        // in this case, both views' inputs are derivable from the input share
-
-        status = createRandomTape(proof->seed1, tmp, params->stateSizeBytes + params->andSizeBytes, params);
-        memcpy(view1->inputShare, tmp, params->stateSizeBytes);
-        memcpy(tape->tape[0], tmp + params->stateSizeBytes, params->andSizeBytes);
-        status = status && createRandomTape(proof->seed2, tmp, params->stateSizeBytes + params->andSizeBytes, params);
-        if (!status) {
-            break;
-        }
-        memcpy(view2->inputShare, tmp, params->stateSizeBytes);
-        memcpy(tape->tape[1], tmp + params->stateSizeBytes, params->andSizeBytes);
-        break;
-
-    case 1:
-        // in this case view2's input share was already given to us explicitly as
-        // it is not computable from the seed. We just need to compute view1's input from
-        // its seed
-        status = createRandomTape(proof->seed1, tmp, params->stateSizeBytes + params->andSizeBytes, params);
-        memcpy(view1->inputShare, tmp, params->stateSizeBytes);
-        memcpy(tape->tape[0], tmp + params->stateSizeBytes, params->andSizeBytes);
-        status = status && createRandomTape(proof->seed2, tape->tape[1], params->andSizeBytes, params);
-        if (!status) {
-            break;
-        }
-        memcpy(view2->inputShare, proof->inputShare, params->stateSizeBytes);
-        break;
-
-    case 2:
-        // in this case view1's input share was already given to us explicitly as
-        // it is not computable from the seed. We just need to compute view2's input from
-        // its seed
-        status = createRandomTape(proof->seed1, tape->tape[0], params->andSizeBytes, params);
-        memcpy(view1->inputShare, proof->inputShare, params->stateSizeBytes);
-        status = status && createRandomTape(proof->seed2, tmp, params->stateSizeBytes + params->andSizeBytes, params);
-        if (!status) {
-            break;
-        }
-        memcpy(view2->inputShare, tmp, params->stateSizeBytes);
-        memcpy(tape->tape[1], tmp + params->stateSizeBytes, params->andSizeBytes);
-        break;
-
-    default:
-        break;
-    }
-
-    mpc_LowMC_verify(view1, view2, tape, (uint32_t*)tmp, plaintext, params, challenge);
-}
-
-int verify(signature_t* sig, const uint32_t* pubKey, const uint32_t* plaintext,
-           const uint8_t* message, size_t messageByteLength, paramset_t* params)
-{
-    commitments_t* as = allocateCommitments(params);
-    g_commitments_t* gs = allocateGCommitments(params);
-
-    uint32_t** viewOutputs = malloc(params->numZKBRounds * 3 * sizeof(uint32_t*));
-    const proof_t* proofs = sig->proofs;
-
-    const uint8_t* received_challengebits = sig->challengeBits;
-    int status = EXIT_SUCCESS;
-    uint8_t* computed_challengebits = NULL;
-    uint32_t* view3Slab = NULL;
-
-    uint8_t* tmp = malloc(MAX(6 * params->stateSizeBytes, params->stateSizeBytes + params->andSizeBytes));
-
-    randomTape_t* tape = (randomTape_t*)malloc(sizeof(randomTape_t));
-
-    allocateRandomTape(tape, params);
-
-    view_t* view1s = malloc(params->numZKBRounds * sizeof(view_t));
-    view_t* view2s = malloc(params->numZKBRounds * sizeof(view_t));
-
-    /* Allocate a slab of memory for the 3rd view's output in each round */
-    view3Slab = malloc(params->stateSizeBytes * params->numZKBRounds);
-    uint32_t* view3Output = view3Slab;     /* pointer into the slab to the current 3rd view */
-
-    size_t i, j;
-    for (i = 0; i < params->numZKBRounds; i++) {
-        allocateView(&view1s[i], params);
-        allocateView(&view2s[i], params);
-
-        // last bits of communicatedBits may not be set so zero them
-        view1s[i].communicatedBits[params->andSizeBytes - 1] = 0;
-
-        verifyProof(&proofs[i], &view1s[i], &view2s[i],
-                    getChallenge(received_challengebits, i),
-                    tmp, plaintext, tape, params);
-
-        // create ordered array of commitments with order computed based on the challenge
-        // check commitments of the two opened views
-        uint8_t challenge = getChallenge(received_challengebits, i);
-        Commit(proofs[i].seed1, view1s[i], as[i].hashes[challenge], params);
-        Commit(proofs[i].seed2, view2s[i], as[i].hashes[(challenge + 1) % 3], params);
-        memcpy(as[i].hashes[(challenge + 2) % 3], proofs[i].view3Commitment, params->digestSizeBytes);
-
-        if (params->transform == TRANSFORM_UR) {
-            G(challenge, proofs[i].seed1, &view1s[i], gs[i].G[challenge], params);
-            G((challenge + 1) % 3, proofs[i].seed2, &view2s[i], gs[i].G[(challenge + 1) % 3], params);
-            size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
-            memcpy(gs[i].G[(challenge + 2) % 3], proofs[i].view3UnruhG, view3UnruhLength);
-        }
-
-        VIEW_OUTPUTS(i, challenge) = view1s[i].outputShare;
-        VIEW_OUTPUTS(i, (challenge + 1) % 3) = view2s[i].outputShare;
-        for (j = 0; j < params->stateSizeWords; j++) {
-            view3Output[j] = view1s[i].outputShare[j] ^ view2s[i].outputShare[j]
-                             ^ pubKey[j];
-        }
-        VIEW_OUTPUTS(i, (challenge + 2) % 3) = view3Output;
-        view3Output += params->stateSizeWords;
-    }
-
-    computed_challengebits = malloc(numBytes(2 * params->numZKBRounds));
-
-    H3(pubKey, plaintext, viewOutputs, as,
-       computed_challengebits, message, messageByteLength, gs, params);
-
-    if (computed_challengebits != NULL &&
-        memcmp(received_challengebits, computed_challengebits,
-               numBytes(2 * params->numZKBRounds)) != 0) {
-        status = EXIT_FAILURE;
-    }
-
-    free(computed_challengebits);
-    free(view3Slab);
-
-    freeCommitments(as);
-    for (i = 0; i < params->numZKBRounds; i++) {
-        freeView(&view1s[i]);
-        freeView(&view2s[i]);
-    }
-    free(view1s);
-    free(view2s);
-    free(tmp);
-    freeRandomTape(tape);
-    free(tape);
-    freeGCommitments(gs);
-    free(viewOutputs);
-
-    return status;
-}
-
-/*** Functions implementing Sign ***/
-
-void mpc_AND(uint8_t in1[3], uint8_t in2[3], uint8_t out[3], randomTape_t* rand,
-             view_t views[3])
-{
-    uint8_t r[3] = { getBit(rand->tape[0], rand->pos), getBit(rand->tape[1], rand->pos), getBit(rand->tape[2], rand->pos) };
-
-    uint8_t i;
-    for (i = 0; i < 3; i++) {
-        out[i] = (in1[i] & in2[(i + 1) % 3]) ^ (in1[(i + 1) % 3] & in2[i])
-                 ^ (in1[i] & in2[i]) ^ r[i] ^ r[(i + 1) % 3];
-
-        setBit(views[i].communicatedBits, rand->pos, out[i]);
-    }
-
-    (rand->pos)++;
-}
-
-void mpc_substitution(uint32_t* state[3], randomTape_t* rand, view_t views[3],
-                      paramset_t* params)
-{
-    uint8_t a[3];
-    uint8_t b[3];
-    uint8_t c[3];
-
-    uint8_t ab[3];
-    uint8_t bc[3];
-    uint8_t ca[3];
-
-    uint32_t i;
-    for (i = 0; i < params->numSboxes * 3; i += 3) {
-
-        uint8_t j;
-        for (j = 0; j < 3; j++) {
-            a[j] = getBitFromWordArray(state[j], i + 2);
-            b[j] = getBitFromWordArray(state[j], i + 1);
-            c[j] = getBitFromWordArray(state[j], i);
-        }
-
-        mpc_AND(a, b, ab, rand, views);
-        mpc_AND(b, c, bc, rand, views);
-        mpc_AND(c, a, ca, rand, views);
-
-        for (j = 0; j < 3; j++) {
-            setBitInWordArray(state[j], i + 2, a[j] ^ (bc[j]));
-            setBitInWordArray(state[j], i + 1, a[j] ^ b[j] ^ (ca[j]));
-            setBitInWordArray(state[j], i, a[j] ^ b[j] ^ c[j] ^ (ab[j]));
-        }
-    }
-}
-
-void mpc_LowMC(randomTape_t* tapes, view_t views[3],
-               const uint32_t* plaintext, uint32_t* slab, paramset_t* params)
-{
-    uint32_t* keyShares[3];
-    uint32_t* state[3];
-    uint32_t* roundKey[3];
-
-    roundKey[0] = slab;
-    roundKey[1] = slab + params->stateSizeWords;
-    roundKey[2] = roundKey[1] + params->stateSizeWords;
-    state[0] = roundKey[2] + params->stateSizeWords;
-    state[1] = state[0] + params->stateSizeWords;
-    state[2] = state[1] + params->stateSizeWords;
-
-    memset(roundKey[0], 0, 3 * params->stateSizeBytes);
-    int i;
-    for (i = 0; i < 3; i++) {
-        keyShares[i] = views[i].inputShare;
-        memset(state[i], 0x00, params->stateSizeBytes);
-    }
-    mpc_xor_constant(state, plaintext, params->stateSizeWords);
-
-    mpc_matrix_mul(keyShares, KMatrix(0, params), roundKey, params, 3);
-    mpc_xor(state, roundKey, params->stateSizeWords, 3);
-
-    uint32_t r;
-    for (r = 1; r <= params->numRounds; r++) {
-        mpc_matrix_mul(keyShares, KMatrix(r, params), roundKey, params, 3);
-        mpc_substitution(state, tapes, views, params);
-        mpc_matrix_mul(state, LMatrix(r - 1, params), state, params, 3);
-        mpc_xor_constant(state, RConstant(r - 1, params), params->stateSizeWords);
-        mpc_xor(state, roundKey, params->stateSizeWords, 3);
-    }
-
-    for (i = 0; i < 3; i++) {
-        memcpy(views[i].outputShare, state[i], params->stateSizeBytes);
-    }
-
-}
-
-void runMPC(view_t views[3], randomTape_t* rand,
-            uint32_t* plaintext, uint32_t* slab, paramset_t* params)
-{
-    rand->pos = 0;
-    mpc_LowMC(rand, views, plaintext, slab, params);
-}
-
-
-seeds_t* computeSeeds(uint32_t* privateKey, uint32_t*
-                      publicKey, uint32_t* plaintext, const uint8_t* message, size_t messageByteLength, paramset_t* params)
-{
-    HashInstance ctx;
-    seeds_t* allSeeds = allocateSeeds(params);
-
-    HashInit(&ctx, params, HASH_PREFIX_NONE);
-    HashUpdate(&ctx, (uint8_t*)privateKey, params->stateSizeBytes);
-    HashUpdate(&ctx, message, messageByteLength);
-    HashUpdate(&ctx, (uint8_t*)publicKey, params->stateSizeBytes);
-    HashUpdate(&ctx, (uint8_t*)plaintext, params->stateSizeBytes);
-    uint16_t stateSizeBitsLE = toLittleEndian((uint16_t)params->stateSizeBits);
-    HashUpdate(&ctx, ((uint8_t*)&stateSizeBitsLE), sizeof(uint16_t));
-    HashFinal(&ctx);
-
-    HashSqueeze(&ctx, getSeed(allSeeds, 0, 0), params->seedSizeBytes * 3 * params->numZKBRounds);
-
-    return allSeeds;
-}
-
-int sign(uint32_t* privateKey, uint32_t* pubKey, uint32_t* plaintext, const uint8_t* message,
-         size_t messageByteLength, signature_t* sig, paramset_t* params)
-{
-    bool status;
-
-    /* Allocate views and commitments for all parallel iterations */
-    view_t** views = allocateViews(params);
-    commitments_t* as = allocateCommitments(params);
-    g_commitments_t* gs = allocateGCommitments(params);
-
-    /* Compute seeds for all parallel iterations */
-    seeds_t* seeds = computeSeeds(privateKey, pubKey, plaintext, message, messageByteLength, params);
-
-    //Allocate a random tape (re-used per parallel iteration), and a temporary buffer
-    randomTape_t tape;
-
-    allocateRandomTape(&tape, params);
-    uint8_t* tmp = malloc( MAX(9 * params->stateSizeBytes, params->stateSizeBytes + params->andSizeBytes));
-
-    uint32_t k;
-    for (k = 0; k < params->numZKBRounds; k++) {
-        // for first two players get all tape INCLUDING INPUT SHARE from seed
-        int j;
-        for (j = 0; j < 2; j++) {
-            status = createRandomTape(getSeed(seeds, k, j), tmp, params->stateSizeBytes + params->andSizeBytes, params);
-            if (!status) {
-                return EXIT_FAILURE;
-            }
-
-            memcpy(views[k][j].inputShare, tmp, params->stateSizeBytes);
-            memcpy(tape.tape[j], tmp + params->stateSizeBytes, params->andSizeBytes);
-        }
-        // Now set third party's wires. The random bits are from the seed, the input is
-        // the XOR of other two inputs and the private key
-        status = createRandomTape(getSeed(seeds, k, 2), tape.tape[2], params->andSizeBytes, params);
-        if (!status) {
-            return EXIT_FAILURE;
-        }
-        uint32_t j1;
-        for (j1 = 0; j1 < params->stateSizeWords; j1++) {
-            views[k][2].inputShare[j1] = privateKey[j1]
-                                        ^ views[k][0].inputShare[j1]
-                                        ^ views[k][1].inputShare[j1];
-        }
-
-        runMPC(views[k], &tape, plaintext, (uint32_t*)tmp, params);
-
-        //Committing
-        Commit(getSeed(seeds, k, 0), views[k][0], as[k].hashes[0], params);
-        Commit(getSeed(seeds, k, 1), views[k][1], as[k].hashes[1], params);
-        Commit(getSeed(seeds, k, 2), views[k][2], as[k].hashes[2], params);
-
-        if (params->transform == TRANSFORM_UR) {
-            G(0, getSeed(seeds, k, 0), &views[k][0], gs[k].G[0], params);
-            G(1, getSeed(seeds, k, 1), &views[k][1], gs[k].G[1], params);
-            G(2, getSeed(seeds, k, 2), &views[k][2], gs[k].G[2], params);
-        }
-    }
-
-    //Generating challenges
-    uint32_t** viewOutputs = malloc(params->numZKBRounds * 3 * sizeof(uint32_t*));
-
-    size_t ii, jj;
-    for (ii = 0; ii < params->numZKBRounds; ii++)
-        for (jj = 0; jj < 3; jj++)
-            VIEW_OUTPUTS(ii, jj) = views[ii][jj].outputShare;
-
-
-    uint32_t output[LOWMC_MAX_STATE_SIZE];
-    uint32_t j;
-    for (j = 0; j < params->stateSizeWords; j++)
-        output[j] = (VIEW_OUTPUTS(0, 0))[j] ^ (VIEW_OUTPUTS(0, 1))[j] ^ (VIEW_OUTPUTS(0, 2))[j];
-
-
-    H3(output, plaintext, viewOutputs, as,
-       sig->challengeBits, message, messageByteLength, gs, params);
-
-    //Packing Z
-    size_t i;
-    for (i = 0; i < params->numZKBRounds; i++) {
-        proof_t* proof = &sig->proofs[i];
-        prove(proof, getChallenge(sig->challengeBits, i), &seeds[i],
-              views[i], &as[i], (gs == NULL) ? NULL : &gs[i], params);
-    }
-
-    free(tmp);
-
-    freeViews(views, params);
-    freeCommitments(as);
-    freeRandomTape(&tape);
-    freeGCommitments(gs);
-    free(viewOutputs);
-    freeSeeds(seeds);
-
-    return EXIT_SUCCESS;
-}
-
-/*** Serialization functions ***/
-
-int serializeSignature(const signature_t* sig, uint8_t* sigBytes, size_t sigBytesLen, paramset_t* params)
-{
-    const proof_t* proofs = sig->proofs;
-    const uint8_t* challengeBits = sig->challengeBits;
-
-    /* Validate input buffer is large enough */
-    size_t bytesRequired = numBytes(2 * params->numZKBRounds) +
-                           params->numZKBRounds * (2 * params->seedSizeBytes + params->stateSizeBytes + params->andSizeBytes + params->digestSizeBytes);
-
-    if (params->transform == TRANSFORM_UR) {
-        bytesRequired += params->UnruhGWithoutInputBytes * params->numZKBRounds;
-    }
-
-    if (sigBytesLen < bytesRequired) {
-        return -1;
-    }
-
-    uint8_t* sigBytesBase = sigBytes;
-
-    memcpy(sigBytes, challengeBits, numBytes(2 * params->numZKBRounds));
-    sigBytes += numBytes(2 * params->numZKBRounds);
-
-    size_t i;
-    for (i = 0; i < params->numZKBRounds; i++) {
-
-        uint8_t challenge = getChallenge(challengeBits, i);
-
-        memcpy(sigBytes, proofs[i].view3Commitment, params->digestSizeBytes);
-        sigBytes += params->digestSizeBytes;
-
-        if (params->transform == TRANSFORM_UR) {
-            size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
-            memcpy(sigBytes, proofs[i].view3UnruhG, view3UnruhLength);
-            sigBytes += view3UnruhLength;
-        }
-
-        memcpy(sigBytes, proofs[i].communicatedBits, params->andSizeBytes);
-        sigBytes += params->andSizeBytes;
-
-        memcpy(sigBytes, proofs[i].seed1, params->seedSizeBytes);
-        sigBytes += params->seedSizeBytes;
-
-        memcpy(sigBytes, proofs[i].seed2, params->seedSizeBytes);
-        sigBytes += params->seedSizeBytes;
-
-        if (challenge == 1 || challenge == 2) {
-            memcpy(sigBytes, proofs[i].inputShare, params->stateSizeBytes);
-            sigBytes += params->stateSizeBytes;
-        }
-
-
-    }
-
-    return (int)(sigBytes - sigBytesBase);
-}
-
-
-static size_t computeInputShareSize(const uint8_t* challengeBits, size_t stateSizeBytes, paramset_t* params)
-{
-    /* When the FS transform is used, the input share is included in the proof
-     * only when the challenge is 1 or 2.  When dersializing, to compute the
-     * number of bytes expected, we must check how many challenge values are 1
-     * or 2. The parameter stateSizeBytes is the size of an input share. */
-    size_t inputShareSize = 0;
-
-    size_t i;
-    for (i = 0; i < params->numZKBRounds; i++) {
-        uint8_t challenge = getChallenge(challengeBits, i);
-        if (challenge == 1 || challenge == 2) {
-            inputShareSize += stateSizeBytes;
-        }
-    }
-    return inputShareSize;
-}
-
-int deserializeSignature(signature_t* sig, const uint8_t* sigBytes,
-                         size_t sigBytesLen, paramset_t* params)
-{
-    proof_t* proofs = sig->proofs;
-    uint8_t* challengeBits = sig->challengeBits;
-
-    /* Validate input buffer is large enough */
-    if (sigBytesLen < numBytes(2 * params->numZKBRounds)) {     /* ensure the input has at least the challenge */
-        return EXIT_FAILURE;
-    }
-    size_t inputShareSize = computeInputShareSize(sigBytes, params->stateSizeBytes, params);
-    size_t bytesExpected = numBytes(2 * params->numZKBRounds) +
-                           params->numZKBRounds * (2 * params->seedSizeBytes + params->andSizeBytes + params->digestSizeBytes) + inputShareSize;
-    if (params->transform == TRANSFORM_UR) {
-        bytesExpected += params->UnruhGWithoutInputBytes * params->numZKBRounds;
-    }
-    if (sigBytesLen < bytesExpected) {
-        return EXIT_FAILURE;
-    }
-
-    memcpy(challengeBits, sigBytes, numBytes(2 * params->numZKBRounds));
-    sigBytes += numBytes(2 * params->numZKBRounds);
-
-    size_t i;
-    for (i = 0; i < params->numZKBRounds; i++) {
-
-        uint8_t challenge = getChallenge(challengeBits, i);
-
-        memcpy(proofs[i].view3Commitment, sigBytes, params->digestSizeBytes);
-        sigBytes += params->digestSizeBytes;
-
-        if (params->transform == TRANSFORM_UR) {
-            size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
-            memcpy(proofs[i].view3UnruhG, sigBytes, view3UnruhLength);
-            sigBytes += view3UnruhLength;
-        }
-
-        memcpy(proofs[i].communicatedBits, sigBytes, params->andSizeBytes);
-        sigBytes += params->andSizeBytes;
-
-        memcpy(proofs[i].seed1, sigBytes, params->seedSizeBytes);
-        sigBytes += params->seedSizeBytes;
-
-        memcpy(proofs[i].seed2, sigBytes, params->seedSizeBytes);
-        sigBytes += params->seedSizeBytes;
-
-        if (challenge == 1 || challenge == 2) {
-            memcpy(proofs[i].inputShare, sigBytes, params->stateSizeBytes);
-            sigBytes += params->stateSizeBytes;
-        }
-
-    }
-
-    return EXIT_SUCCESS;
-}
-
-
-
-
+/*! @file picnic_impl.c
+ *  @brief This is the main file of the signature scheme. All of the LowMC MPC
+ *  code is here as well as lower-level versions of sign and verify that are
+ *  called by the signature API.
+ *
+ *  This file is part of the reference implementation of the Picnic signature scheme.
+ *  See the accompanying documentation for complete details.
+ *
+ *  The code is provided under the MIT license, see LICENSE for
+ *  more details.
+ *  SPDX-License-Identifier: MIT
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#if defined (__WIN32)
+	#include <windows.h>
+	#include <bcrypt.h>
+#elif defined (__APPLE__)
+    #include "macos_specific_endian.h"
+#elif defined (DAP_OS_LINUX)
+    #include <endian.h>
+#endif
+
+#include "picnic_impl.h"
+#include "picnic.h"
+#include "platform.h"
+#include "lowmc_constants.h"
+#include "hash.h"
+#include "picnic_types.h"
+#include "dap_common.h"
+
+
+#define VIEW_OUTPUTS(i, j) viewOutputs[(i) * 3 + (j)]
+
+
+/* Helper functions */
+uint16_t toLittleEndian(uint16_t x)
+{   /* Convert a 16-bit value from host byte order to little-endian byte order. */
+#if defined(__WIN32)
+    #if BYTE_ORDER == LITTLE_ENDIAN
+		return x;                     /* host is already little-endian */
+	#else
+		return __builtin_bswap16(x);  /* swap the two bytes otherwise */
+    #endif
+#else  /* not __WIN32: use the platform's htole16() */
+    return htole16(x);
+#endif
+}
+
+/* Read bit number bitNumber of a byte array; bit 0 is the most significant bit of array[0] */
+uint8_t getBit(const uint8_t* array, uint32_t bitNumber)
+{
+    return (uint8_t)((array[bitNumber >> 3] >> (7u - (bitNumber & 7u))) & 1u);
+}
+
+/* Read one bit from an array of 32-bit words; the words are addressed as raw bytes via getBit() */
+uint8_t getBitFromWordArray(const uint32_t* array, uint32_t bitNumber)
+{
+    return getBit((const uint8_t*)array, bitNumber);  /* cast keeps the const qualifier */
+}
+
+/* Set bit bitNumber of a byte array (bit 0 = MSB of bytes[0]) to the least significant bit of val */
+void setBit(uint8_t* bytes, uint32_t bitNumber, uint8_t val)
+{
+    const unsigned shift = 7u - (bitNumber % 8u);  /* position of the target bit inside its byte */
+    bytes[bitNumber / 8u] = (uint8_t)((bytes[bitNumber / 8u] & ~(1u << shift)) | ((val & 1u) << shift));
+}
+
+/* Set a specific bit in an array of 32-bit words to a given value; the words are addressed as bytes via setBit() */
+void setBitInWordArray(uint32_t* array, uint32_t bitNumber, uint8_t val)
+{
+    setBit((uint8_t*)array, bitNumber, val);
+}
+
+static uint8_t parity(uint32_t* data, size_t len)
+{
+    /* Returns 1 if the len words at data hold an odd number of set bits, else 0; starting the fold at 0 means an empty array is never read. */
+    uint32_t x = 0;
+    for (size_t i = 0; i < len; i++) {
+        x ^= data[i];
+    }
+
+    /* Compute parity of x using code from Section 5-2 of
+     * H.S. Warren, *Hacker's Delight*, Pearson Education, 2003.
+     * http://www.hackersdelight.org/hdcodetxt/parity.c.txt
+     */
+    uint32_t y = x ^ (x >> 1);
+    y ^= (y >> 2);
+    y ^= (y >> 4);
+    y ^= (y >> 8);
+    y ^= (y >> 16);
+    return y & 1;
+}
+
+uint32_t numBytes(uint32_t numBits)
+{   /* Smallest number of bytes that can hold numBits bits (numBits / 8, rounded up). */
+    return numBits / 8 + (((numBits % 8) != 0) ? 1u : 0u);
+}
+
+static void xor_array(const uint32_t * in1, const uint32_t * in2, uint32_t * out, uint32_t numWords)
+{   /* out[i] = in1[i] ^ in2[i] for numWords 32-bit words (a word count, not a byte count). */
+    uint32_t i;
+    for (i = 0; i < numWords; i++) {
+        out[i] = in1[i] ^ in2[i];
+    }
+}
+
+static void matrix_mul(
+    uint32_t* state,
+    const uint32_t* matrix,
+    uint32_t* output,
+    paramset_t* params)
+{
+    // Bit i of output = parity of (state AND row i of matrix), a row being stateSizeWords words.
+    // Use temp to correctly handle the case when state = output
+    uint32_t prod[LOWMC_MAX_STATE_SIZE];
+    uint32_t temp[LOWMC_MAX_STATE_SIZE] = { 0 };  // zeroed: setBit() read-modify-writes these bytes
+
+    uint32_t i, j;
+    for (i = 0; i < params->stateSizeBits; i++) {
+        for (j = 0; j < params->stateSizeWords; j++) {
+            size_t index = i * params->stateSizeWords + j;
+            prod[j] = (state[j] & matrix[index]);
+        }
+        setBit((uint8_t*)temp, i, parity(&prod[0], params->stateSizeWords));
+    }
+    memcpy(output, temp, params->stateSizeWords * sizeof(uint32_t));
+}
+
+static void substitution(uint32_t* state, paramset_t* params)
+{
+    /* Apply the 3-bit substitution in place to each of the first numSboxes bit triples of state. */
+    for (uint32_t base = 0; base < params->numSboxes * 3; base += 3) {
+        const uint8_t bit2 = getBitFromWordArray(state, base + 2);
+        const uint8_t bit1 = getBitFromWordArray(state, base + 1);
+        const uint8_t bit0 = getBitFromWordArray(state, base);
+
+        setBitInWordArray(state, base + 2, bit2 ^ (bit1 & bit0));
+        setBitInWordArray(state, base + 1, bit2 ^ bit1 ^ (bit2 & bit0));
+        setBitInWordArray(state, base, bit2 ^ bit1 ^ bit0 ^ (bit2 & bit1));
+    }
+}
+
+void LowMCEnc(const uint32_t* plaintext, uint32_t* output, uint32_t* key, paramset_t* params)
+{   /* Encrypt plaintext under key into output; plaintext and output may be the same buffer. */
+    uint32_t roundKey[LOWMC_MAX_STATE_SIZE / sizeof(uint32_t)];  /* NOTE(review): matrix_mul sizes its buffers as LOWMC_MAX_STATE_SIZE words — confirm the constant's unit */
+
+    if (plaintext != output) {
+        /* output will hold the intermediate state */
+        memcpy(output, plaintext, params->stateSizeBytes);
+    }
+
+    matrix_mul(key, KMatrix(0, params), roundKey, params);          /* round key 0 */
+    xor_array(output, roundKey, output, params->stateSizeWords);    /* initial key addition */
+
+    uint32_t r;
+    for (r = 1; r <= params->numRounds; r++) {
+        matrix_mul(key, KMatrix(r, params), roundKey, params);      /* round key r */
+        substitution(output, params);
+        matrix_mul(output, LMatrix(r - 1, params), output, params); /* in place: matrix_mul works on a temporary copy */
+        xor_array(output, RConstant(r - 1, params), output, params->stateSizeWords);
+        xor_array(output, roundKey, output, params->stateSizeWords);
+    }
+
+}
+
+bool createRandomTape(const uint8_t* seed, uint8_t* tape,
+                      uint32_t tapeLengthBytes, paramset_t* params)
+{   /* Fill tape with tapeLengthBytes bytes derived from seed; returns false (tape untouched) if the tape is too short. */
+    HashInstance ctx;
+
+    if (tapeLengthBytes < params->digestSizeBytes) {
+        return false;  /* tape doubles as scratch space for a digestSizeBytes-long digest below */
+    }
+
+    /* Hash the seed and a constant, store the result in tape. */
+    HashInit(&ctx, params, HASH_PREFIX_2);
+    HashUpdate(&ctx, seed, params->seedSizeBytes);
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, tape, params->digestSizeBytes);
+
+    /* Expand the hashed seed and output length to create the tape. */
+    HashInit(&ctx, params, HASH_PREFIX_NONE);
+    HashUpdate(&ctx, tape, params->digestSizeBytes);
+    uint16_t outputBytesLE = toLittleEndian(tapeLengthBytes);  /* NOTE(review): the 32-bit length is narrowed to 16 bits here — confirm lengths stay below 65536 */
+    HashUpdate(&ctx, (uint8_t*)&outputBytesLE, sizeof(uint16_t));
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, tape, tapeLengthBytes);
+
+    return true;
+}
+
+void mpc_xor(uint32_t* state[3], uint32_t* in[3], uint32_t len, int players)
+{
+    /* For each of the first `players` shares, XOR in[p] into state[p] in place (len words each). */
+    for (uint8_t player = 0; player < players; player++) {
+        xor_array(state[player], in[player], state[player], len);
+    }
+}
+
+/* XOR in (len words) into the first state vector, state[0], only; state[1] and state[2] are not touched. */
+void mpc_xor_constant(uint32_t* state[3], const uint32_t* in, uint32_t len)
+{
+    xor_array(state[0], in, state[0], len);
+}
+
+void mpc_xor_constant_verify(uint32_t* state[2], const uint32_t* in, uint32_t len, uint8_t challenge)
+{
+    /* During verify, where the first share is stored in state depends on the
+     * challenge: state[0] for 0, state[1] for 2; for any other value nothing is XORed. */
+    if (challenge != 0 && challenge != 2) {
+        return;
+    }
+    uint32_t* share = state[(challenge == 0) ? 0 : 1];
+    xor_array(share, in, share, len);
+}
+
+
+void Commit(const uint8_t* seed, const view_t view,
+            uint8_t* hash, paramset_t* params)
+{   /* Write the digestSizeBytes-long commitment H_0(H_4(seed), view) to `hash`. */
+    HashInstance ctx;
+
+    /* Hash the seed, store result in `hash` */
+    HashInit(&ctx, params, HASH_PREFIX_4);
+    HashUpdate(&ctx, seed, params->seedSizeBytes);
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, hash, params->digestSizeBytes);
+
+    /* Compute H_0(H_4(seed), view); the view is absorbed as inputShare, communicatedBits, outputShare */
+    HashInit(&ctx, params, HASH_PREFIX_0);
+    HashUpdate(&ctx, hash, params->digestSizeBytes);
+    HashUpdate(&ctx, (uint8_t*)view.inputShare, params->stateSizeBytes);
+    HashUpdate(&ctx, (uint8_t*)view.communicatedBits, params->andSizeBytes);
+    HashUpdate(&ctx, (uint8_t*)view.outputShare, params->stateSizeBytes);
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, hash, params->digestSizeBytes);
+}
+
+/* This is the random "permutation" function G for Unruh's transform */
+void G(uint8_t viewNumber, const uint8_t* seed, view_t* view, uint8_t* output, paramset_t* params)
+{
+    HashInstance ctx;
+    uint16_t outputBytes = params->seedSizeBytes + params->andSizeBytes;  /* grows by stateSizeBytes when viewNumber == 2 */
+
+    /* Hash the seed with H_5, store digest in output */
+    HashInit(&ctx, params, HASH_PREFIX_5);
+    HashUpdate(&ctx, seed, params->seedSizeBytes);
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, output, params->digestSizeBytes);
+
+    /* Hash H_5(seed), the view, and the length */
+    HashInit(&ctx, params, HASH_PREFIX_NONE);
+    HashUpdate(&ctx, output, params->digestSizeBytes);
+    if (viewNumber == 2) {
+        HashUpdate(&ctx, (uint8_t*)view->inputShare, params->stateSizeBytes);  /* only view 2 absorbs its input share */
+        outputBytes += (uint16_t)params->stateSizeBytes;
+    }
+    HashUpdate(&ctx, view->communicatedBits, params->andSizeBytes);
+
+    uint16_t outputBytesLE = toLittleEndian(outputBytes);
+    HashUpdate(&ctx, (uint8_t*)&outputBytesLE, sizeof(uint16_t));
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, output, outputBytes);  /* output must have room for outputBytes bytes */
+}
+
+void setChallenge(uint8_t* challenge, size_t round, uint8_t trit)
+{
+    /* challenge must have length numBytes(numZKBRounds*2)
+     * 0 <= round < numZKBRounds
+     * trit must be in {0,1,2}; it is stored as two bits, low bit first */
+    const uint32_t lowBitPos = 2 * (uint32_t)round;
+
+    setBit(challenge, lowBitPos, trit & 1);
+    setBit(challenge, lowBitPos + 1, (trit >> 1) & 1);
+}
+
+uint8_t getChallenge(const uint8_t* challenge, size_t round)
+{
+    const uint32_t lowBitPos = 2 * (uint32_t)round;  /* each round's challenge occupies two bits, low bit first */
+
+    return (uint8_t)((getBit(challenge, lowBitPos + 1) << 1) | getBit(challenge, lowBitPos));
+}
+
+void H3(const uint32_t* circuitOutput, const uint32_t* plaintext, uint32_t** viewOutputs,
+        commitments_t* as,
+        uint8_t* challengeBits, const uint8_t* message, size_t messageByteLength,
+        g_commitments_t* gs, paramset_t* params)
+{   /* Derive numZKBRounds challenges in {0,1,2} from a hash of the inputs and pack them into challengeBits. */
+    uint8_t* hash = malloc(params->digestSizeBytes);  /* NOTE(review): result is used without a NULL check — confirm */
+
+    HashInstance ctx;
+
+    /* Depending on the number of rounds, we might not set part of the last
+     * byte, make sure it's always zero. */
+    challengeBits[numBytes(params->numZKBRounds * 2) - 1] = 0;
+
+    /* Hash input data */
+    HashInit(&ctx, params, HASH_PREFIX_1);
+
+    /* Hash the output share from each view */
+    uint32_t i;
+    int j;
+    for (i = 0; i < params->numZKBRounds; i++) {
+        for (j = 0; j < 3; j++) {
+            HashUpdate(&ctx, (uint8_t*)VIEW_OUTPUTS(i, j), params->stateSizeBytes);
+        }
+    }
+
+    /* Hash all the commitments C */
+    for (i = 0; i < params->numZKBRounds; i++) {
+        for (j = 0; j < 3; j++) {
+            HashUpdate(&ctx, as[i].hashes[j], params->digestSizeBytes);
+        }
+    }
+
+    /* Hash all the commitments G (only when the Unruh transform is in use) */
+    if (params->transform == TRANSFORM_UR) {
+        for (i = 0; i < params->numZKBRounds; i++) {
+            for (j = 0; j < 3; j++) {
+                size_t view3UnruhLength = (j == 2) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;  /* view 2 uses the with-input length */
+                HashUpdate(&ctx, gs[i].G[j], view3UnruhLength);
+            }
+        }
+    }
+
+    HashUpdate(&ctx, (uint8_t*)circuitOutput, params->stateSizeBytes);
+    HashUpdate(&ctx, (uint8_t*)plaintext, params->stateSizeBytes);
+    HashUpdate(&ctx, message, messageByteLength);
+
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, hash, params->digestSizeBytes);
+
+    /* Convert hash to a packed string of values in {0,1,2} */
+    size_t byte_count, round = 0;
+    while (1) {
+        for (byte_count = 0; byte_count < params->digestSizeBytes; byte_count++) {
+            uint8_t byte = hash[byte_count];
+            /* iterate over each pair of bits in the byte */
+            for (j = 0; j < 8; j += 2) {
+                uint8_t bitPair = ((byte >> (6 - j)) & 0x03);
+                if (bitPair < 3) {  /* a pair equal to 3 is skipped and consumes no round */
+                    setChallenge(challengeBits, round, bitPair);
+                    round++;
+                    if (round == params->numZKBRounds) {
+                        goto done;
+                    }
+                }
+            }
+        }
+
+        /* We need more bits; set hash = H_1(hash) */
+        HashInit(&ctx, params, HASH_PREFIX_1);
+        HashUpdate(&ctx, hash, params->digestSizeBytes);
+        HashFinal(&ctx);
+        HashSqueeze(&ctx, hash, params->digestSizeBytes);
+    }
+
+done:
+
+    free(hash);
+    return;
+}
+
+/* Fills the caller-allocated proof for one round with the data selected by challenge (0, 1 or 2). */
+void prove(proof_t* proof, uint8_t challenge, seeds_t* seeds,
+           view_t views[3], commitments_t* commitments, g_commitments_t* gs, paramset_t* params)
+{
+    const int third = (challenge + 2) % 3;     /* index whose commitment is copied as view3Commitment */
+
+    switch (challenge) {
+    case 0:
+        memcpy(proof->seed1, seeds->seed0, params->seedSizeBytes);
+        memcpy(proof->seed2, seeds->seed1, params->seedSizeBytes);
+        break;
+    case 1:
+        memcpy(proof->seed1, seeds->seed1, params->seedSizeBytes);
+        memcpy(proof->seed2, seeds->seed2, params->seedSizeBytes);
+        memcpy(proof->inputShare, views[2].inputShare, params->stateSizeBytes);
+        break;
+    case 2:
+        memcpy(proof->seed1, seeds->seed2, params->seedSizeBytes);
+        memcpy(proof->seed2, seeds->seed0, params->seedSizeBytes);
+        memcpy(proof->inputShare, views[2].inputShare, params->stateSizeBytes);
+        break;
+    default:
+        assert(!"Invalid challenge");
+    }
+
+    memcpy(proof->communicatedBits, views[(challenge + 1) % 3].communicatedBits, params->andSizeBytes);
+    memcpy(proof->view3Commitment, commitments->hashes[third], params->digestSizeBytes);
+    if (params->transform == TRANSFORM_UR) {
+        memcpy(proof->view3UnruhG, gs->G[third], (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes);
+    }
+}
+
+void mpc_AND_verify(uint8_t in1[2], uint8_t in2[2], uint8_t out[2],
+                    randomTape_t* rand, view_t* view1, view_t* view2)
+{
+    uint8_t r[2] = { getBit(rand->tape[0], rand->pos), getBit(rand->tape[1], rand->pos) };
+    /* out[0] is recomputed and stored in view1's communicated bits; out[1] is read back from view2's. */
+    out[0] = (in1[0] & in2[1]) ^ (in1[1] & in2[0]) ^ (in1[0] & in2[0]) ^ r[0] ^ r[1];
+    setBit(view1->communicatedBits, rand->pos, out[0]);
+    out[1] = getBit(view2->communicatedBits, rand->pos);
+    /* Each call consumes one bit position of the tapes. */
+    (rand->pos)++;
+}
+
+void mpc_substitution_verify(uint32_t* state[2], randomTape_t* rand, view_t* view1,
+                             view_t* view2, paramset_t* params)
+{
+    uint32_t i;
+    for (i = 0; i < params->numSboxes * 3; i += 3) {
+        /* One 3-bit S-box per iteration; a, b, c hold state bits i+2, i+1, i of each of the two shares. */
+        uint8_t a[2];
+        uint8_t b[2];
+        uint8_t c[2];
+
+        uint8_t j;
+        for (j = 0; j < 2; j++) {
+            a[j] = getBitFromWordArray(state[j], i + 2);
+            b[j] = getBitFromWordArray(state[j], i + 1);
+            c[j] = getBitFromWordArray(state[j], i);
+        }
+
+        uint8_t ab[2];
+        uint8_t bc[2];
+        uint8_t ca[2];
+        /* The three pairwise products are evaluated in the order ab, bc, ca. */
+        mpc_AND_verify(a, b, ab, rand, view1, view2);
+        mpc_AND_verify(b, c, bc, rand, view1, view2);
+        mpc_AND_verify(c, a, ca, rand, view1, view2);
+        /* Overwrite the same three bit positions with the S-box outputs. */
+        for (j = 0; j < 2; j++) {
+            setBitInWordArray(state[j], i + 2, a[j] ^ (bc[j]));
+            setBitInWordArray(state[j], i + 1, a[j] ^ b[j] ^ (ca[j]));
+            setBitInWordArray(state[j], i, a[j] ^ b[j] ^ c[j] ^ (ab[j]));
+        }
+    }
+}
+
+void mpc_matrix_mul(uint32_t* state[3], const uint32_t* matrix,
+                    uint32_t* output[3], paramset_t* params, size_t players)
+{
+    /* Applies the same matrix to the first `players` shares, one matrix_mul call per share. */
+    for (uint32_t idx = 0; idx < players; idx++) {
+        matrix_mul(state[idx], matrix, output[idx], params);
+    }
+}
+
+void mpc_LowMC_verify(view_t* view1, view_t* view2,
+                      randomTape_t* tapes, uint32_t* tmp,
+                      const uint32_t* plaintext, paramset_t* params, uint8_t challenge)
+{
+    uint32_t* state[2];
+    uint32_t* keyShares[2];
+    uint32_t* roundKey[2];
+    /* tmp is carved into four consecutive stateSizeWords-word regions: roundKey[0], roundKey[1], state[0], state[1]. */
+    roundKey[0] = tmp;
+    roundKey[1] = roundKey[0] + params->stateSizeWords;
+    state[0] = roundKey[1] + params->stateSizeWords;
+    state[1] = state[0] + params->stateSizeWords;
+
+    // initialize both roundkeys to 0 with one memset; they are contiguous in tmp
+    memset(roundKey[0], 0, 2 * params->stateSizeBytes);
+
+    uint32_t i, r;
+    for (i = 0; i < 2; i++) {
+        memset(state[i], 0x00, params->stateSizeBytes);
+    }
+    mpc_xor_constant_verify(state, plaintext, params->stateSizeWords, challenge);
+
+    keyShares[0] = view1->inputShare;
+    keyShares[1] = view2->inputShare;
+    /* Round key 0 is derived from the key shares with KMatrix(0) and applied to the state via mpc_xor. */
+    mpc_matrix_mul(keyShares, KMatrix(0, params), roundKey, params, 2);
+    mpc_xor(state, roundKey, params->stateSizeWords, 2);
+    /* Rounds 1..numRounds: the key matrix is looked up with r, the L matrix and round constant with r - 1. */
+    for (r = 1; r <= params->numRounds; ++r) {
+        mpc_matrix_mul(keyShares, KMatrix(r, params), roundKey, params, 2);
+        mpc_substitution_verify(state, tapes, view1, view2, params);
+        mpc_matrix_mul(state, LMatrix(r - 1, params), state, params, 2);
+        mpc_xor_constant_verify(state, RConstant(r - 1, params), params->stateSizeWords, challenge);
+        mpc_xor(state, roundKey, params->stateSizeWords, 2);
+    }
+    /* The final state shares become the output shares of the two views. */
+    memcpy(view1->outputShare, state[0], params->stateSizeBytes);
+    memcpy(view2->outputShare, state[1], params->stateSizeBytes);
+}
+
+void verifyProof(const proof_t* proof, view_t* view1, view_t* view2,
+                 uint8_t challenge, uint8_t* tmp,
+                 const uint32_t* plaintext, randomTape_t* tape, paramset_t* params)
+{
+    memcpy(view2->communicatedBits, proof->communicatedBits, params->andSizeBytes);
+    tape->pos = 0;
+    /* NOTE(review): status only gates the remaining copies in each case; it is never returned to the caller. */
+    bool status = false;
+    switch (challenge) {
+    case 0:
+        // in this case, both views' input shares and tapes are derived from the two seeds
+
+        status = createRandomTape(proof->seed1, tmp, params->stateSizeBytes + params->andSizeBytes, params);
+        memcpy(view1->inputShare, tmp, params->stateSizeBytes);
+        memcpy(tape->tape[0], tmp + params->stateSizeBytes, params->andSizeBytes);
+        status = status && createRandomTape(proof->seed2, tmp, params->stateSizeBytes + params->andSizeBytes, params);
+        if (!status) {
+            break;
+        }
+        memcpy(view2->inputShare, tmp, params->stateSizeBytes);
+        memcpy(tape->tape[1], tmp + params->stateSizeBytes, params->andSizeBytes);
+        break;
+
+    case 1:
+        // in this case view2's input share was already given to us explicitly as
+        // it is not computable from the seed. We just need to compute view1's input from
+        // its seed
+        status = createRandomTape(proof->seed1, tmp, params->stateSizeBytes + params->andSizeBytes, params);
+        memcpy(view1->inputShare, tmp, params->stateSizeBytes);
+        memcpy(tape->tape[0], tmp + params->stateSizeBytes, params->andSizeBytes);
+        status = status && createRandomTape(proof->seed2, tape->tape[1], params->andSizeBytes, params);
+        if (!status) {
+            break;
+        }
+        memcpy(view2->inputShare, proof->inputShare, params->stateSizeBytes);
+        break;
+
+    case 2:
+        // in this case view1's input share was already given to us explicitly as
+        // it is not computable from the seed. We just need to compute view2's input from
+        // its seed
+        status = createRandomTape(proof->seed1, tape->tape[0], params->andSizeBytes, params);
+        memcpy(view1->inputShare, proof->inputShare, params->stateSizeBytes);
+        status = status && createRandomTape(proof->seed2, tmp, params->stateSizeBytes + params->andSizeBytes, params);
+        if (!status) {
+            break;
+        }
+        memcpy(view2->inputShare, tmp, params->stateSizeBytes);
+        memcpy(tape->tape[1], tmp + params->stateSizeBytes, params->andSizeBytes);
+        break;
+
+    default:
+        break;
+    }
+    /* Runs unconditionally, including after a failed createRandomTape or an unhandled challenge value. */
+    mpc_LowMC_verify(view1, view2, tape, (uint32_t*)tmp, plaintext, params, challenge);
+}
+
+int verify(signature_t* sig, const uint32_t* pubKey, const uint32_t* plaintext,
+           const uint8_t* message, size_t messageByteLength, paramset_t* params)
+{
+    /* Recomputes the two opened views of every round, rebuilds the challenge with H3 and
+     * returns EXIT_SUCCESS only if it equals the challenge stored in sig (else EXIT_FAILURE). */
+    const proof_t* proofs = sig->proofs;
+    const uint8_t* received_challengebits = sig->challengeBits;
+    int status = EXIT_SUCCESS;
+    uint8_t* computed_challengebits = NULL;
+    uint32_t* view3Slab = NULL;
+    size_t i, j;
+
+    /* Two bits can also encode 3, which is not a challenge: the code below would use it to
+     * index per-round arrays that are otherwise only indexed with 0..2, so reject it first. */
+    for (i = 0; i < params->numZKBRounds; i++) {
+        if (getChallenge(received_challengebits, i) > 2) {
+            return EXIT_FAILURE;
+        }
+    }
+
+    commitments_t* as = allocateCommitments(params);
+    g_commitments_t* gs = allocateGCommitments(params);
+    uint32_t** viewOutputs = malloc(params->numZKBRounds * 3 * sizeof(uint32_t*));
+    uint8_t* tmp = malloc(MAX(6 * params->stateSizeBytes, params->stateSizeBytes + params->andSizeBytes));
+    randomTape_t* tape = (randomTape_t*)malloc(sizeof(randomTape_t));
+    allocateRandomTape(tape, params);
+    view_t* view1s = malloc(params->numZKBRounds * sizeof(view_t));
+    view_t* view2s = malloc(params->numZKBRounds * sizeof(view_t));
+
+    /* Allocate a slab of memory for the 3rd view's output in each round */
+    view3Slab = malloc(params->stateSizeBytes * params->numZKBRounds);
+    uint32_t* view3Output = view3Slab;     /* pointer into the slab to the current 3rd view */
+
+    for (i = 0; i < params->numZKBRounds; i++) {
+        uint8_t challenge = getChallenge(received_challengebits, i);
+        allocateView(&view1s[i], params);
+        allocateView(&view2s[i], params);
+        // last bits of communicatedBits may not be set so zero them
+        view1s[i].communicatedBits[params->andSizeBytes - 1] = 0;
+        verifyProof(&proofs[i], &view1s[i], &view2s[i], challenge,
+                    tmp, plaintext, tape, params);
+        // create ordered array of commitments with order computed based on the challenge
+        // check commitments of the two opened views
+        Commit(proofs[i].seed1, view1s[i], as[i].hashes[challenge], params);
+        Commit(proofs[i].seed2, view2s[i], as[i].hashes[(challenge + 1) % 3], params);
+        memcpy(as[i].hashes[(challenge + 2) % 3], proofs[i].view3Commitment, params->digestSizeBytes);
+
+        if (params->transform == TRANSFORM_UR) {
+            G(challenge, proofs[i].seed1, &view1s[i], gs[i].G[challenge], params);
+            G((challenge + 1) % 3, proofs[i].seed2, &view2s[i], gs[i].G[(challenge + 1) % 3], params);
+            size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
+            memcpy(gs[i].G[(challenge + 2) % 3], proofs[i].view3UnruhG, view3UnruhLength);
+        }
+        VIEW_OUTPUTS(i, challenge) = view1s[i].outputShare;
+        VIEW_OUTPUTS(i, (challenge + 1) % 3) = view2s[i].outputShare;
+        for (j = 0; j < params->stateSizeWords; j++) {
+            view3Output[j] = view1s[i].outputShare[j] ^ view2s[i].outputShare[j]
+                             ^ pubKey[j];
+        }
+        VIEW_OUTPUTS(i, (challenge + 2) % 3) = view3Output;
+        view3Output += params->stateSizeWords;
+    }
+
+    /* Fail closed: accept only if the challenge could be recomputed and matches the received one. */
+    computed_challengebits = malloc(numBytes(2 * params->numZKBRounds));
+    if (computed_challengebits == NULL) {
+        status = EXIT_FAILURE;
+    }
+    else {
+        H3(pubKey, plaintext, viewOutputs, as,
+           computed_challengebits, message, messageByteLength, gs, params);
+        if (memcmp(received_challengebits, computed_challengebits,
+                   numBytes(2 * params->numZKBRounds)) != 0) {
+            status = EXIT_FAILURE;
+        }
+    }
+    free(computed_challengebits);
+    free(view3Slab);
+    freeCommitments(as);
+    for (i = 0; i < params->numZKBRounds; i++) {
+        freeView(&view1s[i]);
+        freeView(&view2s[i]);
+    }
+    free(view1s);
+    free(view2s);
+    free(tmp);
+    freeRandomTape(tape);
+    free(tape);
+    freeGCommitments(gs);
+    free(viewOutputs);
+    return status;
+}
+
+/*** Functions implementing Sign ***/
+
+void mpc_AND(uint8_t in1[3], uint8_t in2[3], uint8_t out[3], randomTape_t* rand,
+             view_t views[3])
+{
+    uint8_t r[3] = { getBit(rand->tape[0], rand->pos), getBit(rand->tape[1], rand->pos), getBit(rand->tape[2], rand->pos) };
+    /* Output share i combines shares i and (i + 1) % 3 of both inputs with tape bits r[i] and r[(i + 1) % 3]. */
+    uint8_t i;
+    for (i = 0; i < 3; i++) {
+        out[i] = (in1[i] & in2[(i + 1) % 3]) ^ (in1[(i + 1) % 3] & in2[i])
+                 ^ (in1[i] & in2[i]) ^ r[i] ^ r[(i + 1) % 3];
+        /* Each output share is also recorded in its view's communicated bits at the current tape position. */
+        setBit(views[i].communicatedBits, rand->pos, out[i]);
+    }
+    /* Each call consumes one bit position of the tapes. */
+    (rand->pos)++;
+}
+
+void mpc_substitution(uint32_t* state[3], randomTape_t* rand, view_t views[3],
+                      paramset_t* params)
+{
+    uint8_t a[3];
+    uint8_t b[3];
+    uint8_t c[3];
+    /* a, b, c hold the three input bits of one S-box for each of the three shares; ab, bc, ca their pairwise products. */
+    uint8_t ab[3];
+    uint8_t bc[3];
+    uint8_t ca[3];
+
+    uint32_t i;
+    for (i = 0; i < params->numSboxes * 3; i += 3) {
+        /* Load state bits i+2, i+1, i of every share. */
+        uint8_t j;
+        for (j = 0; j < 3; j++) {
+            a[j] = getBitFromWordArray(state[j], i + 2);
+            b[j] = getBitFromWordArray(state[j], i + 1);
+            c[j] = getBitFromWordArray(state[j], i);
+        }
+
+        mpc_AND(a, b, ab, rand, views);
+        mpc_AND(b, c, bc, rand, views);
+        mpc_AND(c, a, ca, rand, views);
+        /* Overwrite the same three bit positions with the S-box outputs. */
+        for (j = 0; j < 3; j++) {
+            setBitInWordArray(state[j], i + 2, a[j] ^ (bc[j]));
+            setBitInWordArray(state[j], i + 1, a[j] ^ b[j] ^ (ca[j]));
+            setBitInWordArray(state[j], i, a[j] ^ b[j] ^ c[j] ^ (ab[j]));
+        }
+    }
+}
+
+void mpc_LowMC(randomTape_t* tapes, view_t views[3],
+               const uint32_t* plaintext, uint32_t* slab, paramset_t* params)
+{
+    uint32_t* keyShares[3];
+    uint32_t* state[3];
+    uint32_t* roundKey[3];
+    /* slab is carved into six consecutive stateSizeWords-word regions: roundKey[0..2] followed by state[0..2]. */
+    roundKey[0] = slab;
+    roundKey[1] = slab + params->stateSizeWords;
+    roundKey[2] = roundKey[1] + params->stateSizeWords;
+    state[0] = roundKey[2] + params->stateSizeWords;
+    state[1] = state[0] + params->stateSizeWords;
+    state[2] = state[1] + params->stateSizeWords;
+    /* The three round keys are contiguous, so one memset clears them all. */
+    memset(roundKey[0], 0, 3 * params->stateSizeBytes);
+    int i;
+    for (i = 0; i < 3; i++) {
+        keyShares[i] = views[i].inputShare;
+        memset(state[i], 0x00, params->stateSizeBytes);
+    }
+    mpc_xor_constant(state, plaintext, params->stateSizeWords);
+    /* Round key 0 is derived from the key shares with KMatrix(0) and applied to the state via mpc_xor. */
+    mpc_matrix_mul(keyShares, KMatrix(0, params), roundKey, params, 3);
+    mpc_xor(state, roundKey, params->stateSizeWords, 3);
+
+    uint32_t r;
+    for (r = 1; r <= params->numRounds; r++) {
+        mpc_matrix_mul(keyShares, KMatrix(r, params), roundKey, params, 3);
+        mpc_substitution(state, tapes, views, params);
+        mpc_matrix_mul(state, LMatrix(r - 1, params), state, params, 3);
+        mpc_xor_constant(state, RConstant(r - 1, params), params->stateSizeWords);
+        mpc_xor(state, roundKey, params->stateSizeWords, 3);
+    }
+    /* The final state shares become the three views' output shares. */
+    for (i = 0; i < 3; i++) {
+        memcpy(views[i].outputShare, state[i], params->stateSizeBytes);
+    }
+
+}
+
+void runMPC(view_t views[3], randomTape_t* rand, uint32_t* plaintext, uint32_t* slab, paramset_t* params)
+{
+    /* Every run starts reading the random tapes from bit position 0. */
+    rand->pos = 0;
+    mpc_LowMC(rand, views, plaintext, slab, params);
+}
+
+
+seeds_t* computeSeeds(uint32_t* privateKey, uint32_t* publicKey, uint32_t* plaintext,
+                      const uint8_t* message, size_t messageByteLength, paramset_t* params)
+{
+    /* Derives all seeds from one hash over the private key, the message, the public key,
+     * the plaintext and the 16-bit state size (passed through toLittleEndian), in that order. */
+    const uint16_t sizeTag = toLittleEndian((uint16_t)params->stateSizeBits);
+    seeds_t* result = allocateSeeds(params);
+    HashInstance hasher;
+    HashInit(&hasher, params, HASH_PREFIX_NONE);
+    HashUpdate(&hasher, (uint8_t*)privateKey, params->stateSizeBytes);
+    HashUpdate(&hasher, message, messageByteLength);
+    HashUpdate(&hasher, (uint8_t*)publicKey, params->stateSizeBytes);
+    HashUpdate(&hasher, (uint8_t*)plaintext, params->stateSizeBytes);
+    HashUpdate(&hasher, (const uint8_t*)&sizeTag, sizeof(sizeTag));
+    HashFinal(&hasher);
+    /* A single squeeze writes seedSizeBytes * 3 * numZKBRounds bytes starting at getSeed(result, 0, 0). */
+    HashSqueeze(&hasher, getSeed(result, 0, 0), params->seedSizeBytes * 3 * params->numZKBRounds);
+    return result;
+}
+
+int sign(uint32_t* privateKey, uint32_t* pubKey, uint32_t* plaintext, const uint8_t* message,
+         size_t messageByteLength, signature_t* sig, paramset_t* params)
+{
+    /* Returns EXIT_SUCCESS with sig filled in, or EXIT_FAILURE if a random tape could not
+     * be created. Everything allocated here is released on both paths via `cleanup`. */
+    bool status;
+    int ret = EXIT_FAILURE;
+    uint32_t** viewOutputs = NULL;     /* allocated after the MPC rounds; free(NULL) is a no-op */
+    uint32_t output[LOWMC_MAX_STATE_SIZE];
+
+    /* Allocate views and commitments for all parallel iterations */
+    view_t** views = allocateViews(params);
+    commitments_t* as = allocateCommitments(params);
+    g_commitments_t* gs = allocateGCommitments(params);
+
+    /* Compute seeds for all parallel iterations */
+    seeds_t* seeds = computeSeeds(privateKey, pubKey, plaintext, message, messageByteLength, params);
+
+    //Allocate a random tape (re-used per parallel iteration), and a temporary buffer
+    randomTape_t tape;
+    allocateRandomTape(&tape, params);
+    uint8_t* tmp = malloc( MAX(9 * params->stateSizeBytes, params->stateSizeBytes + params->andSizeBytes));
+
+    uint32_t k;
+    for (k = 0; k < params->numZKBRounds; k++) {
+        // for first two players get all tape INCLUDING INPUT SHARE from seed
+        int j;
+        for (j = 0; j < 2; j++) {
+            status = createRandomTape(getSeed(seeds, k, j), tmp, params->stateSizeBytes + params->andSizeBytes, params);
+            if (!status) {
+                goto cleanup;
+            }
+            memcpy(views[k][j].inputShare, tmp, params->stateSizeBytes);
+            memcpy(tape.tape[j], tmp + params->stateSizeBytes, params->andSizeBytes);
+        }
+        // Now set third party's wires. The random bits are from the seed, the input is
+        // the XOR of other two inputs and the private key
+        status = createRandomTape(getSeed(seeds, k, 2), tape.tape[2], params->andSizeBytes, params);
+        if (!status) {
+            goto cleanup;
+        }
+        uint32_t j1;
+        for (j1 = 0; j1 < params->stateSizeWords; j1++) {
+            views[k][2].inputShare[j1] = privateKey[j1]
+                                        ^ views[k][0].inputShare[j1]
+                                        ^ views[k][1].inputShare[j1];
+        }
+
+        runMPC(views[k], &tape, plaintext, (uint32_t*)tmp, params);
+
+        //Committing
+        Commit(getSeed(seeds, k, 0), views[k][0], as[k].hashes[0], params);
+        Commit(getSeed(seeds, k, 1), views[k][1], as[k].hashes[1], params);
+        Commit(getSeed(seeds, k, 2), views[k][2], as[k].hashes[2], params);
+
+        if (params->transform == TRANSFORM_UR) {
+            G(0, getSeed(seeds, k, 0), &views[k][0], gs[k].G[0], params);
+            G(1, getSeed(seeds, k, 1), &views[k][1], gs[k].G[1], params);
+            G(2, getSeed(seeds, k, 2), &views[k][2], gs[k].G[2], params);
+        }
+    }
+
+    //Generating challenges
+    viewOutputs = malloc(params->numZKBRounds * 3 * sizeof(uint32_t*));
+
+    size_t ii, jj;
+    for (ii = 0; ii < params->numZKBRounds; ii++)
+        for (jj = 0; jj < 3; jj++)
+            VIEW_OUTPUTS(ii, jj) = views[ii][jj].outputShare;
+
+    uint32_t j;
+    for (j = 0; j < params->stateSizeWords; j++)
+        output[j] = (VIEW_OUTPUTS(0, 0))[j] ^ (VIEW_OUTPUTS(0, 1))[j] ^ (VIEW_OUTPUTS(0, 2))[j];
+
+    H3(output, plaintext, viewOutputs, as,
+       sig->challengeBits, message, messageByteLength, gs, params);
+
+    //Packing Z
+    size_t i;
+    for (i = 0; i < params->numZKBRounds; i++) {
+        proof_t* proof = &sig->proofs[i];
+        prove(proof, getChallenge(sig->challengeBits, i), &seeds[i],
+              views[i], &as[i], (gs == NULL) ? NULL : &gs[i], params);
+    }
+    ret = EXIT_SUCCESS;
+
+cleanup:
+    free(tmp);
+    freeViews(views, params);
+    freeCommitments(as);
+    freeRandomTape(&tape);
+    freeGCommitments(gs);
+    free(viewOutputs);
+    freeSeeds(seeds);
+    return ret;
+}
+
+/*** Serialization functions ***/
+
+int serializeSignature(const signature_t* sig, uint8_t* sigBytes, size_t sigBytesLen, paramset_t* params)
+{
+    /* Writes the challenge bits followed by one record per round into sigBytes.
+     * Returns the number of bytes written, or -1 when sigBytesLen is smaller
+     * than the size bound computed below (nothing is written in that case). */
+    const proof_t* proofs = sig->proofs;
+    const uint8_t* challengeBits = sig->challengeBits;
+    uint8_t* cursor = sigBytes;
+    size_t round;
+
+    /* Validate input buffer is large enough */
+    size_t bytesRequired = numBytes(2 * params->numZKBRounds) +
+                           params->numZKBRounds * (2 * params->seedSizeBytes + params->stateSizeBytes + params->andSizeBytes + params->digestSizeBytes);
+    if (params->transform == TRANSFORM_UR) {
+        bytesRequired += params->UnruhGWithoutInputBytes * params->numZKBRounds;
+    }
+    if (bytesRequired > sigBytesLen) {
+        return -1;
+    }
+
+    memcpy(cursor, challengeBits, numBytes(2 * params->numZKBRounds));
+    cursor += numBytes(2 * params->numZKBRounds);
+
+    for (round = 0; round < params->numZKBRounds; round++) {
+        const proof_t* proof = &proofs[round];
+        const uint8_t challenge = getChallenge(challengeBits, round);
+
+        memcpy(cursor, proof->view3Commitment, params->digestSizeBytes);
+        cursor += params->digestSizeBytes;
+
+        if (params->transform == TRANSFORM_UR) {
+            const size_t gLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
+            memcpy(cursor, proof->view3UnruhG, gLength);
+            cursor += gLength;
+        }
+
+        memcpy(cursor, proof->communicatedBits, params->andSizeBytes);
+        cursor += params->andSizeBytes;
+
+        memcpy(cursor, proof->seed1, params->seedSizeBytes);
+        cursor += params->seedSizeBytes;
+
+        memcpy(cursor, proof->seed2, params->seedSizeBytes);
+        cursor += params->seedSizeBytes;
+
+        /* The input share is only part of the record for challenges 1 and 2. */
+        if (challenge == 1 || challenge == 2) {
+            memcpy(cursor, proof->inputShare, params->stateSizeBytes);
+            cursor += params->stateSizeBytes;
+        }
+    }
+
+    /* Number of bytes actually written. */
+    return (int)(cursor - sigBytes);
+}
+
+
+static size_t computeInputShareSize(const uint8_t* challengeBits, size_t stateSizeBytes, paramset_t* params)
+{
+    /* An input share is serialized only for rounds whose challenge is 1 or 2.
+     * When deserializing, the number of bytes to expect therefore depends on
+     * how many challenge values are 1 or 2; this returns that count times
+     * stateSizeBytes, the size of one input share. */
+    size_t sharesPresent = 0;
+    size_t round;
+
+    for (round = 0; round < params->numZKBRounds; round++) {
+        const uint8_t challenge = getChallenge(challengeBits, round);
+        if (challenge == 1 || challenge == 2) {
+            sharesPresent++;
+        }
+    }
+    return sharesPresent * stateSizeBytes;
+}
+
+int deserializeSignature(signature_t* sig, const uint8_t* sigBytes,
+                         size_t sigBytesLen, paramset_t* params)
+{
+    /* Parses sigBytes into sig->challengeBits and sig->proofs (storage is not allocated here).
+     * Returns EXIT_FAILURE if the input is too short or holds a challenge outside {0,1,2}. */
+    proof_t* proofs = sig->proofs;
+    uint8_t* challengeBits = sig->challengeBits;
+    size_t i;
+    /* Validate input buffer is large enough */
+    if (sigBytesLen < numBytes(2 * params->numZKBRounds)) {     /* ensure the input has at least the challenge */
+        return EXIT_FAILURE;
+    }
+    size_t inputShareSize = computeInputShareSize(sigBytes, params->stateSizeBytes, params);
+    size_t bytesExpected = numBytes(2 * params->numZKBRounds) +
+                           params->numZKBRounds * (2 * params->seedSizeBytes + params->andSizeBytes + params->digestSizeBytes) + inputShareSize;
+    /* Pre-scan the challenges: reject the value 3 and, for TRANSFORM_UR, add the exact G length the copy loop consumes. */
+    for (i = 0; i < params->numZKBRounds; i++) {
+        uint8_t challenge = getChallenge(sigBytes, i);
+        if (challenge > 2) {
+            return EXIT_FAILURE;
+        }
+        if (params->transform == TRANSFORM_UR) {
+            bytesExpected += (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
+        }
+    }
+    if (sigBytesLen < bytesExpected) {
+        return EXIT_FAILURE;
+    }
+
+    memcpy(challengeBits, sigBytes, numBytes(2 * params->numZKBRounds));
+    sigBytes += numBytes(2 * params->numZKBRounds);
+
+    for (i = 0; i < params->numZKBRounds; i++) {
+        uint8_t challenge = getChallenge(challengeBits, i);
+        memcpy(proofs[i].view3Commitment, sigBytes, params->digestSizeBytes);
+        sigBytes += params->digestSizeBytes;
+        if (params->transform == TRANSFORM_UR) {
+            size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
+            memcpy(proofs[i].view3UnruhG, sigBytes, view3UnruhLength);
+            sigBytes += view3UnruhLength;
+        }
+        memcpy(proofs[i].communicatedBits, sigBytes, params->andSizeBytes);
+        sigBytes += params->andSizeBytes;
+        memcpy(proofs[i].seed1, sigBytes, params->seedSizeBytes);
+        sigBytes += params->seedSizeBytes;
+        memcpy(proofs[i].seed2, sigBytes, params->seedSizeBytes);
+        sigBytes += params->seedSizeBytes;
+
+        if (challenge == 1 || challenge == 2) {
+            memcpy(proofs[i].inputShare, sigBytes, params->stateSizeBytes);
+            sigBytes += params->stateSizeBytes;
+        }
+    }
+    return EXIT_SUCCESS;
+}
+
+
+
+
diff --git a/dap-sdk/net/client/dap_client_http.c b/dap-sdk/net/client/dap_client_http.c
index 4b958587692ef6bd200c93ab331871e0494b1ccb..e47609b4fb53fe9c30afc377cae9bff0b00daca6 100644
--- a/dap-sdk/net/client/dap_client_http.c
+++ b/dap-sdk/net/client/dap_client_http.c
@@ -22,6 +22,7 @@
 
 #include <unistd.h>
 #include <errno.h>
+#include <fcntl.h>
 
 #include "dap_common.h"
 #include "dap_strfuncs.h"
diff --git a/dap-sdk/net/core/dap_events.c b/dap-sdk/net/core/dap_events.c
index c95918cd84601c1ceebc167a8409aa5f4323f873..80e5ff6277a15f990de08e504fd0ee1a10c2c4c1 100644
--- a/dap-sdk/net/core/dap_events.c
+++ b/dap-sdk/net/core/dap_events.c
@@ -50,6 +50,18 @@
 #include <sys/timerfd.h>
 #endif
 
+
+#include <pthread.h>
+
+#ifdef DAP_OS_BSD
+#include <pthread_np.h>
+#include <sys/event.h>
+#include <err.h>
+#include <fcntl.h>
+typedef cpuset_t cpu_set_t; // Make the BSD cpuset_t structure available under the cpu_set_t name
+
+#endif
+
 #if defined(DAP_OS_ANDROID)
 #define NO_POSIX_SHED
 #define NO_TIMER
@@ -62,7 +74,7 @@
 #include <mswsock.h>
 #include <ws2tcpip.h>
 #include <io.h>
-#include <pthread.h>
+
 #endif
 
 #include <utlist.h>
@@ -93,7 +105,11 @@ uint32_t dap_get_cpu_count( )
 #ifndef NO_POSIX_SHED
   cpu_set_t cs;
   CPU_ZERO( &cs );
+#if defined (DAP_OS_ANDROID) 
   sched_getaffinity( 0, sizeof(cs), &cs );
+#else
+  pthread_getaffinity_np(pthread_self(), sizeof(cs), &cs);
+#endif
 
   uint32_t count = 0;
   for ( int i = 0; i < 32; i++ ){
@@ -246,11 +262,11 @@ int dap_events_start( dap_events_t *a_events )
         l_worker->id = i;
         l_worker->events = a_events;
         pthread_rwlock_init(&l_worker->esocket_rwlock,NULL);
-
-#ifdef DAP_EVENTS_CAPS_EPOLL
-        l_worker->epoll_fd = epoll_create( DAP_MAX_EVENTS_COUNT );
         pthread_mutex_init(& l_worker->started_mutex, NULL);
         pthread_cond_init( & l_worker->started_cond, NULL);
+
+#if defined(DAP_EVENTS_CAPS_EPOLL)
+        l_worker->epoll_fd = epoll_create( DAP_MAX_EVENTS_COUNT );
         //log_it(L_DEBUG, "Created event_fd %d for worker %u", l_worker->epoll_fd,i);
 #ifdef DAP_OS_WINDOWS
         if (!l_worker->epoll_fd) {
@@ -265,6 +281,10 @@ int dap_events_start( dap_events_t *a_events )
             DAP_DELETE(l_worker);
             return -1;
         }
+#elif defined(DAP_EVENTS_CAPS_POLL)
+#elif defined(DAP_EVENTS_CAPS_KQUEUE)
+#else
+#error "Not defined worker init for your platform"
 #endif
         s_workers[i] = l_worker;
         pthread_mutex_lock(&l_worker->started_mutex);
diff --git a/dap-sdk/net/core/dap_events_socket.c b/dap-sdk/net/core/dap_events_socket.c
index b8f5ab9997a637e62bc1c3df9852063ae3c9036e..6d5817df2d4aae9edad35539b3d76831d4760720 100644
--- a/dap-sdk/net/core/dap_events_socket.c
+++ b/dap-sdk/net/core/dap_events_socket.c
@@ -29,20 +29,26 @@
 #include <assert.h>
 #include <errno.h>
 
-#ifndef _WIN32
+#if defined (DAP_OS_LINUX)
 #include <sys/epoll.h>
 #include <sys/types.h>
 #include <sys/select.h>
 #include <unistd.h>
 #include <sys/socket.h>
 #include <arpa/inet.h>
+#elif defined (DAP_OS_BSD)
+#include <sys/types.h>
+#include <sys/select.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
 
-
-#else
+#elif defined (DAP_OS_WINDOWS)
 #include <winsock2.h>
 #include <windows.h>
 #include <mswsock.h>
 #include <io.h>
+
 #endif
 
 
@@ -52,6 +58,14 @@
 #include <sys/resource.h>
 #endif
 
+#ifdef DAP_OS_BSD
+#include <pthread_np.h>
+#include <sys/event.h>
+#include <err.h>
+typedef cpuset_t cpu_set_t; // Make the BSD cpuset_t structure available under the cpu_set_t name
+#endif
+
+
 #include <fcntl.h>
 #include <pthread.h>
 
@@ -95,7 +109,6 @@ int dap_events_socket_init( )
 #if defined (DAP_EVENTS_CAPS_QUEUE_MQUEUE)
 #include <sys/time.h>
 #include <sys/resource.h>
-
     struct rlimit l_mqueue_limit;
     l_mqueue_limit.rlim_cur = RLIM_INFINITY;
     l_mqueue_limit.rlim_max = RLIM_INFINITY;
@@ -161,13 +174,17 @@ dap_events_socket_t *dap_events_socket_wrap_no_add( dap_events_t *a_events,
     l_ret->ev_base_flags = EPOLLERR | EPOLLRDHUP | EPOLLHUP;
     #elif defined(DAP_EVENTS_CAPS_POLL)
     l_ret->poll_base_flags = POLLERR | POLLRDHUP | POLLHUP;
+    #elif defined(DAP_EVENTS_CAPS_KQUEUE)
+    l_ret->kqueue_base_flags = EV_ADD | EV_ENABLE | EV_CLEAR;
+    l_ret->kqueue_base_fflags = NOTE_CLOSE | NOTE_CLOSE_WRITE | NOTE_DELETE | NOTE_REVOKE ;
+    l_ret->kqueue_base_filter = EVFILT_VNODE;
     #endif
 
     if ( a_sock!= 0 && a_sock != -1){
         pthread_rwlock_wrlock(&a_events->sockets_rwlock);
         HASH_ADD_INT(a_events->sockets, socket, l_ret);
         pthread_rwlock_unlock(&a_events->sockets_rwlock);
-    }else
+    }else if(s_debug_reactor)
         log_it(L_WARNING, "Be carefull, you've wrapped socket 0 or -1 so it wasn't added to global list. Do it yourself when possible");
 
     //log_it( L_DEBUG,"Dap event socket wrapped around %d sock a_events = %X", a_sock, a_events );
@@ -259,6 +276,10 @@ dap_events_socket_t * s_create_type_pipe(dap_worker_t * a_w, dap_events_socket_c
     l_es->ev_base_flags = EPOLLIN | EPOLLERR | EPOLLRDHUP | EPOLLHUP;
 #elif defined(DAP_EVENTS_CAPS_POLL)
     l_es->poll_base_flags = POLLIN | POLLERR | POLLRDHUP | POLLHUP;
+#elif defined(DAP_EVENTS_CAPS_KQUEUE)
+    l_es->kqueue_base_flags = EV_ADD | EV_ENABLE | EV_CLEAR;
+    l_es->kqueue_base_fflags = NOTE_CLOSE | NOTE_CLOSE_WRITE | NOTE_DELETE | NOTE_REVOKE ;
+    l_es->kqueue_base_filter = EVFILT_VNODE;
 #else
 #error "Not defined s_create_type_pipe for your platform"
 #endif
@@ -325,7 +346,8 @@ dap_events_socket_t * dap_events_socket_create(dap_events_desc_type_t a_type, da
         case DESCRIPTOR_TYPE_SOCKET_LOCAL_LISTENING:
 #ifdef DAP_OS_UNIX
             l_sock_class = AF_LOCAL;
-#elif DAP_OS_WIDNOWS
+#elif defined DAP_OS_WINDOWS
+            l_sock_class = AF_INET;
 #endif
         break;
         default:
@@ -333,17 +355,18 @@ dap_events_socket_t * dap_events_socket_create(dap_events_desc_type_t a_type, da
             return NULL;
     }
 
-#ifdef WIN32
-    SOCKET l_sock;
+#ifdef DAP_OS_WINDOWS
+    SOCKET l_sock = socket(l_sock_class, l_sock_type, IPPROTO_IP);
+    u_long l_socket_flags = 1;
+    if (ioctlsocket((SOCKET)l_sock, (long)FIONBIO, &l_socket_flags))
+        log_it(L_ERROR, "Error ioctl %d", WSAGetLastError());
 #else
-    int l_sock;
-#endif
-    l_sock = socket(l_sock_class, l_sock_type | SOCK_NONBLOCK , 0);
+    int l_sock = socket(l_sock_class, l_sock_type | SOCK_NONBLOCK , 0);
     if (l_sock == INVALID_SOCKET) {
         log_it(L_ERROR, "Socket create error");
         return NULL;
     }
-
+#endif
     dap_events_socket_t * l_es =dap_events_socket_wrap_no_add(dap_events_get_default(),l_sock,a_callbacks);
     if(!l_es){
         log_it(L_CRITICAL,"Can't allocate memory for the new esocket");
@@ -405,9 +428,16 @@ dap_events_socket_t * dap_events_socket_queue_ptr_create_input(dap_events_socket
     l_es->ev_base_flags = EPOLLERR | EPOLLRDHUP | EPOLLHUP;
 #elif defined(DAP_EVENTS_CAPS_POLL)
     l_es->poll_base_flags = POLLERR | POLLRDHUP | POLLHUP;
+#elif defined(DAP_EVENTS_CAPS_KQUEUE)
+    // We don't create descriptor for kqueue at all
+    l_es->fd = -1;
+    l_es->kqueue_base_flags = EV_ADD | EV_ENABLE | EV_CLEAR;
+    l_es->kqueue_base_fflags = NOTE_TRIGGER;
+    l_es->kqueue_base_filter = EVFILT_USER;
 #else
 #error "Not defined s_create_type_pipe for your platform"
 #endif
+
 #ifdef DAP_EVENTS_CAPS_QUEUE_MQUEUE
     l_es->mqd = a_es->mqd;
     char l_mq_name[64];
@@ -434,7 +464,7 @@ dap_events_socket_t * dap_events_socket_queue_ptr_create_input(dap_events_socket
         return NULL;
     }
     assert(l_es->mqd);
-#elif defined (DAP_EVENTS_CAPS_QUEUE_PIPE2)
+#elif defined (DAP_EVENTS_CAPS_QUEUE_PIPE2) || defined (DAP_EVENTS_CAPS_QUEUE_PIPE)
     l_es->fd = a_es->fd2;
 #elif defined DAP_EVENTS_CAPS_MSMQ
     l_es->mqh       = a_es->mqh;
@@ -472,6 +502,9 @@ dap_events_socket_t * dap_events_socket_queue_ptr_create_input(dap_events_socket
         log_it(L_ERROR, "Can't open message queue for queue type, error: 0x%x", hr);
         return NULL;
     }
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+    // We don't create descriptor for kqueue at all
+    l_es->fd = l_es->fd2 = -1;
 #else
 #error "Not defined dap_events_socket_queue_ptr_create_input() for this platform"
 #endif
@@ -510,17 +543,25 @@ dap_events_socket_t * s_create_type_queue_ptr(dap_worker_t * a_w, dap_events_soc
     l_es->ev_base_flags = EPOLLIN | EPOLLERR | EPOLLRDHUP | EPOLLHUP;
 #elif defined(DAP_EVENTS_CAPS_POLL)
     l_es->poll_base_flags = POLLIN | POLLERR | POLLRDHUP | POLLHUP;
+#elif defined(DAP_EVENTS_CAPS_KQUEUE)
+    l_es->kqueue_base_flags = EV_ADD | EV_ENABLE | EV_CLEAR;
+    l_es->kqueue_base_fflags = NOTE_TRIGGER;
+    l_es->kqueue_base_filter = EVFILT_USER;
 #else
 #error "Not defined s_create_type_queue_ptr for your platform"
 #endif
 
 
-#ifdef DAP_EVENTS_CAPS_QUEUE_PIPE2
+#if defined(DAP_EVENTS_CAPS_QUEUE_PIPE2) || defined(DAP_EVENTS_CAPS_QUEUE_PIPE)
     int l_pipe[2];
     int l_errno;
     char l_errbuf[128];
     l_errbuf[0]=0;
+#if defined(DAP_EVENTS_CAPS_QUEUE_PIPE2)
     if( pipe2(l_pipe,O_DIRECT | O_NONBLOCK ) < 0 ){
+#elif defined(DAP_EVENTS_CAPS_QUEUE_PIPE)
+    if( pipe(l_pipe) < 0 ){
+#endif
         l_errno = errno;
         strerror_r(l_errno, l_errbuf, sizeof (l_errbuf));
         switch (l_errno) {
@@ -529,10 +570,27 @@ dap_events_socket_t * s_create_type_queue_ptr(dap_worker_t * a_w, dap_events_soc
         }
         DAP_DELETE(l_es);
         return NULL;
-    }//else
+    }
+    //else
      //   log_it(L_DEBUG, "Created one-way unnamed packet pipe %d->%d", l_pipe[0], l_pipe[1]);
     l_es->fd = l_pipe[0];
     l_es->fd2 = l_pipe[1];
+    
+#if defined(DAP_EVENTS_CAPS_QUEUE_PIPE)
+    // Plain pipe() takes no flags (unlike pipe2()), so switch both ends to non-blocking mode via fcntl()
+    if (l_es->fd > 0 && l_es->fd2 > 0 ) {
+        int l_flags = fcntl(l_es->fd, F_GETFL, 0);
+        if (l_flags != -1){
+            l_flags |= O_NONBLOCK;
+            fcntl(l_es->fd, F_SETFL, l_flags);
+        }
+        l_flags = fcntl(l_es->fd2, F_GETFL, 0);
+        if (l_flags != -1){
+            l_flags |= O_NONBLOCK;
+            fcntl(l_es->fd2, F_SETFL, l_flags);
+        }
+    }
+#endif
 
 #if !defined (DAP_OS_ANDROID)
     FILE* l_sys_max_pipe_size_fd = fopen("/proc/sys/fs/pipe-max-size", "r");
@@ -676,6 +734,9 @@ dap_events_socket_t * s_create_type_queue_ptr(dap_worker_t * a_w, dap_events_soc
     if (hr != MQ_OK) {
         log_it(L_DEBUG, "Message queue %d NOT purged, possible data corruption, err %d", l_es->mq_num, hr);
     }
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+    // We don't create descriptor for kqueue at all
+    l_es->fd = l_es->fd2 =  -1;
 
 #else
 #error "Not implemented s_create_type_queue_ptr() on your platform"
@@ -786,8 +847,13 @@ int dap_events_socket_queue_proc_input_unsafe(dap_events_socket_t * a_esocket)
             }
             for (u_int pad = 0; pad < l_mpvar[1].ulVal; pad += sizeof(void*)) {
                 memcpy(&l_queue_ptr, l_body + pad, sizeof(void*));
-                a_esocket->callbacks.queue_ptr_callback (a_esocket, l_queue_ptr);
+                if(a_esocket->callbacks.queue_ptr_callback)
+            	    a_esocket->callbacks.queue_ptr_callback (a_esocket, l_queue_ptr);
             }
+#elif defined DAP_EVENTS_CAPS_KQUEUE
+	    l_queue_ptr = (void*) a_esocket->kqueue_event_catched->ident;
+	    if(a_esocket->callbacks.queue_ptr_callback)
+		a_esocket->callbacks.queue_ptr_callback (a_esocket, l_queue_ptr);
 #else
 #error "No Queue fetch mechanism implemented on your platform"
 #endif
@@ -798,10 +864,13 @@ int dap_events_socket_queue_proc_input_unsafe(dap_events_socket_t * a_esocket)
                 log_it(L_ERROR, "Queue socket %d received invalid data, error %d", a_esocket->socket, WSAGetLastError());
                 return -1;
             }
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+	    void * l_queue_ptr = (void*) a_esocket->kqueue_event_catched->ident;
+	    size_t l_queue_ptr_size = (size_t) a_esocket->kqueue_event_catched->data;
+            a_esocket->callbacks.queue_callback(a_esocket, l_queue_ptr, l_queue_ptr_size);
 #else
             size_t l_read = read(a_esocket->socket, a_esocket->buf_in, a_esocket->buf_in_size_max );
 #endif
-            a_esocket->callbacks.queue_callback(a_esocket, a_esocket->buf_in, l_read);
         }
     }else{
         log_it(L_ERROR, "Queue socket %d accepted data but callback is NULL ", a_esocket->socket);
@@ -832,6 +901,10 @@ dap_events_socket_t * s_create_type_event(dap_worker_t * a_w, dap_events_socket_
     l_es->ev_base_flags = EPOLLIN | EPOLLERR | EPOLLRDHUP | EPOLLHUP;
 #elif defined(DAP_EVENTS_CAPS_POLL)
     l_es->poll_base_flags = POLLIN | POLLERR | POLLRDHUP | POLLHUP;
+#elif defined(DAP_EVENTS_CAPS_KQUEUE)
+    l_es->kqueue_base_flags = EV_ADD | EV_ENABLE | EV_CLEAR;
+    l_es->kqueue_base_fflags = NOTE_TRIGGER;
+    l_es->kqueue_base_filter = EVFILT_USER;
 #else
 #error "Not defined s_create_type_event for your platform"
 #endif
@@ -893,6 +966,10 @@ dap_events_socket_t * s_create_type_event(dap_worker_t * a_w, dap_events_socket_
         l_es->port = l_addr.sin_port;
         //log_it(L_DEBUG, "Bound to port %d", l_addr.sin_port);
     }
+#elif defined(DAP_EVENTS_CAPS_KQUEUE)
+    l_es->fd2 = l_es->fd = -1;
+#else 
+#error "Not defined s_create_type_event() on your platform"
 #endif
     return l_es;
 }
@@ -960,6 +1037,10 @@ void dap_events_socket_event_proc_input_unsafe(dap_events_socket_t *a_esocket)
             a_esocket->callbacks.event_callback(a_esocket, l_value);
             return;
         }
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+	unsigned int l_value = (unsigned int) a_esocket->kqueue_event_catched->data ;
+	a_esocket->callbacks.event_callback(a_esocket, l_value);
+
 #else
 #error "No Queue fetch mechanism implemented on your platform"
 #endif
@@ -974,8 +1055,6 @@ typedef struct dap_events_socket_buf_item
     void *arg;
 } dap_events_socket_buf_item_t;
 
-int dap_events_socket_queue_ptr_send(dap_events_socket_t * a_es, void* a_arg);
-
 /**
  *  Waits on the socket
  *  return 0: timeout, 1: may send data, -1 error
@@ -1070,7 +1149,24 @@ static void add_ptr_to_buf(dap_events_socket_t * a_es, void* a_arg)
  */
 int dap_events_socket_queue_ptr_send_to_input(dap_events_socket_t * a_es_input, void * a_arg)
 {
-    volatile void * l_arg = a_arg;
+    void * l_arg = a_arg;
+#if defined (DAP_EVENTS_CAPS_KQUEUE)
+    if (a_es_input->pipe_out){
+        int l_ret = -1; // Stays -1 when the output esocket has neither a worker nor a proc thread
+        struct kevent l_event={0};
+        dap_events_socket_t * l_es = a_es_input->pipe_out;
+        EV_SET(&l_event,(uintptr_t) a_arg, l_es->kqueue_base_filter,l_es->kqueue_base_flags, l_es->kqueue_base_fflags,0, l_es);
+        if(l_es->worker)
+            l_ret=kevent(l_es->worker->kqueue_fd,&l_event,1,NULL,0,NULL);
+        else if (l_es->proc_thread)
+            l_ret=kevent(l_es->proc_thread->kqueue_fd,&l_event,1,NULL,0,NULL);
+        return l_ret == -1 ? -1 : 0; // kevent() with an empty eventlist returns 0 on success, -1 on error
+    }else{
+        log_it(L_ERROR,"No pipe_out pointer for queue socket, possible created wrong");
+        return -2;
+    }
+    
+#else    
     /*if (a_es_input->buf_out_size >= sizeof(void*)) {
         if (memcmp(a_es_input->buf_out + a_es_input->buf_out_size - sizeof(void*), a_arg, sizeof(void*))) {
             log_it(L_INFO, "Ptr 0x%x already present in input, drop it", a_arg);
@@ -1079,6 +1175,7 @@ int dap_events_socket_queue_ptr_send_to_input(dap_events_socket_t * a_es_input,
     }*/
     return dap_events_socket_write_unsafe(a_es_input, &l_arg, sizeof(l_arg))
             == sizeof(l_arg) ? 0 : 1;
+#endif
 }
 
 /**
@@ -1146,7 +1243,17 @@ int dap_events_socket_queue_ptr_send( dap_events_socket_t * a_es, void* a_arg)
     } else {
         return 0;
     }
-
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+    struct kevent l_event={0};
+    EV_SET(&l_event,(uintptr_t) a_arg, a_es->kqueue_base_filter,a_es->kqueue_base_flags, a_es->kqueue_base_fflags,0, a_es);
+    int l_n;
+    if(a_es->worker)
+        l_n = kevent(a_es->worker->kqueue_fd,&l_event,1,NULL,0,NULL);
+    else if (a_es->proc_thread)
+        l_n = kevent(a_es->proc_thread->kqueue_fd,&l_event,1,NULL,0,NULL);
+    else
+        l_n = -1; // Neither a worker nor a proc thread owns this esocket, nothing was submitted
+    l_ret = l_n != -1 ? (int) sizeof(a_arg) : -1; // kevent() with an empty eventlist returns 0 on success
 #else
 #error "Not implemented dap_events_socket_queue_ptr_send() for this platform"
 #endif
@@ -1183,13 +1290,24 @@ int dap_events_socket_event_signal( dap_events_socket_t * a_es, uint64_t a_value
         return l_errno;
     else
         return 1;
-#elif defined DAP_OS_WINDOWS
+#elif defined (DAP_OS_WINDOWS)
     a_es->buf_out[0] = (u_short)a_value;
     if(dap_sendto(a_es->socket, a_es->port, a_es->buf_out, sizeof(uint64_t)) == SOCKET_ERROR) {
         return WSAGetLastError();
     } else {
         return 0;
     }
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+    struct kevent l_event={0};
+    EV_SET(&l_event,0, a_es->kqueue_base_filter,a_es->kqueue_base_flags, a_es->kqueue_base_fflags,a_value, a_es);
+    int l_n;
+    if(a_es->worker)
+        l_n = kevent(a_es->worker->kqueue_fd,&l_event,1,NULL,0,NULL);
+    else if (a_es->proc_thread)
+        l_n = kevent(a_es->proc_thread->kqueue_fd,&l_event,1,NULL,0,NULL);
+    else
+        l_n = -1; // Neither a worker nor a proc thread owns this esocket, nothing was submitted
+    return l_n == -1 ? -1 : 0; // kevent() with an empty eventlist returns 0 on success, -1 on error
 #else
 #error "Not implemented dap_events_socket_event_signal() for this platform"
 #endif
@@ -1308,6 +1426,33 @@ void dap_events_socket_worker_poll_update_unsafe(dap_events_socket_t * a_esocket
                        a_esocket->worker->poll_count);
             }
         }
+    #elif defined (DAP_EVENTS_CAPS_KQUEUE)
+        if (a_esocket->socket != -1 ){ // Not everything we add in poll
+            struct kevent * l_event = &a_esocket->kqueue_event;
+            short l_filter  =a_esocket->kqueue_base_filter;
+            u_short l_flags =a_esocket->kqueue_base_flags;
+            u_int l_fflags =a_esocket->kqueue_base_fflags;
+            // NOTE(review): EVFILT_* values are filter ids, not bit flags, so OR-ing them below can't
+            // combine read and write interest; presumably each filter needs its own kevent - confirm
+            // Check & add
+            if( a_esocket->flags & DAP_SOCK_READY_TO_READ )
+                l_filter |= EVFILT_READ;
+            if( a_esocket->flags & DAP_SOCK_READY_TO_WRITE || a_esocket->flags &DAP_SOCK_CONNECTING )
+                l_filter |= EVFILT_WRITE;
+            // Submit the change; with an empty eventlist kevent() returns 0 on success and -1 on error
+            EV_SET(l_event, a_esocket->socket, l_filter,l_flags,l_fflags,a_esocket->kqueue_data,a_esocket);
+            if( a_esocket->worker){
+                if ( kevent(a_esocket->worker->kqueue_fd,l_event,1,NULL,0,NULL) == -1 ){
+                    int l_errno = errno;
+                    char l_errbuf[128];
+                    l_errbuf[0]=0;
+                    strerror_r(l_errno, l_errbuf, sizeof (l_errbuf));
+                    log_it(L_ERROR,"Can't update client socket state on kqueue fd %d: \"%s\" (%d)",
+                            a_esocket->worker->kqueue_fd, l_errbuf, l_errno);
+                }
+            }
+        }
+    
     #else
     #error "Not defined dap_events_socket_set_writable_unsafe for your platform"
     #endif
@@ -1418,12 +1563,10 @@ void dap_events_socket_delete_unsafe( dap_events_socket_t * a_esocket , bool a_p
         closesocket( a_esocket->socket );
 #else
         if ( a_esocket->socket && (a_esocket->socket != -1)) {
-        close( a_esocket->socket );
-#ifdef DAP_EVENTS_CAPS_QUEUE_PIPE2
-        if( a_esocket->type == DESCRIPTOR_TYPE_QUEUE){
+    	    close( a_esocket->socket );
+        if( a_esocket->fd2 > 0 ){
             close( a_esocket->fd2);
         }
-#endif
 
 #endif
     }
@@ -1440,7 +1583,7 @@ void dap_events_socket_remove_from_worker_unsafe( dap_events_socket_t *a_es, dap
         log_it(L_INFO, "No worker assigned to esocket %d", a_es->socket);
         return;
     }
-#ifdef DAP_EVENTS_CAPS_EPOLL
+#if defined(DAP_EVENTS_CAPS_EPOLL)
 
     if ( epoll_ctl( a_worker->epoll_fd, EPOLL_CTL_DEL, a_es->socket, &a_es->ev) == -1 ) {
         int l_errno = errno;
@@ -1450,6 +1593,19 @@ void dap_events_socket_remove_from_worker_unsafe( dap_events_socket_t *a_es, dap
                 a_worker->epoll_fd, l_errbuf, l_errno);
     } //else
       //  log_it( L_DEBUG,"Removed epoll's event from dap_worker #%u", a_worker->id );
+#elif defined(DAP_EVENTS_CAPS_KQUEUE)
+    if (a_es->socket != -1 ){
+        struct kevent * l_event = &a_es->kqueue_event;
+        // EV_DELETE matches on ident + filter, so reuse the filter this kevent struct was last set up with
+        EV_SET(l_event, a_es->socket, l_event->filter, EV_DELETE, 0,0,a_es);
+        if ( kevent( a_worker->kqueue_fd,l_event,1,NULL,0,NULL) == -1 ) { // 0 means success with an empty eventlist
+            int l_errno = errno;
+            char l_errbuf[128];
+            strerror_r(l_errno, l_errbuf, sizeof (l_errbuf));
+            log_it( L_ERROR,"Can't remove event socket's handler %d from the epoll_fd %d  \"%s\" (%d)", a_es->socket,
+                a_worker->kqueue_fd, l_errbuf, l_errno);
+        }
+    }
 #elif defined (DAP_EVENTS_CAPS_POLL)
     if (a_es->poll_index < a_worker->poll_count ){
         a_worker->poll[a_es->poll_index].fd = -1;
diff --git a/dap-sdk/net/core/dap_proc_thread.c b/dap-sdk/net/core/dap_proc_thread.c
index 3c64d5bc97c178e90d9e80813fd8a17a3e18fa75..078037817078144dcd6ed19b6deecd5ab085ca57 100644
--- a/dap-sdk/net/core/dap_proc_thread.c
+++ b/dap-sdk/net/core/dap_proc_thread.c
@@ -31,6 +31,11 @@
 #include "wepoll.h"
 #elif defined (DAP_EVENTS_CAPS_POLL)
 #include <poll.h>
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+#include <pthread_np.h>
+#include <sys/event.h>
+#include <err.h>
+typedef cpuset_t cpu_set_t; // Adopt BSD CPU setstructure to POSIX variant
 #else
 #error "Unimplemented poll for this platform"
 #endif
@@ -173,6 +178,55 @@ static void s_proc_event_callback(dap_events_socket_t * a_esocket, uint64_t a_va
 //    log_it(L_DEBUG, "<-- Proc event callback end");
 }
 
+
+/**
+ * @brief dap_proc_thread_assign_esocket_unsafe Register an esocket in the polling backend of a proc thread
+ * @param a_thread Proc thread whose epoll/poll/kqueue set receives the esocket
+ * @param a_esocket Esocket to register; its base flags plus READY_TO_READ/READY_TO_WRITE select the events
+ * @return epoll: epoll_ctl() result, poll: dap_proc_thread_esocket_update_poll_flags() result, kqueue: 0 or -1
+ */
+int dap_proc_thread_assign_esocket_unsafe(dap_proc_thread_t * a_thread, dap_events_socket_t * a_esocket)
+{
+#ifdef DAP_EVENTS_CAPS_EPOLL
+    // Init events for EPOLL
+    a_esocket->ev.events = a_esocket->ev_base_flags ;
+    if(a_esocket->flags & DAP_SOCK_READY_TO_READ )
+        a_esocket->ev.events |= EPOLLIN;
+    if(a_esocket->flags & DAP_SOCK_READY_TO_WRITE )
+        a_esocket->ev.events |= EPOLLOUT;
+    a_esocket->ev.data.ptr = a_esocket;
+    return epoll_ctl(a_thread->epoll_ctl, EPOLL_CTL_ADD, a_esocket->socket, &a_esocket->ev);
+#elif defined (DAP_EVENTS_CAPS_POLL)
+    if (  a_thread->poll_count == a_thread->poll_count_max ){ // realloc
+        a_thread->poll_count_max *= 2;
+        log_it(L_WARNING, "Too many descriptors (%u), resizing array twice to %u", a_thread->poll_count, a_thread->poll_count_max);
+        a_thread->poll =DAP_REALLOC(a_thread->poll, a_thread->poll_count_max * sizeof(*a_thread->poll));
+        a_thread->esockets =DAP_REALLOC(a_thread->esockets, a_thread->poll_count_max * sizeof(*a_thread->esockets));
+    }
+    // Append the esocket that was passed in and remember its slot: the flags update below indexes by poll_index
+    a_esocket->poll_index = a_thread->poll_count;
+    a_thread->poll[a_thread->poll_count].fd = a_esocket->fd;
+    a_thread->poll[a_thread->poll_count].events = a_esocket->poll_base_flags;
+    a_thread->esockets[a_thread->poll_count] = a_esocket;
+    a_thread->poll_count++;
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+    u_short l_flags = a_esocket->kqueue_base_flags;
+    u_int   l_fflags = a_esocket->kqueue_base_fflags;
+    short l_filter = a_esocket->kqueue_base_filter;
+    if(a_esocket->flags & DAP_SOCK_READY_TO_READ )
+        l_fflags |= NOTE_READ;
+    if(a_esocket->flags & DAP_SOCK_READY_TO_WRITE )
+        l_fflags |= NOTE_WRITE;
+    EV_SET(&a_esocket->kqueue_event , a_esocket->socket, l_filter, EV_ADD| l_flags | EV_CLEAR, l_fflags,0, a_esocket);
+    // With an empty eventlist kevent() returns 0 on success, so only -1 signals a failure
+    return kevent ( a_thread->kqueue_fd,&a_esocket->kqueue_event,1,NULL,0,NULL) == -1 ? -1 : 0 ;
+#else
+#error "Unimplemented new esocket on worker callback for current platform"
+#endif
+
+    return dap_proc_thread_esocket_update_poll_flags(a_thread,a_esocket);
+}
+
 /**
  * @brief dap_proc_thread_esocket_update_poll_flags
  * @param a_thread
@@ -204,11 +258,33 @@ int dap_proc_thread_esocket_update_poll_flags(dap_proc_thread_t * a_thread, dap_
         return -1;
     }
 #elif defined (DAP_EVENTS_CAPS_POLL)
+    if (  a_thread->poll_count == a_thread->poll_count_max ){ // realloc
+        a_thread->poll_count_max *= 2;
+        log_it(L_WARNING, "Too many descriptors (%u), resizing array twice to %u", a_thread->poll_count, a_thread->poll_count_max);
+        a_thread->poll =DAP_REALLOC(a_thread->poll, a_thread->poll_count_max * sizeof(*a_thread->poll));
+        a_thread->esockets =DAP_REALLOC(a_thread->esockets, a_thread->poll_count_max * sizeof(*a_thread->esockets));
+    }
     a_thread->poll[a_esocket->poll_index].events= a_esocket->poll_base_flags;
     if( a_esocket->flags & DAP_SOCK_READY_TO_READ)
         a_thread->poll[a_esocket->poll_index].revents |= POLLIN;
     if( a_esocket->flags & DAP_SOCK_READY_TO_WRITE)
         a_thread->poll[a_esocket->poll_index].revents |= POLLOUT;
+        
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+    u_short l_flags = a_esocket->kqueue_base_flags;
+    u_int   l_fflags = a_esocket->kqueue_base_fflags;
+    short l_filter = a_esocket->kqueue_base_filter;
+    if(a_esocket->flags & DAP_SOCK_READY_TO_READ )
+        l_fflags |= NOTE_READ;
+    if(a_esocket->flags & DAP_SOCK_READY_TO_WRITE )
+        l_fflags |= NOTE_WRITE;
+    EV_SET(&a_esocket->kqueue_event , a_esocket->socket, l_filter, EV_ADD| l_flags | EV_CLEAR, l_fflags,0, a_esocket);
+    // With an empty eventlist kevent() returns 0 on success, so only -1 signals a failure
+    if( kevent ( a_thread->kqueue_fd,&a_esocket->kqueue_event,1,NULL,0,NULL) == -1 ){
+        log_it(L_CRITICAL, "Can't add descriptor in proc thread kqueue , err: %d", errno);
+        return -1;
+    }
+        
 #else
 #error "Not defined dap_proc_thread.c::s_update_poll_flags() on your platform"
 #endif
@@ -228,25 +304,7 @@ dap_events_socket_t * dap_proc_thread_create_queue_ptr(dap_proc_thread_t * a_thr
     if(l_es == NULL)
         return NULL;
     l_es->proc_thread = a_thread;
-#ifdef DAP_EVENTS_CAPS_EPOLL
-    l_es->ev.events      = l_es->ev_base_flags ;
-    l_es->ev.data.ptr    = l_es;
-    if( epoll_ctl(a_thread->epoll_ctl, EPOLL_CTL_ADD, l_es->socket, &l_es->ev) != 0 ){
-#ifdef DAP_OS_WINDOWS
-    errno = WSAGetLastError();
-#endif
-        log_it(L_CRITICAL, "Can't add queue input on epoll ctl, err: %d", errno);
-        return NULL;
-    }
-#elif defined(DAP_EVENTS_CAPS_POLL)
-    l_es->poll_index = a_thread->poll_count;
-    a_thread->poll[a_thread->poll_count].fd = l_es->fd;
-    a_thread->poll[a_thread->poll_count].events = l_es->poll_base_flags;
-    a_thread->esockets[a_thread->poll_count] = l_es;
-    a_thread->poll_count++;
-#else
-#error "Not defined dap_proc_thread_create_queue_ptr() on your platform"
-#endif
+    dap_proc_thread_assign_esocket_unsafe (a_thread, l_es);
     return l_es;
 }
 
@@ -261,13 +319,18 @@ static void * s_proc_thread_function(void * a_arg)
     dap_proc_thread_t * l_thread = (dap_proc_thread_t*) a_arg;
     assert(l_thread);
     dap_cpu_assign_thread_on(l_thread->cpu_id);
+    
     struct sched_param l_shed_params;
     l_shed_params.sched_priority = 0;
-#ifdef DAP_OS_WINDOWS 
+#if defined(DAP_OS_WINDOWS)
 	if (!SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST))
         log_it(L_ERROR, "Couldn't set thread priority, err: %d", GetLastError());
-#else
+#elif defined (DAP_OS_LINUX)
     pthread_setschedparam(pthread_self(),SCHED_BATCH ,&l_shed_params);
+#elif defined (DAP_OS_BSD)
+    pthread_setschedparam(pthread_self(),SCHED_OTHER ,&l_shed_params);
+#else
+#error "Undefined set sched param"
 #endif
     l_thread->proc_queue = dap_proc_queue_create(l_thread);
 
@@ -365,10 +428,7 @@ static void * s_proc_thread_function(void * a_arg)
     l_thread->esockets = DAP_NEW_Z_SIZE(dap_events_socket_t*,l_thread->poll_count_max *sizeof (*l_thread->esockets));
 
     // Add proc queue
-    l_thread->poll[l_thread->poll_count].fd = l_thread->proc_queue->esocket->fd;
-    l_thread->poll[l_thread->poll_count].events = l_thread->proc_queue->esocket->poll_base_flags;
-    l_thread->esockets[l_thread->poll_count] = l_thread->proc_queue->esocket;
-    l_thread->poll_count++;
+    dap_proc_thread_assign_esocket_unsafe(l_thread,l_thread->proc_queue->esocket);
 
     // Add proc event
     l_thread->poll[l_thread->poll_count].fd = l_thread->proc_event->fd;
@@ -404,6 +464,25 @@ static void * s_proc_thread_function(void * a_arg)
             l_thread->poll_count++;
         }
     }
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+    // Create kqueue fd
+    l_thread->kqueue_fd = kqueue();
+    l_thread->kqueue_events_count_max = DAP_EVENTS_SOCKET_MAX;
+    l_thread->kqueue_events = DAP_NEW_Z_SIZE(struct kevent, l_thread->kqueue_events_count_max *sizeof(struct kevent));
+
+    dap_proc_thread_assign_esocket_unsafe(l_thread,l_thread->proc_queue->esocket);
+    dap_proc_thread_assign_esocket_unsafe(l_thread,l_thread->proc_event);
+
+    for (size_t n = 0; n< dap_events_worker_get_count(); n++){
+        // Queue asssign
+	dap_proc_thread_assign_esocket_unsafe(l_thread, l_thread->queue_assign_input[n]);
+
+        // Queue IO
+	dap_proc_thread_assign_esocket_unsafe(l_thread, l_thread->queue_io_input[n]);
+
+        // Queue callback
+	dap_proc_thread_assign_esocket_unsafe(l_thread, l_thread->queue_callback_input[n]);
+    }
 
 #else
 #error "Unimplemented poll events analog for this platform"
@@ -416,13 +495,18 @@ static void * s_proc_thread_function(void * a_arg)
     // Main loop
     while (! l_thread->signal_kill){
 
+        int l_selected_sockets;
+        size_t l_sockets_max;
 #ifdef DAP_EVENTS_CAPS_EPOLL
         //log_it(L_DEBUG, "Epoll_wait call");
-        int l_selected_sockets = epoll_wait(l_thread->epoll_ctl, l_epoll_events, DAP_EVENTS_SOCKET_MAX, -1);
-        size_t l_sockets_max = (size_t)l_selected_sockets;
+        l_selected_sockets = epoll_wait(l_thread->epoll_ctl, l_epoll_events, DAP_EVENTS_SOCKET_MAX, -1);
+        l_sockets_max = (size_t)l_selected_sockets;
 #elif defined (DAP_EVENTS_CAPS_POLL)
-        int l_selected_sockets = poll(l_thread->poll,l_thread->poll_count,-1);
-        size_t l_sockets_max = l_thread->poll_count;
+        l_selected_sockets = poll(l_thread->poll,l_thread->poll_count,-1);
+        l_sockets_max = l_thread->poll_count;
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+        l_selected_sockets = kevent(l_thread->kqueue_fd,NULL,0,l_thread->kqueue_events,l_thread->kqueue_events_count_max,NULL);
+        l_sockets_max = l_selected_sockets > 0 ? (size_t) l_selected_sockets : 0; // Only the returned entries are fresh, the rest of the array is stale
 #else
 #error "Unimplemented poll wait analog for this platform"
 #endif
@@ -472,6 +556,14 @@ static void * s_proc_thread_function(void * a_arg)
             l_flag_nval = l_cur_events & POLLNVAL;
             l_flag_pri = l_cur_events & POLLPRI;
             l_flag_msg = l_cur_events & POLLMSG;
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+            l_cur = (dap_events_socket_t*) l_thread->kqueue_events[n].udata;
+            if (!l_cur)
+                continue;
+            l_cur->kqueue_event_catched = &l_thread->kqueue_events[n];
+            u_int l_cur_events = l_thread->kqueue_events[n].fflags;
+            l_flag_write = l_cur->kqueue_event_catched->filter == EVFILT_WRITE; // EVFILT_* are filter ids, compare them instead of masking fflags
+            l_flag_read  = l_cur->kqueue_event_catched->filter == EVFILT_READ || l_cur->kqueue_event_catched->filter == EVFILT_USER; // presumably EVFILT_USER queue/event esockets go through the read path, as with EPOLLIN/POLLIN - confirm
 #else
 #error "Unimplemented fetch esocket after poll"
 #endif
@@ -578,6 +670,19 @@ static void * s_proc_thread_function(void * a_arg)
                                         l_errno = errno;
                                         log_it(L_WARNING,"mq_send %p errno: %d", l_ptr_in, l_errno);
                                     }
+                                #elif defined (DAP_EVENTS_CAPS_KQUEUE)
+                                    struct kevent* l_event=&l_cur->kqueue_event;
+                                    void * l_ptr;
+                                    memcpy(&l_ptr,l_cur->buf_out,sizeof(l_ptr) );
+                                    // The pointer value read from buf_out is carried as the ident of the kevent
+                                    EV_SET(l_event,(uintptr_t) l_ptr, l_cur->kqueue_base_filter,l_cur->kqueue_base_flags, l_cur->kqueue_base_fflags,0, l_cur);
+                                    int l_n = kevent(l_thread->kqueue_fd,l_event,1,NULL,0,NULL);
+                                    if (l_n != -1) // kevent() with an empty eventlist returns 0 on success
+                                        l_bytes_sent = sizeof(l_ptr);
+                                    else{
+                                        l_errno = errno;
+                                        log_it(L_WARNING,"queue ptr send error: kevent %p errno: %d", l_ptr, l_errno);
+                                    }
                                 #else
                                     #error "Not implemented dap_events_socket_queue_ptr_send() for this platform"
                                 #endif
@@ -626,6 +731,19 @@ static void * s_proc_thread_function(void * a_arg)
 #elif defined (DAP_EVENTS_CAPS_POLL)
                 l_thread->poll[n].fd = -1;
                 l_poll_compress = true;
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+                if (l_cur->socket != -1 ){
+                    struct kevent * l_event = &l_cur->kqueue_event;
+                    EV_SET(l_event, l_cur->socket, l_event->filter, EV_DELETE, 0,0,l_cur); // EV_DELETE matches on ident + filter, reuse the stored filter
+                    if ( kevent( l_thread->kqueue_fd,l_event,1,NULL,0,NULL) == -1 ) { // 0 means success with an empty eventlist
+                        int l_errno = errno;
+                        char l_errbuf[128];
+                        strerror_r(l_errno, l_errbuf, sizeof (l_errbuf));
+                        log_it( L_ERROR,"Can't remove event socket's handler %d from the epoll_fd %d  \"%s\" (%d)", l_cur->socket,
+                                l_thread->kqueue_fd, l_errbuf, l_errno);
+                    }
+                }
+
 #else
 #error "Unimplemented poll ctl analog for this platform"
 #endif
diff --git a/dap-sdk/net/core/dap_server.c b/dap-sdk/net/core/dap_server.c
index 8b94ed97b0106018afdb52b4b24dffe5da5c3063..40c578dc10a14ef20fea34200459a702f5b0fb1a 100644
--- a/dap-sdk/net/core/dap_server.c
+++ b/dap-sdk/net/core/dap_server.c
@@ -21,10 +21,11 @@
     along with any DAP SDK based project.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-#ifdef DAP_OS_WINDOWS
+#if defined(DAP_OS_WINDOWS)
 #include "wepoll.h"
 #include <ws2tcpip.h>
-#elif defined DAP_OS_UNIX
+
+#elif defined(DAP_OS_LINUX)
 #include <arpa/inet.h>
 #include <netinet/in.h>
 #include <sys/socket.h>
@@ -32,6 +33,13 @@
 #include <netdb.h>
 #include <sys/timerfd.h>
 #include <sys/un.h>
+#elif defined (DAP_OS_BSD)
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/event.h>
+#include <sys/un.h>
+#include <netdb.h>
 #endif
 
 #include <sys/types.h>
diff --git a/dap-sdk/net/core/dap_timerfd.c b/dap-sdk/net/core/dap_timerfd.c
index 1bbc56e3c1b4bdfc9343be0923140c61fd06b6e7..f92a4af3a8c57a9b8d84f8f6e76f1a2f7d0c2308 100644
--- a/dap-sdk/net/core/dap_timerfd.c
+++ b/dap-sdk/net/core/dap_timerfd.c
@@ -1,8 +1,9 @@
 /*
  * Authors:
  * Alexander Lysikov <alexander.lysikov@demlabs.net>
+ * Dmitriy Gerasimov <dmitriy.gerasimov@demlabs.net>
  * DeM Labs Ltd.   https://demlabs.net
- * Copyright  (c) 2020
+ * Copyright  (c) 2021
  * All rights reserved.
 
  This file is part of DAP SDK the open source project
@@ -114,7 +115,24 @@ dap_timerfd_t* dap_timerfd_start_on_proc_thread(dap_proc_thread_t * a_proc_threa
 dap_timerfd_t* dap_timerfd_create(uint64_t a_timeout_ms, dap_timerfd_callback_t a_callback, void *a_callback_arg)
 {
     dap_timerfd_t *l_timerfd = DAP_NEW(dap_timerfd_t);
-#if defined DAP_OS_UNIX
+    // create events_socket for timer file descriptor
+    dap_events_socket_callbacks_t l_s_callbacks;
+    memset(&l_s_callbacks,0,sizeof (l_s_callbacks));
+    l_s_callbacks.timer_callback = s_es_callback_timer;
+
+    dap_events_socket_t * l_events_socket = dap_events_socket_wrap_no_add(dap_events_get_default(), -1, &l_s_callbacks);
+    l_events_socket->type = DESCRIPTOR_TYPE_TIMER;
+
+    // pass l_timerfd to events_socket
+    l_events_socket->_inheritor = l_timerfd;
+
+    // fill out dap_timerfd_t structure
+    l_timerfd->timeout_ms       = a_timeout_ms;
+    l_timerfd->callback         = a_callback;
+    l_timerfd->callback_arg     = a_callback_arg;
+    l_timerfd->events_socket    = l_events_socket;
+    
+#if defined DAP_OS_LINUX
     struct itimerspec l_ts;
     int l_tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
     if(l_tfd == -1) {
@@ -134,7 +152,14 @@ dap_timerfd_t* dap_timerfd_create(uint64_t a_timeout_ms, dap_timerfd_callback_t
         DAP_DELETE(l_timerfd);
         return NULL;
     }
-#elif defined DAP_OS_WINDOWS
+    l_events_socket->socket = l_tfd;
+#elif defined (DAP_OS_BSD)
+    l_events_socket->kqueue_base_flags = EV_ADD | EV_ONESHOT | EV_DISPATCH;
+    l_events_socket->kqueue_base_filter = EVFILT_TIMER;
+    l_events_socket->kqueue_base_fflags = NOTE_MSECONDS;
+    l_events_socket->kqueue_data =(int64_t) a_timeout_ms;
+    l_events_socket->socket = rand();
+#elif defined (DAP_OS_WINDOWS)
     HANDLE l_th = CreateWaitableTimer(NULL, true, NULL);
     if (!l_th) {
         log_it(L_CRITICAL, "Waitable timer not created, error %d", GetLastError());
@@ -174,24 +199,12 @@ dap_timerfd_t* dap_timerfd_create(uint64_t a_timeout_ms, dap_timerfd_callback_t
         DAP_DELETE(l_timerfd);
         return NULL;
     }
+    l_events_socket->socket = l_tfd;
 #endif
-
-    // create events_socket for timer file descriptor
-    dap_events_socket_callbacks_t l_s_callbacks;
-    memset(&l_s_callbacks,0,sizeof (l_s_callbacks));
-    l_s_callbacks.timer_callback = s_es_callback_timer;
-
-    dap_events_socket_t * l_events_socket = dap_events_socket_wrap_no_add(dap_events_get_default(), l_tfd, &l_s_callbacks);
-    l_events_socket->type = DESCRIPTOR_TYPE_TIMER;
-    // pass l_timerfd to events_socket
-    l_events_socket->_inheritor = l_timerfd;
-
-    // fill out dap_timerfd_t structure
-    l_timerfd->timeout_ms       = a_timeout_ms;
+    
+#if defined (DAP_OS_LINUX) || defined (DAP_OS_WINDOWS)    
     l_timerfd->tfd              = l_tfd;
-    l_timerfd->events_socket    = l_events_socket;
-    l_timerfd->callback         = a_callback;
-    l_timerfd->callback_arg     = a_callback_arg;
+#endif
 #ifdef DAP_OS_WINDOWS
     l_timerfd->th               = l_th;
 #endif
@@ -208,7 +221,7 @@ static void s_es_callback_timer(struct dap_events_socket *a_event_sock)
     // run user's callback
     if(l_timerfd->callback && l_timerfd->callback(l_timerfd->callback_arg)) {
         //printf("\nread() returned %d, %d\n", l_ptiu64, l_read_ret);
-#if defined DAP_OS_UNIX
+#if defined DAP_OS_LINUX
         struct itimerspec l_ts;
         // repeat never
         l_ts.it_interval.tv_sec = 0;
@@ -219,20 +232,26 @@ static void s_es_callback_timer(struct dap_events_socket *a_event_sock)
         if(timerfd_settime(l_timerfd->tfd, 0, &l_ts, NULL) < 0) {
             log_it(L_WARNING, "callback_timerfd_read() failed: timerfd_settime() errno=%d\n", errno);
         }
-#elif defined DAP_OS_WINDOWS
+#elif defined (DAP_OS_BSD)
+        // Re-arm the one-shot timer under the esocket's own ident; a constant 0 would make all timers on one kqueue share a single event
+        EV_SET(&a_event_sock->kqueue_event, a_event_sock->socket, a_event_sock->kqueue_base_filter, a_event_sock->kqueue_base_flags,a_event_sock->kqueue_base_fflags,a_event_sock->kqueue_data,a_event_sock);
+        kevent(a_event_sock->worker->kqueue_fd,&a_event_sock->kqueue_event,1,NULL,0,NULL);
+#elif defined (DAP_OS_WINDOWS)
         LARGE_INTEGER l_due_time;
         l_due_time.QuadPart = (long long)l_timerfd->timeout_ms * _MSEC;
         if (!SetWaitableTimer(l_timerfd->th, &l_due_time, 0, TimerAPCb, l_timerfd, false)) {
             log_it(L_CRITICAL, "Waitable timer not reset, error %d", GetLastError());
             CloseHandle(l_timerfd->th);
         }
+#else
+#error "No timer callback realization for your platform"        
 #endif
         dap_events_socket_set_readable_unsafe(a_event_sock, true);
     } else {
-#ifndef DAP_OS_WINDOWS
+#if defined(DAP_OS_LINUX)
         close(l_timerfd->tfd);
-#else
-        closesocket(l_timerfd->tfd);
+#elif defined(DAP_OS_WINDOWS)
+    	closesocket(l_timerfd->tfd);
         CloseHandle(l_timerfd->th);
 #endif
         l_timerfd->events_socket->flags |= DAP_SOCK_SIGNAL_CLOSE;
diff --git a/dap-sdk/net/core/dap_worker.c b/dap-sdk/net/core/dap_worker.c
index 81e769d424d3468562e99c72d82e4703d6cb2691..298e86923143140a612ba143c5ddb7e070496670 100644
--- a/dap-sdk/net/core/dap_worker.c
+++ b/dap-sdk/net/core/dap_worker.c
@@ -101,6 +101,7 @@ void *dap_worker_thread(void *arg)
     uint32_t l_tn = l_worker->id;
 
     dap_cpu_assign_thread_on(l_worker->id);
+    pthread_setspecific(l_worker->events->pth_key_worker, l_worker);
     struct sched_param l_shed_params;
     l_shed_params.sched_priority = 0;
 #ifdef DAP_OS_WINDOWS
@@ -113,6 +114,18 @@ void *dap_worker_thread(void *arg)
 #ifdef DAP_EVENTS_CAPS_EPOLL
     struct epoll_event l_epoll_events[ DAP_EVENTS_SOCKET_MAX]= {{0}};
     log_it(L_INFO, "Worker #%d started with epoll fd %d and assigned to dedicated CPU unit", l_worker->id, l_worker->epoll_fd);
+#elif defined(DAP_EVENTS_CAPS_KQUEUE)
+    l_worker->kqueue_fd = kqueue();
+    if (l_worker->kqueue_fd == -1 ){
+	int l_errno = errno; // capture errno before any later call can overwrite it
+	char l_errbuf[255];
+	strerror_r(l_errno,l_errbuf,sizeof(l_errbuf));
+	log_it (L_CRITICAL,"Can't create kqueue():\"%s\" code %d",l_errbuf,l_errno);
+        pthread_cond_broadcast(&l_worker->started_cond); // signal started_cond even on failure, then leave the thread
+	return NULL;
+    }
+    l_worker->kqueue_events_count_max = DAP_EVENTS_SOCKET_MAX;
+    l_worker->kqueue_events = DAP_NEW_Z_SIZE(struct kevent, l_worker->kqueue_events_count_max *sizeof(struct kevent));
 #elif defined(DAP_EVENTS_CAPS_POLL)
     l_worker->poll_count_max = DAP_EVENTS_SOCKET_MAX;
     l_worker->poll = DAP_NEW_Z_SIZE(struct pollfd,l_worker->poll_count_max*sizeof (struct pollfd));
@@ -136,17 +149,20 @@ void *dap_worker_thread(void *arg)
     
     l_worker->timer_check_activity = dap_timerfd_start_on_worker( l_worker, s_connection_timeout * 1000 / 2,
                                                                   s_socket_all_check_activity, l_worker);
-
-    pthread_setspecific(l_worker->events->pth_key_worker, l_worker);
     pthread_cond_broadcast(&l_worker->started_cond);
     bool s_loop_is_active = true;
     while(s_loop_is_active) {
+	int l_selected_sockets;
+	size_t l_sockets_max;
 #ifdef DAP_EVENTS_CAPS_EPOLL
-        int l_selected_sockets = epoll_wait(l_worker->epoll_fd, l_epoll_events, DAP_EVENTS_SOCKET_MAX, -1);
-        size_t l_sockets_max = l_selected_sockets;
+        l_selected_sockets = epoll_wait(l_worker->epoll_fd, l_epoll_events, DAP_EVENTS_SOCKET_MAX, -1);
+        l_sockets_max = l_selected_sockets;
 #elif defined(DAP_EVENTS_CAPS_POLL)
-        int l_selected_sockets = poll(l_worker->poll, l_worker->poll_count, -1);
-        size_t l_sockets_max = l_worker->poll_count;
+        l_selected_sockets = poll(l_worker->poll, l_worker->poll_count, -1);
+        l_sockets_max = l_worker->poll_count;
+#elif defined(DAP_EVENTS_CAPS_KQUEUE)
+        l_selected_sockets = kevent(l_worker->kqueue_fd,NULL,0,l_worker->kqueue_events,l_worker->kqueue_events_count_max,NULL);
+        l_sockets_max = l_selected_sockets; // only the first l_selected_sockets entries of kqueue_events were filled by this call
 #else
 #error "Unimplemented poll wait analog for this platform"
 #endif
@@ -171,32 +187,41 @@ void *dap_worker_thread(void *arg)
             bool l_flag_hup, l_flag_rdhup, l_flag_read, l_flag_write, l_flag_error, l_flag_nval, l_flag_msg, l_flag_pri;
 #ifdef DAP_EVENTS_CAPS_EPOLL
             l_cur = (dap_events_socket_t *) l_epoll_events[n].data.ptr;
-            uint32_t l_cur_events = l_epoll_events[n].events;
-            l_flag_hup      = l_cur_events & EPOLLHUP;
-            l_flag_rdhup    = l_cur_events & EPOLLRDHUP;
-            l_flag_write    = l_cur_events & EPOLLOUT;
-            l_flag_read     = l_cur_events & EPOLLIN;
-            l_flag_error    = l_cur_events & EPOLLERR;
-            l_flag_pri      = l_cur_events & EPOLLPRI;
+            uint32_t l_cur_flags = l_epoll_events[n].events;
+            l_flag_hup      = l_cur_flags & EPOLLHUP;
+            l_flag_rdhup    = l_cur_flags & EPOLLRDHUP;
+            l_flag_write    = l_cur_flags & EPOLLOUT;
+            l_flag_read     = l_cur_flags & EPOLLIN;
+            l_flag_error    = l_cur_flags & EPOLLERR;
+            l_flag_pri      = l_cur_flags & EPOLLPRI;
             l_flag_nval     = false;
 #elif defined ( DAP_EVENTS_CAPS_POLL)
-            short l_cur_events =l_worker->poll[n].revents;
+            short l_cur_flags =l_worker->poll[n].revents;
 
             if (l_worker->poll[n].fd == -1) // If it was deleted on previous iterations
                 continue;
 
-            if (!l_cur_events) // No events for this socket
+            if (!l_cur_flags) // No events for this socket
                 continue;
-            l_flag_hup =  l_cur_events& POLLHUP;
-            l_flag_rdhup = l_cur_events & POLLRDHUP;
-            l_flag_write = (l_cur_events & POLLOUT) || (l_cur_events &POLLRDNORM)|| (l_cur_events &POLLRDBAND ) ;
-            l_flag_read = l_cur_events & POLLIN || (l_cur_events &POLLWRNORM)|| (l_cur_events &POLLWRBAND );
-            l_flag_error = l_cur_events & POLLERR;
-            l_flag_nval = l_cur_events & POLLNVAL;
-            l_flag_pri = l_cur_events & POLLPRI;
-            l_flag_msg = l_cur_events & POLLMSG;
+            l_flag_hup =  l_cur_flags& POLLHUP;
+            l_flag_rdhup = l_cur_flags & POLLRDHUP;
+            l_flag_write = (l_cur_flags & POLLOUT) || (l_cur_flags &POLLRDNORM)|| (l_cur_flags &POLLRDBAND ) ;
+            l_flag_read = l_cur_flags & POLLIN || (l_cur_flags &POLLWRNORM)|| (l_cur_flags &POLLWRBAND );
+            l_flag_error = l_cur_flags & POLLERR;
+            l_flag_nval = l_cur_flags & POLLNVAL;
+            l_flag_pri = l_cur_flags & POLLPRI;
+            l_flag_msg = l_cur_flags & POLLMSG;
             l_cur = l_worker->poll_esocket[n];
             //log_it(L_DEBUG, "flags: returned events 0x%0X requested events 0x%0X",l_worker->poll[n].revents,l_worker->poll[n].events );
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+	    l_cur = (dap_events_socket_t*) l_worker->kqueue_events[n].udata;
+	    u_int l_cur_flags = l_worker->kqueue_events[n].fflags; // kept for the debug log below
+            l_flag_write = l_worker->kqueue_events[n].filter == EVFILT_WRITE; // filter is an identifier, not a bit mask over fflags
+            l_flag_read  = l_worker->kqueue_events[n].filter != EVFILT_WRITE; // NOTE(review): every non-write filter is treated as readable — confirm for timer/user filters
+            if( !l_cur)
+        	continue;
+    	    l_cur->kqueue_event_catched = &l_worker->kqueue_events[n];
+            l_flag_hup = l_flag_rdhup = l_flag_error = l_flag_nval = l_flag_msg = l_flag_pri = false; // not derived in this branch; give them defined values
 #else
 #error "Unimplemented fetch esocket after poll"
 #endif
@@ -206,7 +231,7 @@ void *dap_worker_thread(void *arg)
             }
             if(s_debug_reactor) {
                 log_it(L_DEBUG, "Worker #%u esocket %p type %d fd=%d flags=0x%0X (%s:%s:%s:%s:%s:%s:%s:%s)", l_worker->id, l_cur, l_cur->type, l_cur->socket,
-                    l_cur_events, l_flag_read?"read":"", l_flag_write?"write":"", l_flag_error?"error":"",
+                    l_cur_flags, l_flag_read?"read":"", l_flag_write?"write":"", l_flag_error?"error":"",
                     l_flag_hup?"hup":"", l_flag_rdhup?"rdhup":"", l_flag_msg?"msg":"", l_flag_nval?"nval":"", l_flag_pri?"pri":"");
             }
 
@@ -546,7 +571,6 @@ void *dap_worker_thread(void *arg)
                         case DESCRIPTOR_TYPE_QUEUE:
                              if (l_cur->flags & DAP_SOCK_QUEUE_PTR && l_cur->buf_out_size>= sizeof (void*)){
 #if defined(DAP_EVENTS_CAPS_QUEUE_PIPE2)
-
                                 l_bytes_sent = write(l_cur->socket, l_cur->buf_out, sizeof (void *) ); // We send pointer by pointer
 #elif defined (DAP_EVENTS_CAPS_QUEUE_POSIX)
                                 l_bytes_sent = mq_send(a_es->mqd, (const char *)&a_arg,sizeof (a_arg),0);
@@ -590,6 +614,19 @@ void *dap_worker_thread(void *arg)
                                 l_errno = errno;
                                 if (l_bytes_sent == -1 && l_errno == EINVAL) // To make compatible with other
                                     l_errno = EAGAIN;                        // non-blocking sockets
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)                                    
+				struct kevent* l_event=&l_cur->kqueue_event;
+				void * l_ptr;
+				memcpy(&l_ptr,l_cur->buf_out,sizeof(l_ptr) ); // copy the pointer value stored at the head of buf_out into l_ptr
+				EV_SET(l_event,(uintptr_t) l_ptr, l_cur->kqueue_base_filter,l_cur->kqueue_base_flags, l_cur->kqueue_base_fflags,l_cur->kqueue_data, l_cur);
+				int l_n = kevent(l_worker->kqueue_fd,l_event,1,NULL,0,NULL);
+				if (l_n != -1) // with an empty eventlist kevent() returns 0 on success and -1 on error
+				    l_bytes_sent = sizeof(l_ptr);
+				else{
+				    l_errno = errno; l_bytes_sent = -1; // report failure as -1, matching the sibling branch's l_bytes_sent == -1 check
+				    log_it(L_WARNING,"queue ptr send error: kevent %p errno: %d", l_ptr, l_errno);
+				}
+                                    
 #else
 #error "Not implemented dap_events_socket_queue_ptr_send() for this platform"
 #endif
@@ -960,6 +997,18 @@ int dap_worker_add_events_socket_unsafe( dap_events_socket_t * a_esocket, dap_wo
     a_worker->poll_esocket[a_worker->poll_count] = a_esocket;
     a_worker->poll_count++;
     return 0;
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+	u_short l_flags = a_esocket->kqueue_base_flags;
+	u_int   l_fflags = a_esocket->kqueue_base_fflags;
+	short l_filter = a_esocket->kqueue_base_filter;
+        if(a_esocket->flags & DAP_SOCK_READY_TO_READ )
+            l_fflags |= NOTE_READ; // NOTE(review): NOTE_READ/NOTE_WRITE look like EVFILT_VNODE fflags — confirm they apply to l_filter
+        if(a_esocket->flags & DAP_SOCK_READY_TO_WRITE )
+            l_fflags |= NOTE_WRITE;
+            
+        EV_SET(&a_esocket->kqueue_event , a_esocket->socket, l_filter, EV_ADD| l_flags | EV_CLEAR, l_fflags,0, a_esocket);
+        return kevent ( a_worker->kqueue_fd,&a_esocket->kqueue_event,1,NULL,0,NULL) != -1 ? 0 : -1 ; // empty eventlist: success returns 0, failure -1
+    
 #else
 #error "Unimplemented new esocket on worker callback for current platform"
 #endif
diff --git a/dap-sdk/net/core/include/dap_events_socket.h b/dap-sdk/net/core/include/dap_events_socket.h
index 5f1c83347f85db35751f0e567d2e0be1ad5b79a9..0732a3f7952086dceb937c68d205af11540a5081 100644
--- a/dap-sdk/net/core/include/dap_events_socket.h
+++ b/dap-sdk/net/core/include/dap_events_socket.h
@@ -55,10 +55,17 @@ typedef int SOCKET;
     #include <netinet/in.h>
     #include <sys/eventfd.h>
     #include <mqueue.h>
-#elif defined (DAP_OS_UNIX)
+#elif defined (DAP_OS_BSD)
     #define DAP_EVENTS_CAPS_KQUEUE
     #define DAP_EVENTS_CAPS_PIPE_POSIX
     #define DAP_EVENTS_CAPS_EVENT_KEVENT
+    #define DAP_EVENTS_CAPS_QUEUE_KEVENT
+    #include <netinet/in.h>
+    #include <sys/event.h>
+#elif defined (DAP_OS_UNIX)
+    #define DAP_EVENTS_CAPS_POLL
+    #define DAP_EVENTS_CAPS_PIPE_POSIX
+    #define DAP_EVENTS_CAPS_EVENT_PIPE
     #define DAP_EVENTS_CAPS_QUEUE_SOCKETPAIR
     #include <netinet/in.h>
 #elif defined (DAP_OS_WINDOWS)
@@ -161,7 +168,7 @@ typedef struct dap_events_socket {
         mqd_t mqd;
     };
     uint32_t mqd_id;
-#elif defined DAP_EVENTS_CAPS_MSMQ
+#elif defined(DAP_EVENTS_CAPS_MSMQ)
     };
     QUEUEHANDLE mqh, mqh_recv;
     u_int mq_num;
@@ -171,9 +178,8 @@ typedef struct dap_events_socket {
     };
 #endif
 
-#if defined DAP_EVENTS_CAPS_PIPE_POSIX
     int fd2;
-#endif
+
     dap_events_desc_type_t type;
     uint128_t uuid; // Unique UID
     // Related sockets (be careful - possible problems, delete them before )
@@ -232,6 +238,14 @@ typedef struct dap_events_socket {
 #elif defined (DAP_EVENTS_CAPS_POLL)
     short poll_base_flags;
     uint32_t poll_index; // index in poll array on worker
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+    struct kevent kqueue_event;
+    struct kevent *kqueue_event_catched;
+    
+    short kqueue_base_filter;
+    unsigned short kqueue_base_flags;
+    unsigned int kqueue_base_fflags;
+    int64_t kqueue_data;
 #endif
 
     dap_events_socket_callbacks_t callbacks;
diff --git a/dap-sdk/net/core/include/dap_proc_thread.h b/dap-sdk/net/core/include/dap_proc_thread.h
index 8f307aff592748193aad5e24a0c057ebd163302a..caf4ea2e5a155c64996c185025c7432a47a61f46 100644
--- a/dap-sdk/net/core/include/dap_proc_thread.h
+++ b/dap-sdk/net/core/include/dap_proc_thread.h
@@ -52,6 +52,10 @@ typedef struct dap_proc_thread{
     dap_events_socket_t ** esockets;
     size_t poll_count;
     size_t poll_count_max;
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+    int kqueue_fd;
+    struct kevent * kqueue_events;
+    int kqueue_events_count_max; 
 #else
 #error "No poll for proc thread for your platform"
 #endif
@@ -76,4 +80,4 @@ typedef void (*dap_proc_worker_callback_t)(dap_worker_t *,void *);
 
 void dap_proc_thread_worker_exec_callback(dap_proc_thread_t * a_thread, size_t a_worker_id, dap_proc_worker_callback_t a_callback, void * a_arg);
 
-dap_proc_thread_t * dap_proc_thread_assign_esocket_unsafe(dap_proc_thread_t * a_thread, dap_events_socket_t * a_esocket);
+int dap_proc_thread_assign_esocket_unsafe(dap_proc_thread_t * a_thread, dap_events_socket_t * a_esocket);
diff --git a/dap-sdk/net/core/include/dap_server.h b/dap-sdk/net/core/include/dap_server.h
index a8d065f5ad467c398dde8006c1e64db605a66895..94c7d503bc0f074c9b4b47c217138f55db674c2b 100644
--- a/dap-sdk/net/core/include/dap_server.h
+++ b/dap-sdk/net/core/include/dap_server.h
@@ -43,6 +43,8 @@
 #define MSG_DONTWAIT 0
 #define MSG_NOSIGNAL 0
 #include "winsock.h"
+#elif defined(DAP_OS_BSD)
+
 #else
 #error "No poll headers for your platform"
 #endif
diff --git a/dap-sdk/net/core/include/dap_timerfd.h b/dap-sdk/net/core/include/dap_timerfd.h
index d6c7f003f710e00eb2fc273c73c4240109a0273e..700e5a2bc3c8f1aaa1d0b0165d4518d2a018a7cb 100644
--- a/dap-sdk/net/core/include/dap_timerfd.h
+++ b/dap-sdk/net/core/include/dap_timerfd.h
@@ -27,9 +27,13 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-#if defined DAP_OS_UNIX
+
+
+#if defined DAP_OS_LINUX
 #include <sys/time.h>
 #include <sys/timerfd.h>
+#elif defined DAP_OS_BSD
+#include <sys/event.h>
 #elif defined DAP_OS_WINDOWS
 #define _MSEC -10000
 #endif
@@ -46,7 +50,7 @@ typedef struct dap_timerfd {
 #ifdef DAP_OS_WINDOWS
 	SOCKET tfd;
     u_short port;
-#else
+#elif defined(DAP_OS_LINUX)
     int tfd; //timer file descriptor
 #endif
     dap_events_socket_t *events_socket;
diff --git a/dap-sdk/net/core/include/dap_worker.h b/dap-sdk/net/core/include/dap_worker.h
index a88a9e459913207d89365ceec55f25d8ad8834bc..df6591861d9df985c64b6e13567e22f1c3f1c7c8 100644
--- a/dap-sdk/net/core/include/dap_worker.h
+++ b/dap-sdk/net/core/include/dap_worker.h
@@ -65,6 +65,7 @@ typedef struct dap_worker
 #if defined DAP_EVENTS_CAPS_MSMQ
     HANDLE msmq_events[MAXIMUM_WAIT_OBJECTS];
 #endif
+
 #if defined DAP_EVENTS_CAPS_EPOLL
     EPOLL_HANDLE epoll_fd;
 #elif defined ( DAP_EVENTS_CAPS_POLL)
@@ -74,6 +75,12 @@ typedef struct dap_worker
     size_t poll_count;
     size_t poll_count_max;
     bool poll_compress; // Some of fd's became NULL so arrays need to be reassigned
+#elif defined (DAP_EVENTS_CAPS_KQUEUE)
+    int kqueue_fd;
+    struct kevent * kqueue_events;
+    int kqueue_events_count_max;
+#else
+#error "Not defined worker for your platform"
 #endif
     pthread_cond_t started_cond;
     pthread_mutex_t started_mutex;
diff --git a/dap-sdk/net/server/notify_server/src/dap_notify_srv.c b/dap-sdk/net/server/notify_server/src/dap_notify_srv.c
index f7c3718ed51c634e9435d9bf2396a94531e1d9b1..ca8f8e226d6e8975f3dd8b468204a6b4cdf92c58 100644
--- a/dap-sdk/net/server/notify_server/src/dap_notify_srv.c
+++ b/dap-sdk/net/server/notify_server/src/dap_notify_srv.c
@@ -81,7 +81,6 @@ int dap_notify_server_init()
     if (!s_notify_server)
         return -1;
 
-
     s_notify_server_queue = dap_events_socket_create_type_queue_ptr_mt(dap_events_worker_get_auto(),s_notify_server_callback_queue);
     uint32_t l_workers_count = dap_events_worker_get_count();
     s_notify_server_queue_inter = DAP_NEW_Z_SIZE(dap_events_socket_t*,sizeof (dap_events_socket_t*)*l_workers_count );
@@ -102,6 +101,15 @@ void dap_notify_server_deinit()
 
 }
 
+/**
+ * @brief dap_notify_server_create_inter
+ * @return
+ */
+struct dap_events_socket * dap_notify_server_create_inter()
+{
+    return NULL;
+}
+
 /**
  * @brief dap_notify_server_send_fmt_inter
  * @param a_input
diff --git a/dap-sdk/net/stream/session/CMakeLists.txt b/dap-sdk/net/stream/session/CMakeLists.txt
index ed0b1aa1e34038551185ae862b0c0dbeefd5f19f..2e294300da2e7b203c1403803eef390997cef1e8 100644
--- a/dap-sdk/net/stream/session/CMakeLists.txt
+++ b/dap-sdk/net/stream/session/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_session)
   
 file(GLOB SESSION_SRCS *.c)
diff --git a/dap-sdk/net/stream/stream/CMakeLists.txt b/dap-sdk/net/stream/stream/CMakeLists.txt
index b7c54daefb0b52860331261b459e0dd46a4ab87c..264d313f849dffb88781867be56710f669e8ba39 100755
--- a/dap-sdk/net/stream/stream/CMakeLists.txt
+++ b/dap-sdk/net/stream/stream/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_stream)
   
 file(GLOB STREAM_SRCS *.c)
diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt
index 78ae7458e46f0e6299760dea12620a7d16d03a5b..6023ff47176c12cde5637787a1b5c42cc5302b94 100644
--- a/modules/CMakeLists.txt
+++ b/modules/CMakeLists.txt
@@ -73,8 +73,10 @@ if (CELLFRAME_MODULES MATCHES "srv-datum")
 endif()
 
 # Service VPN
-if (CELLFRAME_MODULES MATCHES "srv-vpn")
-    add_subdirectory(service/vpn)
+if(LINUX)
+    if (CELLFRAME_MODULES MATCHES "srv-vpn")
+	add_subdirectory(service/vpn)
+    endif()
 endif()
 
 # Service eXchange
diff --git a/modules/app-cli/CMakeLists.txt b/modules/app-cli/CMakeLists.txt
index d03500dfa560b84156aa05321f6f3bb5910a78a2..1e8f2c8be518be56301457cab24adfb726fea8e7 100644
--- a/modules/app-cli/CMakeLists.txt
+++ b/modules/app-cli/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_app_cli)
 
 file(GLOB DAP_APP_CLI_SRCS *.c)
diff --git a/modules/chain/CMakeLists.txt b/modules/chain/CMakeLists.txt
index 4b078e198a4d7ccbaeff20b032c74f96cb81542a..2c8a19c4eccc17aa4e1b6d4ec75d6af4ca321061 100644
--- a/modules/chain/CMakeLists.txt
+++ b/modules/chain/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain)
   
 file(GLOB DAP_CHAIN_SRCS *.c)
diff --git a/modules/chain/dap_chain.c b/modules/chain/dap_chain.c
index b6b5943cad3e12664e182183cd356d6cf3ae1ddb..d117a5ef9b04728870e713df1a5336cf9aac2de4 100644
--- a/modules/chain/dap_chain.c
+++ b/modules/chain/dap_chain.c
@@ -24,7 +24,7 @@
 #include <dap_chain_ledger.h>
 #include <sys/types.h>
 #include <dirent.h>
-#ifdef DAP_OS_UNIX
+#ifdef DAP_OS_LINUX
 #include <stdc-predef.h>
 #endif
 #include <unistd.h>
diff --git a/modules/common/CMakeLists.txt b/modules/common/CMakeLists.txt
index ca8333ab03e16b5ae470fe7bcf8680960d651d28..1a4670336869329ba4f7dff0936354b4ad674119 100644
--- a/modules/common/CMakeLists.txt
+++ b/modules/common/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_common)
   
 file(GLOB DAP_CHAIN_COMMON_SRCS  *.c)
diff --git a/modules/consensus/block-poa/CMakeLists.txt b/modules/consensus/block-poa/CMakeLists.txt
index 5fa1723d36631a5eee78da397da503251f712eaa..d54c77b2d6de9a7bded5e2f4413b1791d87fb394 100644
--- a/modules/consensus/block-poa/CMakeLists.txt
+++ b/modules/consensus/block-poa/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_cs_block_poa)
 
 file(GLOB DAP_CHAIN_BLOCK_CS_POA_SRCS *.c)
diff --git a/modules/consensus/block-pow/CMakeLists.txt b/modules/consensus/block-pow/CMakeLists.txt
index f5a3d6635e6357d409602af0290dc56f4bc1b8ae..334d6eb6a932e22561749a8c14863ac1f7ca81d2 100644
--- a/modules/consensus/block-pow/CMakeLists.txt
+++ b/modules/consensus/block-pow/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_cs_block_pow)
   
 file(GLOB DAP_CHAIN_BLOCK_CS_POW_SRCS *.c)
diff --git a/modules/consensus/dag-poa/CMakeLists.txt b/modules/consensus/dag-poa/CMakeLists.txt
index 31fc6df103af2c9463cf3af658efe03dc6c4134f..12a35ad6a4d8dbf26282473857fc694eb38d5ce6 100644
--- a/modules/consensus/dag-poa/CMakeLists.txt
+++ b/modules/consensus/dag-poa/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_cs_dag_poa)
   
 file(GLOB DAP_CHAIN_DAG_CS_POA_SRCS *.c)
diff --git a/modules/consensus/dag-pos/CMakeLists.txt b/modules/consensus/dag-pos/CMakeLists.txt
index c540c2c5615a9fe31d05e7ff5d4d7c1ff185cc6a..8b8dff9976790e5c248009bed61dd6736fe6b216 100644
--- a/modules/consensus/dag-pos/CMakeLists.txt
+++ b/modules/consensus/dag-pos/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_cs_dag_pos)
   
 file(GLOB DAP_CHAIN_CS_DAG_POS_SRCS *.c)
diff --git a/modules/consensus/none/CMakeLists.txt b/modules/consensus/none/CMakeLists.txt
index 5ca183966085b044e7fc63c77e5d689051f953f8..8072379783889069d785c766ac0b10d786fbe24c 100644
--- a/modules/consensus/none/CMakeLists.txt
+++ b/modules/consensus/none/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_cs_none)
   
 file(GLOB DAP_CHAIN_CS_NO_CONSENSUS_SRC *.c)
diff --git a/modules/global-db/dap_chain_global_db_driver.c b/modules/global-db/dap_chain_global_db_driver.c
index 3edae6e40c07234ab5b1285a275844f546405864..b596f92b176ba99e362d115bb775dcecbc843d1b 100644
--- a/modules/global-db/dap_chain_global_db_driver.c
+++ b/modules/global-db/dap_chain_global_db_driver.c
@@ -547,7 +547,7 @@ int dap_chain_global_db_driver_appy(pdap_store_obj_t a_store_obj, size_t a_store
                 l_ret = 1;
             }
             if(l_ret_tmp < 0) {
-                log_it(L_ERROR, "Can't write item %s/%s\n", l_store_obj_cur->group, l_store_obj_cur->key);
+                log_it(L_ERROR, "Can't write item %s/%s (code %d)\n", l_store_obj_cur->group, l_store_obj_cur->key, l_ret_tmp);
                 l_ret -= 1;
             }
         }
diff --git a/modules/global-db/dap_chain_global_db_driver_cdb.c b/modules/global-db/dap_chain_global_db_driver_cdb.c
index 5b54ead3ac2c4e825124da39d9e5027865f23eb7..d22d8759a303932481e88db1010ff67f36f8b02c 100644
--- a/modules/global-db/dap_chain_global_db_driver_cdb.c
+++ b/modules/global-db/dap_chain_global_db_driver_cdb.c
@@ -197,12 +197,18 @@ int dap_db_driver_cdb_init(const char *a_cdb_path, dap_db_driver_callbacks_t *a_
 #ifdef _DIRENT_HAVE_D_TYPE
         if (d->d_type != DT_DIR)
             continue;
-#else
+#elif defined(DAP_OS_LINUX)
         struct _stat buf;
         int res = _stat(d->d_name, &buf);
         if (!S_ISDIR(buf.st_mode) || !res) {
             continue;
         }
+#elif defined (DAP_OS_BSD)        
+        struct stat buf;
+        int res = stat(d->d_name, &buf); // NOTE(review): d_name is passed without a directory prefix — confirm the lookup path
+        if (res != 0 || !S_ISDIR(buf.st_mode)) { // stat() returns 0 on success; skip on failure or when the entry is not a directory
+            continue;
+        }
 #endif
         if (!dap_strcmp(d->d_name, ".") || !dap_strcmp(d->d_name, "..")) {
             continue;
@@ -268,26 +274,10 @@ int dap_db_driver_cdb_flush(void) {
     cdb_instance *cur_cdb, *tmp;
     pthread_rwlock_rdlock(&cdb_rwlock);
     HASH_ITER(hh, s_cdb, cur_cdb, tmp) {
-        cdb_close(cur_cdb->cdb);
-        char l_cdb_path[strlen(s_cdb_path) + strlen(cur_cdb->local_group) + 2];
-        memset(l_cdb_path, '\0', sizeof(l_cdb_path));
-        dap_snprintf(l_cdb_path, sizeof(l_cdb_path), "%s/%s", s_cdb_path, cur_cdb->local_group);
-// Re-application of options might be required
-        cdb_options l_opts = { 1000000, 128, 1024 };
-        if (cdb_option(cur_cdb->cdb, l_opts.hsize, l_opts.pcacheMB, l_opts.rcacheMB) != CDB_SUCCESS) {
-            log_it(L_ERROR, "Options are inacceptable: \"%s\"", cdb_errmsg(cdb_errno(cur_cdb->cdb)));
-            ret = -1;
-            goto RET;
-        }
-        if(cdb_open(cur_cdb->cdb, l_cdb_path, CDB_CREAT | CDB_PAGEWARMUP) != CDB_SUCCESS) {
-            log_it(L_ERROR, "An error occured while opening CDB: \"%s\"", cdb_errmsg(cdb_errno(cur_cdb->cdb)));
-            ret = -2;
-            goto RET;
-        }
+        cdb_flushalldpage(cur_cdb->cdb);
     }
-    log_it(L_DEBUG, "All data dumped");
-RET:
     pthread_rwlock_unlock(&cdb_rwlock);
+    log_it(L_DEBUG, "All data dumped");
     return ret;
 }
 
diff --git a/modules/global-db/include/dap_chain_global_db_driver_cdb.h b/modules/global-db/include/dap_chain_global_db_driver_cdb.h
index f3e036f19f91e333d5a5797632b788523db3ee07..b5011f6bb97ff334629f8b6aa5115b25999d7a4c 100644
--- a/modules/global-db/include/dap_chain_global_db_driver_cdb.h
+++ b/modules/global-db/include/dap_chain_global_db_driver_cdb.h
@@ -22,7 +22,8 @@
  along with any DAP based project.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "cuttdb.h"
+//#include "cuttdb.h"
+#include "cdb_core.h"
 #include "dap_chain_global_db_driver.h"
 #define UNUSED(x) (void)(x)
 
diff --git a/modules/mempool/CMakeLists.txt b/modules/mempool/CMakeLists.txt
index 51d0dca27423c67b69a9c3055dee922f1f6f12e8..f2d0ede23294b502d943b706cbcc2273f1242944 100644
--- a/modules/mempool/CMakeLists.txt
+++ b/modules/mempool/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_mempool)
 
 file(GLOB DAP_CHAIN_MEMPOOL_SRC *.c)
diff --git a/modules/mining/CMakeLists.txt b/modules/mining/CMakeLists.txt
index d42442f1fd37ed497deb13689a973741e7546601..3e4e88edd39b4acd5e3154d799278b2d20b83112 100644
--- a/modules/mining/CMakeLists.txt
+++ b/modules/mining/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_crypto)
   
 file(GLOB DAP_CHAIN_CRYPTO_SRCS *.c)
diff --git a/modules/modules_dynamic/cdb/CMakeLists.txt b/modules/modules_dynamic/cdb/CMakeLists.txt
index a53e016eb7276048420e90c215c72ee992acd1da..26191138031bebd9bbaf9c54dc4fee0b0b519df5 100644
--- a/modules/modules_dynamic/cdb/CMakeLists.txt
+++ b/modules/modules_dynamic/cdb/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_modules_dynamic_cdb)
 
 file(GLOB DAP_MODULES_DYNAMIC_CDB_SRCS *.c)
diff --git a/modules/net/CMakeLists.txt b/modules/net/CMakeLists.txt
index 79fe22e7ca3af9e83bedde9928c07f83c8fb533c..d64767b6181ae6aec139d0dc0c65d1f0ed77b055 100644
--- a/modules/net/CMakeLists.txt
+++ b/modules/net/CMakeLists.txt
@@ -16,7 +16,7 @@ set(IPUTILS_INCLUDE_DIRS
      ../../3rdparty/iputils/traceroute/
     )
 
-if(NOT (WIN32))
+if(LINUX)
   file(GLOB IPUTILS_SRCS ../../3rdparty/iputils/*.c ../../3rdparty/iputils/traceroute/*.c)
   file(GLOB IPUTILS_HEADERS ../../3rdparty/iputils/*.h ${IPUTILS_INCLUDE_DIRS}*.h)
 endif()
@@ -41,12 +41,16 @@ if(WIN32)
                             dap_chain_mempool dap_chain_global_db dap_chain_net_srv_stake dap_chain_cs_none)
 endif()
 
-if(UNIX)
+if(LINUX)
     target_link_libraries(${PROJECT_NAME} dap_core dap_crypto dap_client dap_stream_ch_chain dap_stream_ch_chain_net dap_stream_ch_chain_net_srv dap_chain
       dap_chain_wallet dap_chain_net_srv dap_chain_mempool dap_chain_global_db dap_chain_net_srv_stake dap_chain_cs_none
       resolv )
+elseif(BSD)
+    target_link_libraries(${PROJECT_NAME} dap_core dap_crypto dap_client dap_stream_ch_chain dap_stream_ch_chain_net dap_stream_ch_chain_net_srv dap_chain
+      dap_chain_wallet dap_chain_net_srv dap_chain_mempool dap_chain_global_db dap_chain_net_srv_stake dap_chain_cs_none )
 endif()
 
+
 target_include_directories(${PROJECT_NAME} INTERFACE . )
 target_include_directories(${PROJECT_NAME} PUBLIC include)
 target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/uthash/src)
diff --git a/modules/net/dap_chain_net.c b/modules/net/dap_chain_net.c
index f960f04113bc45e2e1a3f70b1eef79f9441ff2a8..20b306afc403cbe58d4dd837ac2e9fcf55e9d579 100644
--- a/modules/net/dap_chain_net.c
+++ b/modules/net/dap_chain_net.c
@@ -1648,7 +1648,12 @@ int s_net_load(const char * a_net_name, uint16_t a_acl_idx)
 
                     l_node_info->hdr.address.uint64 = l_seed_node_addr->uint64;
                     if ( l_node_info->hdr.ext_addr_v4.s_addr ||
-                            l_node_info->hdr.ext_addr_v6.s6_addr32[0] ){
+                    #ifdef DAP_OS_BSD
+                	l_node_info->hdr.ext_addr_v6.__u6_addr.__u6_addr32[0]
+                    #else
+                        l_node_info->hdr.ext_addr_v6.s6_addr32[0] 
+                    #endif
+                            ){
                         int l_ret;
                         if ( (l_ret = dap_chain_node_info_save(l_net, l_node_info)) ==0 ){
                             if (dap_chain_node_alias_register(l_net,l_net_pvt->seed_aliases[i],l_seed_node_addr))
diff --git a/modules/net/dap_chain_node_cli_cmd.c b/modules/net/dap_chain_node_cli_cmd.c
index 66a3ec2f72b2d10878b57ef4f61d3c5841809cd4..41a6d8d0fc2b4a221b81f3ee70d2d74a4d2231ac 100644
--- a/modules/net/dap_chain_node_cli_cmd.c
+++ b/modules/net/dap_chain_node_cli_cmd.c
@@ -1274,7 +1274,7 @@ int com_node(int a_argc, char ** a_argv, void *arg_func, char **a_str_reply)
  */
 int com_traceroute(int argc, char** argv, void *arg_func, char **str_reply)
 {
-#ifndef _WIN32
+#ifdef DAP_OS_LINUX
     const char *addr = NULL;
     int hops = 0, time_usec = 0;
     if(argc > 1)
@@ -1342,8 +1342,10 @@ int com_traceroute(int argc, char** argv, void *arg_func, char **str_reply)
         }
     }
     return res;
+#else
+    dap_chain_node_cli_set_reply_text(str_reply, "Not realized for your platform");
+    return -1;
 #endif
-    return 0;
 }
 
 /**
@@ -1353,7 +1355,7 @@ int com_traceroute(int argc, char** argv, void *arg_func, char **str_reply)
  */
 int com_tracepath(int argc, char** argv, void *arg_func, char **str_reply)
 {
-#ifndef _WIN32
+#ifdef DAP_OS_LINUX
     const char *addr = NULL;
     int hops = 0, time_usec = 0;
     if(argc > 1)
@@ -1416,8 +1418,10 @@ int com_tracepath(int argc, char** argv, void *arg_func, char **str_reply)
         }
     }
     return res;
+#else
+        dap_chain_node_cli_set_reply_text(str_reply, "Not realized for your platform");
+        return -1;
 #endif
-    return 0;
 }
 
 /**
@@ -1427,7 +1431,7 @@ int com_tracepath(int argc, char** argv, void *arg_func, char **str_reply)
  */
 int com_ping(int argc, char** argv, void *arg_func, char **str_reply)
 {
-#ifndef _WIN32
+#ifdef DAP_OS_LINUX
 
     int n = 4;
     if(argc < 2) {
@@ -1480,8 +1484,10 @@ int com_ping(int argc, char** argv, void *arg_func, char **str_reply)
         }
     }
     return res;
+#else
+        dap_chain_node_cli_set_reply_text(str_reply, "Not realized for your platform");
+        return -1;
 #endif
-    return 0;
 }
 
 /**
diff --git a/modules/net/dap_chain_node_dns_client.c b/modules/net/dap_chain_node_dns_client.c
index 29c91805a73bc1782e83f7982878d3ecfcc59495..2ed130ddc37e8712b96d3f3eface9f4dc32a6292 100644
--- a/modules/net/dap_chain_node_dns_client.c
+++ b/modules/net/dap_chain_node_dns_client.c
@@ -67,7 +67,7 @@ static void s_dns_client_esocket_read_callback(dap_events_socket_t * a_esocket,
     size_t l_addr_point = DNS_HEADER_SIZE + strlen(l_dns_client->name) + 2 + 2 * sizeof(uint16_t) + DNS_ANSWER_SIZE - sizeof(uint32_t);
     if (l_recieved < l_addr_point + sizeof(uint32_t)) {
         log_it(L_WARNING, "DNS answer incomplete");
-        l_dns_client->callback_error(a_esocket->worker, l_dns_client->result,l_dns_client->callbacks_arg,EREMOTEIO );
+        l_dns_client->callback_error(a_esocket->worker, l_dns_client->result,l_dns_client->callbacks_arg,EIO );
         l_dns_client->is_callbacks_called = true;
         a_esocket->flags |= DAP_SOCK_SIGNAL_CLOSE;
         a_esocket->buf_in_size = a_esocket->buf_out_size = 0;
@@ -77,7 +77,7 @@ static void s_dns_client_esocket_read_callback(dap_events_socket_t * a_esocket,
     int l_answers_count = ntohs(*(uint16_t *)l_cur);
     if (l_answers_count != 1) {
         log_it(L_WARNING, "Incorrect DNS answer format");
-        l_dns_client->callback_error(a_esocket->worker, l_dns_client->result,l_dns_client->callbacks_arg,EMEDIUMTYPE );
+        l_dns_client->callback_error(a_esocket->worker, l_dns_client->result,l_dns_client->callbacks_arg,EINVAL);
         l_dns_client->is_callbacks_called = true;
         a_esocket->flags |= DAP_SOCK_SIGNAL_CLOSE;
         a_esocket->buf_in_size = a_esocket->buf_out_size = 0;
diff --git a/modules/net/dap_chain_node_ping.c b/modules/net/dap_chain_node_ping.c
index e21b1bc78389b5698eec6d4bb9d3f47d5045c103..353bf48fe476ce73cd43c8d7a77d6cca992037a0 100644
--- a/modules/net/dap_chain_node_ping.c
+++ b/modules/net/dap_chain_node_ping.c
@@ -217,7 +217,7 @@ static void* node_ping_background_proc(void *a_arg)
         if(!str_ip4)
             continue;
         int hops = 0, time_usec = 0;
-#ifndef _WIN32
+#ifdef DAP_OS_LINUX
         int res = traceroute_util(str_ip4, &hops, &time_usec);
 #endif
         DAP_DELETE(host4);
diff --git a/modules/net/srv/CMakeLists.txt b/modules/net/srv/CMakeLists.txt
index 5c90fdf9672bb106e13547100b810680109356a8..54d4d4be1e00cd1770be36a7ec91907aa3e86b9d 100644
--- a/modules/net/srv/CMakeLists.txt
+++ b/modules/net/srv/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_net_srv)
 
 file(GLOB DAP_CHAIN_NET_SRV_SRCS *.c libmaxminddb/*.c)
diff --git a/modules/service/app-db/CMakeLists.txt b/modules/service/app-db/CMakeLists.txt
index 07e2032d562fe8b001b9fcf9d999d9ff09de3cfe..f6a7a7977d843e1bdf10796aa8c237534a916ec5 100644
--- a/modules/service/app-db/CMakeLists.txt
+++ b/modules/service/app-db/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_net_srv_app_db)
   
 file(GLOB DAP_CHAIN_NET_SRV_APP_DB_SRCS *.c)
diff --git a/modules/service/app/CMakeLists.txt b/modules/service/app/CMakeLists.txt
index 621083712d84e1f2a9bbe257dd4a26b4c5ed7b1a..e69756a8d304d41eb5e220462288a86aead5c742 100644
--- a/modules/service/app/CMakeLists.txt
+++ b/modules/service/app/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_net_srv_app)
   
 file(GLOB DAP_CHAIN_NET_SRV_APP_SRCS  *.c)
diff --git a/modules/service/datum/CMakeLists.txt b/modules/service/datum/CMakeLists.txt
index 63a0dcfb2e0869110192fc05fe70642b778f4e6f..dc38274497865096fdd8e21bb576dc99cf373ef0 100644
--- a/modules/service/datum/CMakeLists.txt
+++ b/modules/service/datum/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_net_srv_datum)
   
 file(GLOB DAP_CHAIN_NET_SRV_DATUM_SRCS *.c)
diff --git a/modules/service/mining-pool/CMakeLists.txt b/modules/service/mining-pool/CMakeLists.txt
index 449cc79415cb2064ace36198131a21ce364b9053..6a66f485f3affa09480ab6930c8b3f153c44afd5 100644
--- a/modules/service/mining-pool/CMakeLists.txt
+++ b/modules/service/mining-pool/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_net_srv_datum_pool)
   
 file(GLOB DAP_CHAIN_NET_SRV_DATUM_POOL_SRCS *.c)
diff --git a/modules/service/stake/CMakeLists.txt b/modules/service/stake/CMakeLists.txt
index 6c00ea94b1b832f9ee0023e01e4f901ec06ac06f..e50be120ada65584254d16bd4a764594052e0722 100644
--- a/modules/service/stake/CMakeLists.txt
+++ b/modules/service/stake/CMakeLists.txt
@@ -1,12 +1,12 @@
-cmake_minimum_required(VERSION 2.8)
-project (dap_chain_net_srv_stake)
-  
-file(GLOB DAP_SRV_STAKE_SRCS *.c)
-
-file(GLOB DAP_SRV_STAKE_HEADERS include/*.h)
-
-add_library(${PROJECT_NAME} STATIC ${DAP_SRV_STAKE_SRCS} ${DAP_SRV_STAKE_HEADERS})
-
-target_include_directories(dap_chain_crypto INTERFACE .)
-target_include_directories(${PROJECT_NAME} PUBLIC include)
-target_link_libraries(${PROJECT_NAME} dap_core dap_crypto dap_chain dap_chain_crypto dap_chain_net dap_chain_net_srv)
+cmake_minimum_required(VERSION 3.0)
+project (dap_chain_net_srv_stake)
+
+# Static library built from the *.c files in this directory and the headers in include/.
+file(GLOB DAP_SRV_STAKE_SRCS *.c)
+file(GLOB DAP_SRV_STAKE_HEADERS include/*.h)
+add_library(${PROJECT_NAME} STATIC ${DAP_SRV_STAKE_SRCS} ${DAP_SRV_STAKE_HEADERS})
+
+# Export "." through this library's own interface; do not modify the dap_chain_crypto target from here.
+target_include_directories(${PROJECT_NAME} PUBLIC include INTERFACE .)
+# Explicit PUBLIC keeps the transitive linking that the keyword-less signature provided.
+target_link_libraries(${PROJECT_NAME} PUBLIC dap_core dap_crypto dap_chain dap_chain_crypto dap_chain_net dap_chain_net_srv)
diff --git a/modules/service/vpn/CMakeLists.txt b/modules/service/vpn/CMakeLists.txt
index cc6897553f82995beeec606c34cdcd2473a731dc..9383e4082f41b431e5e78cd81f503898e1d44044 100644
--- a/modules/service/vpn/CMakeLists.txt
+++ b/modules/service/vpn/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_net_srv_vpn)
   
 file(GLOB DAP_CHAIN_NET_SRV_VPN_SRCS *.c)
diff --git a/modules/service/vpn/dap_chain_net_srv_vpn.c b/modules/service/vpn/dap_chain_net_srv_vpn.c
index d4a4c60020ba70895892d342c54b77cbc7c96c28..8cff46d7a926d92b55e09367a72a955d20133b1a 100644
--- a/modules/service/vpn/dap_chain_net_srv_vpn.c
+++ b/modules/service/vpn/dap_chain_net_srv_vpn.c
@@ -24,17 +24,26 @@
     along with any DAP based project.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-#ifdef DAP_OS_UNIX
+#ifdef DAP_OS_LINUX
 #include <netinet/in.h>
 #include <linux/if.h>
 #include <linux/if_tun.h>
 #include <sys/ioctl.h>
+#include <sys/epoll.h>
+#endif
+
+#ifdef DAP_OS_BSD
+#include <netinet/in.h>
+#include <net/if.h>
+#include <net/if_tun.h>
+#include <sys/ioctl.h>
 #endif
 
+
+
 #include <sys/select.h>
 #include <sys/types.h>
 #include <sys/socket.h>
-#include <sys/epoll.h>
 #include <sys/un.h>
 
 #include <netinet/in.h>
diff --git a/modules/service/vpn/include/dap_chain_net_srv_vpn_cdb_server_list.h b/modules/service/vpn/include/dap_chain_net_srv_vpn_cdb_server_list.h
index ce8da913514d543fe562a4385887d21575d83a9c..6978da7a5bee68d591b8b16925e5cf7f05e82498 100644
--- a/modules/service/vpn/include/dap_chain_net_srv_vpn_cdb_server_list.h
+++ b/modules/service/vpn/include/dap_chain_net_srv_vpn_cdb_server_list.h
@@ -37,4 +37,5 @@ int dap_chain_net_srv_vpn_cdb_server_list_init(void);
 void dap_chain_net_srv_vpn_cdb_server_list_deinit(void);
 void dap_chain_net_srv_vpn_cdb_server_list_add_proc(struct dap_http * sh, const char * url);
 
+int dap_chain_net_srv_vpn_cdb_server_list_static_create(dap_chain_net_t *a_net);
 
diff --git a/modules/service/xchange/CMakeLists.txt b/modules/service/xchange/CMakeLists.txt
index 0d2888afb0c413051f9beceded4c57e21a9c1b23..82d036e96bc5d37317cd91cb7ad5d5df4bc53515 100644
--- a/modules/service/xchange/CMakeLists.txt
+++ b/modules/service/xchange/CMakeLists.txt
@@ -1,12 +1,12 @@
-cmake_minimum_required(VERSION 2.8)
-project (dap_chain_net_srv_xchange)
-  
-file(GLOB DAP_SRV_XCHANGE_SRCS *.c)
-
-file(GLOB DAP_SRV_XCHANGE_HEADERS include/*.h)
-
-add_library(${PROJECT_NAME} STATIC ${DAP_SRV_XCHANGE_SRCS} ${DAP_SRV_XCHANGE_HEADERS})
-
-target_include_directories(dap_chain_crypto INTERFACE .)
-target_include_directories(${PROJECT_NAME} PUBLIC include)
-target_link_libraries(${PROJECT_NAME} dap_core dap_crypto dap_chain dap_chain_crypto dap_chain_net dap_chain_net_srv)
+cmake_minimum_required(VERSION 3.0)
+project (dap_chain_net_srv_xchange)
+
+# Static library built from the *.c files in this directory and the headers in include/.
+file(GLOB DAP_SRV_XCHANGE_SRCS *.c)
+file(GLOB DAP_SRV_XCHANGE_HEADERS include/*.h)
+add_library(${PROJECT_NAME} STATIC ${DAP_SRV_XCHANGE_SRCS} ${DAP_SRV_XCHANGE_HEADERS})
+
+# Export "." through this library's own interface; do not modify the dap_chain_crypto target from here.
+target_include_directories(${PROJECT_NAME} PUBLIC include INTERFACE .)
+# Explicit PUBLIC keeps the transitive linking that the keyword-less signature provided.
+target_link_libraries(${PROJECT_NAME} PUBLIC dap_core dap_crypto dap_chain dap_chain_crypto dap_chain_net dap_chain_net_srv)
diff --git a/modules/type/blocks/CMakeLists.txt b/modules/type/blocks/CMakeLists.txt
index 38bae326446f2e224c1259f28dbcfffcadae35ba..44f0787add4f670f935894cffad47fbe58c4a33b 100644
--- a/modules/type/blocks/CMakeLists.txt
+++ b/modules/type/blocks/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_cs_blocks)
   
 file(GLOB DAP_CHAIN_BLOCK_SRCS *.c)
diff --git a/modules/type/dag/CMakeLists.txt b/modules/type/dag/CMakeLists.txt
index c5a23236ee181637f890c06c539b8513bc60d6f3..7ba78b85c68258118ad2b67a6b64c8ff1052c7be 100644
--- a/modules/type/dag/CMakeLists.txt
+++ b/modules/type/dag/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_cs_dag)
   
 file(GLOB DAP_CHAIN_DAG_SRCS *.c)
diff --git a/modules/wallet/CMakeLists.txt b/modules/wallet/CMakeLists.txt
index 09fb4ddae672621fc7e43170fe2b545fd202906d..65e9b46203484dadf47906ee5895639539a9d70f 100644
--- a/modules/wallet/CMakeLists.txt
+++ b/modules/wallet/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
 project (dap_chain_wallet)
   
 file(GLOB DAP_CHAIN_WALLET_SRCS *.c)