From dfc5302017c60553c25ad59ac9191b20de237c53 Mon Sep 17 00:00:00 2001
From: Dmitriy Gerasimov <naeper@demlabs.net>
Date: Mon, 8 Feb 2021 21:54:41 +0700
Subject: [PATCH] [+] More BSD fixes

---
 dap-sdk/crypto/CMakeLists.txt                |   31 +-
 dap-sdk/crypto/include/dap_crypto_common.h   |    3 +
 dap-sdk/crypto/src/msrln/AMD64/consts.c      |   80 +-
 dap-sdk/crypto/src/msrln/AMD64/error_asm.S   |  872 ++++----
 dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c     |  130 +-
 dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S | 1956 ++++++++---------
 dap-sdk/crypto/src/msrln/kex.c               | 1287 ++++++-----
 dap-sdk/crypto/src/msrln/makefile            |  188 +-
 dap-sdk/crypto/src/msrln/msrln.h             |  272 +--
 dap-sdk/crypto/src/msrln/msrln.pri           |   12 +-
 dap-sdk/crypto/src/msrln/msrln_priv.h        |  228 +-
 dap-sdk/crypto/src/msrln/random.c            |  180 +-
 dap-sdk/crypto/src/sig_picnic/picnic_impl.c  | 1996 +++++++++---------
 13 files changed, 3628 insertions(+), 3607 deletions(-)

diff --git a/dap-sdk/crypto/CMakeLists.txt b/dap-sdk/crypto/CMakeLists.txt
index 9e9660289a..1896bd9b0c 100755
--- a/dap-sdk/crypto/CMakeLists.txt
+++ b/dap-sdk/crypto/CMakeLists.txt
@@ -64,13 +64,34 @@ if(WIN32)
 endif()
 
 if(UNIX)
-  if(BUILD_64)
-    file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s )
-  else()
-    file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c )
-  endif()
+    if(LINUX)  # NOTE(review): LINUX and BSD are predefined only by CMake >= 3.25; presumably set by the project's OS detection - confirm
+        if(BUILD_64)
+            file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s )
+        else()
+            file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c )
+        endif()
+    elseif(APPLE)  # same layout as Linux, but the 64-bit build takes the _Apple variant of the GAS source
+        if(BUILD_64)
+            file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas_Apple.s )
+        else()
+            file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c )
+        endif()
+    elseif(BSD)  # 64-bit BSD builds take the Compact64 C source instead of the x86-64 GAS assembly
+        if(BUILD_64)
+            file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Compact64/KeccakP-1600-compact64.c )
+        else()
+            file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c )
+        endif()
+    else()  # any other UNIX: Inplace32BI C source regardless of BUILD_64
+        file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c )
+    endif()
+
 endif()
 
+
+
+
+
 add_library(${PROJECT_NAME} STATIC ${CRYPTO_SRCS} ${XKCP_SRCS} ${XKCP_SRCS2} ${CRYPTO_HEADERS} )
 
 target_include_directories(dap_crypto PRIVATE src/seed src/rand src/iaes src/oaes src/sha3 src/msrln src/defeo_scheme src/sig_bliss src/sig_tesla src/sig_picnic src/sig_dilithium src include)
diff --git a/dap-sdk/crypto/include/dap_crypto_common.h b/dap-sdk/crypto/include/dap_crypto_common.h
index f213209536..67a3db6a94 100755
--- a/dap-sdk/crypto/include/dap_crypto_common.h
+++ b/dap-sdk/crypto/include/dap_crypto_common.h
@@ -34,11 +34,14 @@ extern "C" {
     #define OS_TARGET OS_LINUX
 #elif defined(__APPLE__)         // MACOS
     #define OS_TARGET OS_MACOS
+#elif defined (DAP_OS_BSD)       // BSD
+    #define OS_TARGET OS_BSD     // was "OS_TARGET_OS_BSD", which left OS_TARGET undefined; NOTE(review): OS_BSD must be defined next to OS_LINUX/OS_MACOS - confirm
 #else
     #error -- "Unsupported OS"
 #endif
 
 
+
 // Definition of compiler
 
 #define COMPILER_VC      1
diff --git a/dap-sdk/crypto/src/msrln/AMD64/consts.c b/dap-sdk/crypto/src/msrln/AMD64/consts.c
index 3ff24cbb00..9d45871ba4 100755
--- a/dap-sdk/crypto/src/msrln/AMD64/consts.c
+++ b/dap-sdk/crypto/src/msrln/AMD64/consts.c
@@ -1,40 +1,40 @@
-/****************************************************************************************
-* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
-*
-*    Copyright (c) Microsoft Corporation. All rights reserved.
-*
-*
-* Abstract: constants for the x64 assembly implementation
-*
-*****************************************************************************************/
-
-#include "../LatticeCrypto_priv.h"
-#include <stdint.h>
-
-
-uint32_t PRIME8x[8]      = {PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q};
-uint8_t ONE32x[32]       = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
-uint32_t MASK12x8[8]     = {0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff};
-uint32_t PERM0246[4]     = {0,2,4,6};
-uint32_t PERM00224466[8] = {0,0,2,2,4,4,6,6};
-uint32_t PERM02134657[8] = {0,2,1,3,4,6,5,7};
-uint64_t PERM0145[4]     = {0,1,4,5};
-uint64_t PERM2367[4]     = {2,3,6,7};
-uint64_t MASK32[4]       = {0xffffffff,0,0xffffffff,0};
-uint64_t MASK42[4]       = {0x3fff0000000,0,0x3fff0000000,0};
-
-uint64_t MASK14_1[4]     = {0x3fff,0,0x3fff,0};
-uint64_t MASK14_2[4]     = {0xFFFC000,0,0xFFFC000,0};
-uint64_t MASK14_3[4]     = {0x3FFF0000000,0,0x3FFF0000000,0};
-uint64_t MASK14_4[4]     = {0xFFFC0000000000,0,0xFFFC0000000000,0};
-
-uint32_t ONE8x[8]        = {1,1,1,1,1,1,1,1};
-uint32_t THREE8x[8]      = {3,3,3,3,3,3,3,3};
-uint32_t FOUR8x[8]       = {4,4,4,4,4,4,4,4};
-uint32_t PARAM_Q4x8[8]   = {3073,3073,3073,3073,3073,3073,3073,3073};
-uint32_t PARAM_3Q4x8[8]  = {9217,9217,9217,9217,9217,9217,9217,9217};
-uint32_t PARAM_5Q4x8[8]  = {15362,15362,15362,15362,15362,15362,15362,15362};
-uint32_t PARAM_7Q4x8[8]  = {21506,21506,21506,21506,21506,21506,21506,21506};
-uint32_t PARAM_Q2x8[8]   = {6145,6145,6145,6145,6145,6145,6145,6145};
-uint32_t PARAM_3Q2x8[8]  = {18434,18434,18434,18434,18434,18434,18434,18434};
-
+/****************************************************************************************
+* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
+*
+*    Copyright (c) Microsoft Corporation. All rights reserved.
+*
+*
+* Abstract: constants for the x64 assembly implementation
+*
+*****************************************************************************************/
+
+#include "../LatticeCrypto_priv.h"   // NOTE(review): this patch's file list shows msrln_priv.h in src/msrln/ - confirm this header still exists
+#include <stdint.h>
+
+
+uint32_t PRIME8x[8]      = {PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q};   // PARAMETER_Q in all eight 32-bit elements
+uint8_t ONE32x[32]       = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};   // bit 0 of every byte; the bit mask in error_sampling_asm
+uint32_t MASK12x8[8]     = {0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff};   // low 12 bits of every 32-bit element
+uint32_t PERM0246[4]     = {0,2,4,6};   // index table; presumably 32-bit lane selectors - confirm at the use site
+uint32_t PERM00224466[8] = {0,0,2,2,4,4,6,6};   // index table; see note on PERM0246
+uint32_t PERM02134657[8] = {0,2,1,3,4,6,5,7};   // index table; see note on PERM0246
+uint64_t PERM0145[4]     = {0,1,4,5};   // index table stored as 64-bit elements
+uint64_t PERM2367[4]     = {2,3,6,7};   // index table stored as 64-bit elements
+uint64_t MASK32[4]       = {0xffffffff,0,0xffffffff,0};   // low 32 bits of 64-bit elements 0 and 2
+uint64_t MASK42[4]       = {0x3fff0000000,0,0x3fff0000000,0};   // bits 28..41 of elements 0 and 2 (same value as MASK14_3)
+
+uint64_t MASK14_1[4]     = {0x3fff,0,0x3fff,0};   // 14-bit field at bits 0..13 of elements 0 and 2
+uint64_t MASK14_2[4]     = {0xFFFC000,0,0xFFFC000,0};   // 14-bit field at bits 14..27
+uint64_t MASK14_3[4]     = {0x3FFF0000000,0,0x3FFF0000000,0};   // 14-bit field at bits 28..41
+uint64_t MASK14_4[4]     = {0xFFFC0000000000,0,0xFFFC0000000000,0};   // 14-bit field at bits 42..55
+
+uint32_t ONE8x[8]        = {1,1,1,1,1,1,1,1};   // 1 in every 32-bit element; bit mask in helprec_asm and rec_asm
+uint32_t THREE8x[8]      = {3,3,3,3,3,3,3,3};   // initial v1 value and final "& 3" mask in helprec_asm
+uint32_t FOUR8x[8]       = {4,4,4,4,4,4,4,4};   // initial v0 value in helprec_asm
+uint32_t PARAM_Q4x8[8]   = {3073,3073,3073,3073,3073,3073,3073,3073};   // comparison threshold in helprec_asm; presumably ceil(Q/4) for Q = 12289 - PARAMETER_Q's value is not visible here, confirm
+uint32_t PARAM_3Q4x8[8]  = {9217,9217,9217,9217,9217,9217,9217,9217};   // comparison threshold; presumably ceil(3Q/4) - confirm
+uint32_t PARAM_5Q4x8[8]  = {15362,15362,15362,15362,15362,15362,15362,15362};   // comparison threshold; presumably ceil(5Q/4) - confirm
+uint32_t PARAM_7Q4x8[8]  = {21506,21506,21506,21506,21506,21506,21506,21506};   // comparison threshold; presumably ceil(7Q/4) - confirm
+uint32_t PARAM_Q2x8[8]   = {6145,6145,6145,6145,6145,6145,6145,6145};   // comparison threshold; presumably ceil(Q/2) - confirm
+uint32_t PARAM_3Q2x8[8]  = {18434,18434,18434,18434,18434,18434,18434,18434};   // comparison threshold; presumably ceil(3Q/2) - confirm
+
diff --git a/dap-sdk/crypto/src/msrln/AMD64/error_asm.S b/dap-sdk/crypto/src/msrln/AMD64/error_asm.S
index 828816af04..836e47d8d7 100755
--- a/dap-sdk/crypto/src/msrln/AMD64/error_asm.S
+++ b/dap-sdk/crypto/src/msrln/AMD64/error_asm.S
@@ -1,436 +1,436 @@
-//****************************************************************************************
-// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
-//
-//    Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-// Abstract: functions for error sampling and reconciliation in x64 assembly using AVX2 
-//           vector instructions for Linux 
-//
-//****************************************************************************************  
-
-.intel_syntax noprefix 
-
-// Registers that are used for parameter passing:
-#define reg_p1  rdi
-#define reg_p2  rsi
-#define reg_p3  rdx
-#define reg_p4  rcx
-#define reg_p5  r8
-
-
-.text
-//***********************************************************************
-//  Error sampling from psi_12
-//  Operation: c [reg_p2] <- sampling(a) [reg_p1]
-//*********************************************************************** 
-.global error_sampling_asm
-error_sampling_asm:  
-  vmovdqu    ymm7, ONE32x 
-  movq       r11, 384
-  movq       r10, 32
-  movq       r8, 24
-  xor        rax, rax
-  xor        rcx, rcx
-loop1:
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]        // sample
-  vmovdqu    ymm2, YMMWORD PTR [reg_p1+4*rax+32]     // sample
-  vmovdqu    ymm4, YMMWORD PTR [reg_p1+4*rax+64]     // sample
-  movq       r9, 2
-
-loop1b:
-  vpand      ymm1, ymm0, ymm7                        // Collecting 8 bits for first sample
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm0, ymm0, 1 
-  vpand      ymm3, ymm0, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  
-  vpand      ymm3, ymm2, ymm7                        // Adding next 4 bits
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm2, ymm2, 1 
-  vpand      ymm3, ymm2, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm2, ymm2, 1 
-  vpand      ymm3, ymm2, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  vpsrlw     ymm2, ymm2, 1 
-  vpand      ymm3, ymm2, ymm7
-  vpaddb     ymm1, ymm1, ymm3
-  
-  vpsrlw     ymm2, ymm2, 1                           // Collecting 4-bits for second sample
-  vpand      ymm5, ymm2, ymm7
-  vpsrlw     ymm2, ymm2, 1 
-  vpand      ymm3, ymm2, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm2, ymm2, 1 
-  vpand      ymm3, ymm2, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm2, ymm2, 1 
-  vpand      ymm3, ymm2, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  
-  vpand      ymm3, ymm4, ymm7                        // Adding next 8 bits
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-  vpsrlw     ymm4, ymm4, 1 
-  vpand      ymm3, ymm4, ymm7
-  vpaddb     ymm5, ymm5, ymm3
-
-  vpsubb     ymm5, ymm1, ymm5
-  vpermq     ymm3, ymm5, 0x0e 
-  vpmovsxbd  ymm6, xmm5
-  vpsrldq    ymm5, ymm5, 8 
-  vpmovsxbd  ymm7, xmm5 
-  vpmovsxbd  ymm8, xmm3
-  vpsrldq    ymm3, ymm3, 8 
-  vpmovsxbd  ymm9, xmm3
-  vmovdqu    YMMWORD PTR [reg_p2+4*rcx], ymm6
-  vmovdqu    YMMWORD PTR [reg_p2+4*rcx+32], ymm7
-  vmovdqu    YMMWORD PTR [reg_p2+4*rcx+64], ymm8
-  vmovdqu    YMMWORD PTR [reg_p2+4*rcx+96], ymm9
-  
-  add        rcx, r10        // i+32
-  vpsrlw     ymm0, ymm0, 1 
-  vpsrlw     ymm2, ymm2, 1 
-  vpsrlw     ymm4, ymm4, 1 
-  dec        r9
-  jnz        loop1b
-        
-  add        rax, r8         // j+24        
-  cmp        rax, r11
-  jl         loop1
-  ret
-
-
-//***********************************************************************
-//  Reconciliation helper function
-//  Operation: c [reg_p2] <- function(a) [reg_p1]
-//             [reg_p3] points to random bits
-//*********************************************************************** 
-.global helprec_asm
-helprec_asm:  
-  vmovdqu    ymm8, ONE8x 
-  movq       r11, 256
-  movq       r10, 8
-  xor        rax, rax
-  vmovdqu    ymm4, YMMWORD PTR [reg_p3]              // rbits
-loop2:
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]        // x
-  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*rax+4*256]  // x+256
-  vmovdqu    ymm2, YMMWORD PTR [reg_p1+4*rax+4*512]  // x+512
-  vmovdqu    ymm3, YMMWORD PTR [reg_p1+4*rax+4*768]  // x+768
-
-  vpand      ymm5, ymm4, ymm8                        // Collecting 8 random bits
-  vpslld     ymm0, ymm0, 1                           // 2*x - rbits
-  vpslld     ymm1, ymm1, 1 
-  vpslld     ymm2, ymm2, 1 
-  vpslld     ymm3, ymm3, 1 
-  vpsubd     ymm0, ymm0, ymm5
-  vpsubd     ymm1, ymm1, ymm5
-  vpsubd     ymm2, ymm2, ymm5
-  vpsubd     ymm3, ymm3, ymm5
-    
-  vmovdqu    ymm15, PARAM_Q4x8 
-  vmovdqu    ymm7, FOUR8x
-  vmovdqu    ymm8, ymm7
-  vmovdqu    ymm9, ymm7
-  vmovdqu    ymm10, ymm7
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm7, ymm7, ymm6
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm8, ymm8, ymm6
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm9, ymm9, ymm6
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm10, ymm10, ymm6
-  vmovdqu    ymm15, PARAM_3Q4x8 
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm7, ymm7, ymm6
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm8, ymm8, ymm6
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm9, ymm9, ymm6
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm10, ymm10, ymm6
-  vmovdqu    ymm15, PARAM_5Q4x8 
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm7, ymm7, ymm6
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm8, ymm8, ymm6
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm9, ymm9, ymm6
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm10, ymm10, ymm6
-  vmovdqu    ymm15, PARAM_7Q4x8 
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm7, ymm7, ymm6                        // v0[0]
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm8, ymm8, ymm6                        // v0[1]
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm9, ymm9, ymm6                        // v0[2]
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm10, ymm10, ymm6                      // v0[3]  
-    
-  vmovdqu    ymm15, PARAM_Q2x8 
-  vmovdqu    ymm11, THREE8x
-  vmovdqu    ymm12, ymm11
-  vmovdqu    ymm13, ymm11
-  vmovdqu    ymm14, ymm11
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm11, ymm11, ymm6
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm12, ymm12, ymm6
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm13, ymm13, ymm6
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm14, ymm14, ymm6
-  vmovdqu    ymm15, PARAM_3Q2x8 
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm11, ymm11, ymm6
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm12, ymm12, ymm6
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm13, ymm13, ymm6
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm14, ymm14, ymm6
-  vmovdqu    ymm15, PRIME8x  
-  vpsubd     ymm6, ymm0, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm11, ymm11, ymm6                      // v1[0]
-  vpsubd     ymm6, ymm1, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm12, ymm12, ymm6                      // v1[1]
-  vpsubd     ymm6, ymm2, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm13, ymm13, ymm6                      // v1[2]
-  vpsubd     ymm6, ymm3, ymm15
-  vpsrld     ymm6, ymm6, 31 
-  vpsubd     ymm14, ymm14, ymm6                      // v1[3]
-
-  vpmulld    ymm6, ymm7, ymm15 
-  vpslld     ymm0, ymm0, 1 
-  vpsubd     ymm0, ymm0, ymm6
-  vpabsd     ymm0, ymm0
-  vpmulld    ymm6, ymm8, ymm15 
-  vpslld     ymm1, ymm1, 1 
-  vpsubd     ymm1, ymm1, ymm6
-  vpabsd     ymm1, ymm1
-  vpaddd     ymm0, ymm0, ymm1
-  vpmulld    ymm6, ymm9, ymm15 
-  vpslld     ymm2, ymm2, 1 
-  vpsubd     ymm2, ymm2, ymm6
-  vpabsd     ymm2, ymm2
-  vpaddd     ymm0, ymm0, ymm2
-  vpmulld    ymm6, ymm10, ymm15 
-  vpslld     ymm3, ymm3, 1 
-  vpsubd     ymm3, ymm3, ymm6
-  vpabsd     ymm3, ymm3
-  vpaddd     ymm0, ymm0, ymm3                        // norm
-  vpsubd     ymm0, ymm0, ymm15
-  vpsrad     ymm0, ymm0, 31                          // If norm < q then norm = 0xff...ff, else norm = 0
-  
-  vpxor      ymm7, ymm7, ymm11                       // v0[i] = (norm & (v0[i] ^ v1[i])) ^ v1[i]
-  vpand      ymm7, ymm7, ymm0
-  vpxor      ymm7, ymm7, ymm11
-  vpxor      ymm8, ymm8, ymm12
-  vpand      ymm8, ymm8, ymm0
-  vpxor      ymm8, ymm8, ymm12
-  vpxor      ymm9, ymm9, ymm13
-  vpand      ymm9, ymm9, ymm0
-  vpxor      ymm9, ymm9, ymm13
-  vpxor      ymm10, ymm10, ymm14
-  vpand      ymm10, ymm10, ymm0
-  vpxor      ymm10, ymm10, ymm14
-  
-  vmovdqu    ymm15, THREE8x
-  vmovdqu    ymm14, ONE8x
-  vpsubd     ymm7, ymm7, ymm10
-  vpand      ymm7, ymm7, ymm15
-  vpsubd     ymm8, ymm8, ymm10
-  vpand      ymm8, ymm8, ymm15
-  vpsubd     ymm9, ymm9, ymm10
-  vpand      ymm9, ymm9, ymm15 
-  vpslld     ymm10, ymm10, 1 
-  vpxor      ymm0, ymm0, ymm14
-  vpand      ymm0, ymm0, ymm14
-  vpaddd     ymm10, ymm0, ymm10
-  vpand      ymm10, ymm10, ymm15 
-  
-  vpsrld     ymm4, ymm4, 1 
-  vmovdqu    YMMWORD PTR [reg_p2+4*rax], ymm7
-  vmovdqu    YMMWORD PTR [reg_p2+4*rax+4*256], ymm8
-  vmovdqu    YMMWORD PTR [reg_p2+4*rax+4*512], ymm9
-  vmovdqu    YMMWORD PTR [reg_p2+4*rax+4*768], ymm10
-
-  add        rax, r10             // j+8 
-  add        rcx, r9
-  cmp        rax, r11             
-  jl         loop2
-  ret
-
-
-//***********************************************************************
-//  Reconciliation function
-//  Operation: c [reg_p3] <- function(a [reg_p1], b [reg_p2])
-//*********************************************************************** 
-.global rec_asm
-rec_asm:  
-  vpxor      ymm12, ymm12, ymm12 
-  vmovdqu    ymm15, PRIME8x   
-  vpslld     ymm14, ymm15, 2                         // 4*Q  
-  vpslld     ymm13, ymm15, 3                         // 8*Q
-  vpsubd     ymm12, ymm12, ymm13                     // -8*Q
-  vpxor      ymm11, ymm12, ymm13                     // 8*Q ^ -8*Q
-  vmovdqu    ymm10, ONE8x 
-  movq       r11, 256
-  movq       r10, 8
-  xor        rax, rax
-  xor        rcx, rcx
-loop3:
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]        // x
-  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*rax+4*256]  // x+256
-  vmovdqu    ymm2, YMMWORD PTR [reg_p1+4*rax+4*512]  // x+512
-  vmovdqu    ymm3, YMMWORD PTR [reg_p1+4*rax+4*768]  // x+768
-  vmovdqu    ymm4, YMMWORD PTR [reg_p2+4*rax]        // rvec
-  vmovdqu    ymm5, YMMWORD PTR [reg_p2+4*rax+4*256]  // rvec+256
-  vmovdqu    ymm6, YMMWORD PTR [reg_p2+4*rax+4*512]  // rvec+512
-  vmovdqu    ymm7, YMMWORD PTR [reg_p2+4*rax+4*768]  // rvec+768
-  
-  vpslld     ymm8, ymm4, 1                           // 2*rvec + rvec
-  vpaddd     ymm4, ymm7, ymm8
-  vpslld     ymm8, ymm5, 1 
-  vpaddd     ymm5, ymm7, ymm8
-  vpslld     ymm8, ymm6, 1 
-  vpaddd     ymm6, ymm7, ymm8
-  vpmulld    ymm4, ymm4, ymm15
-  vpmulld    ymm5, ymm5, ymm15
-  vpmulld    ymm6, ymm6, ymm15
-  vpmulld    ymm7, ymm7, ymm15
-  vpslld     ymm0, ymm0, 3                           // 8*x
-  vpslld     ymm1, ymm1, 3 
-  vpslld     ymm2, ymm2, 3 
-  vpslld     ymm3, ymm3, 3 
-  vpsubd     ymm0, ymm0, ymm4                        // t[i]
-  vpsubd     ymm1, ymm1, ymm5
-  vpsubd     ymm2, ymm2, ymm6
-  vpsubd     ymm3, ymm3, ymm7
-  
-  vpsrad     ymm8, ymm0, 31                          // mask1
-  vpabsd     ymm4, ymm0
-  vpsubd     ymm4, ymm14, ymm4
-  vpsrad     ymm4, ymm4, 31                          // mask2                       
-  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
-  vpxor      ymm8, ymm8, ymm12
-  vpand      ymm4, ymm4, ymm8
-  vpaddd     ymm0, ymm0, ymm4
-  vpabsd     ymm0, ymm0  
-  vpsrad     ymm8, ymm1, 31                          // mask1
-  vpabsd     ymm4, ymm1
-  vpsubd     ymm4, ymm14, ymm4
-  vpsrad     ymm4, ymm4, 31                          // mask2                       
-  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
-  vpxor      ymm8, ymm8, ymm12
-  vpand      ymm4, ymm4, ymm8
-  vpaddd     ymm1, ymm1, ymm4
-  vpabsd     ymm1, ymm1
-  vpaddd     ymm0, ymm0, ymm1
-  vpsrad     ymm8, ymm2, 31                          // mask1
-  vpabsd     ymm4, ymm2
-  vpsubd     ymm4, ymm14, ymm4
-  vpsrad     ymm4, ymm4, 31                          // mask2                       
-  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
-  vpxor      ymm8, ymm8, ymm12
-  vpand      ymm4, ymm4, ymm8
-  vpaddd     ymm2, ymm2, ymm4
-  vpabsd     ymm2, ymm2
-  vpaddd     ymm0, ymm0, ymm2
-  vpsrad     ymm8, ymm3, 31                          // mask1
-  vpabsd     ymm4, ymm3
-  vpsubd     ymm4, ymm14, ymm4
-  vpsrad     ymm4, ymm4, 31                          // mask2                       
-  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
-  vpxor      ymm8, ymm8, ymm12
-  vpand      ymm4, ymm4, ymm8
-  vpaddd     ymm3, ymm3, ymm4
-  vpabsd     ymm3, ymm3
-  vpaddd     ymm0, ymm0, ymm3                        // norm
-
-  vpsubd     ymm0, ymm13, ymm0                       // If norm < PARAMETER_Q then result = 1, else result = 0
-  vpsrld     ymm0, ymm0, 31                            
-  vpxor      ymm0, ymm0, ymm10
-
-  vpsrlq     ymm1, ymm0, 31
-  vpor       ymm1, ymm0, ymm1 
-  vpsllq     ymm2, ymm1, 2
-  vpsrldq    ymm2, ymm2, 8
-  vpor       ymm1, ymm2, ymm1 
-  vpsllq     ymm2, ymm1, 4
-  vpermq     ymm2, ymm2, 0x56
-  vpor       ymm0, ymm1, ymm2 
-  vmovq      r9, xmm0
-  
-  mov        BYTE PTR [reg_p3+rcx], r9b
-
-  add        rax, r10             // j+8 
-  inc        rcx
-  cmp        rax, r11             
-  jl         loop3
-  ret
+//****************************************************************************************
+// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
+//
+//    Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+// Abstract: functions for error sampling and reconciliation in x64 assembly using AVX2 
+//           vector instructions for Linux 
+//
+//****************************************************************************************  
+
+.intel_syntax noprefix 
+
+// Registers that are used for parameter passing:
+#define reg_p1  rdi
+#define reg_p2  rsi
+#define reg_p3  rdx
+#define reg_p4  rcx
+#define reg_p5  r8
+
+
+.text
+//***********************************************************************
+//  Error sampling from psi_12
+//  Operation: c [reg_p2] <- sampling(a) [reg_p1]
+//*********************************************************************** 
+.global error_sampling_asm
+error_sampling_asm:  
+  vmovdqu    ymm7, ONE32x                            // ymm7 = mask selecting bit 0 of every byte
+  movq       r11, 384                                // loop1 ends once rax reaches this bound
+  movq       r10, 32                                 // output index step per loop1b pass
+  movq       r8, 24                                  // input index step per loop1 pass
+  xor        rax, rax                                // rax = input index j (scaled by 4 when addressing)
+  xor        rcx, rcx                                // rcx = output index i (scaled by 4 when addressing)
+loop1:
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]        // sample
+  vmovdqu    ymm2, YMMWORD PTR [reg_p1+4*rax+32]     // sample
+  vmovdqu    ymm4, YMMWORD PTR [reg_p1+4*rax+64]     // sample
+  movq       r9, 2                                   // loop1b runs twice per loop1 pass
+
+loop1b:
+  vpand      ymm1, ymm0, ymm7                        // Collecting 8 bits for first sample
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm0, ymm0, 1 
+  vpand      ymm3, ymm0, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  
+  vpand      ymm3, ymm2, ymm7                        // Adding next 4 bits
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm2, ymm2, 1 
+  vpand      ymm3, ymm2, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm2, ymm2, 1 
+  vpand      ymm3, ymm2, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  vpsrlw     ymm2, ymm2, 1 
+  vpand      ymm3, ymm2, ymm7
+  vpaddb     ymm1, ymm1, ymm3
+  
+  vpsrlw     ymm2, ymm2, 1                           // Collecting 4-bits for second sample
+  vpand      ymm5, ymm2, ymm7
+  vpsrlw     ymm2, ymm2, 1 
+  vpand      ymm3, ymm2, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm2, ymm2, 1 
+  vpand      ymm3, ymm2, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm2, ymm2, 1 
+  vpand      ymm3, ymm2, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  
+  vpand      ymm3, ymm4, ymm7                        // Adding next 8 bits
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+  vpsrlw     ymm4, ymm4, 1 
+  vpand      ymm3, ymm4, ymm7
+  vpaddb     ymm5, ymm5, ymm3
+
+  vpsubb     ymm5, ymm1, ymm5                        // per-byte difference of the two accumulated bit counts
+  vpermq     ymm3, ymm5, 0x0e                        // upper 128 bits of ymm5 moved into the low half of ymm3
+  vpmovsxbd  ymm6, xmm5                              // sign-extend bytes 0..7 to 32-bit elements
+  vpsrldq    ymm5, ymm5, 8 
+  vpmovsxbd  ymm7, xmm5                              // NOTE(review): overwrites the ONE32x mask in ymm7, which is not reloaded before the jumps back to loop1b/loop1 - confirm
+  vpmovsxbd  ymm8, xmm3
+  vpsrldq    ymm3, ymm3, 8 
+  vpmovsxbd  ymm9, xmm3
+  vmovdqu    YMMWORD PTR [reg_p2+4*rcx], ymm6        // store 32 results as 32-bit elements (4 x 32 bytes)
+  vmovdqu    YMMWORD PTR [reg_p2+4*rcx+32], ymm7
+  vmovdqu    YMMWORD PTR [reg_p2+4*rcx+64], ymm8
+  vmovdqu    YMMWORD PTR [reg_p2+4*rcx+96], ymm9
+  
+  add        rcx, r10        // i+32
+  vpsrlw     ymm0, ymm0, 1                           // one more shift on each input register before the next loop1b pass
+  vpsrlw     ymm2, ymm2, 1 
+  vpsrlw     ymm4, ymm4, 1 
+  dec        r9
+  jnz        loop1b
+        
+  add        rax, r8         // j+24        
+  cmp        rax, r11
+  jl         loop1
+  ret
+
+
+//***********************************************************************
+//  Reconciliation helper function
+//  Operation: c [reg_p2] <- function(a) [reg_p1]
+//             [reg_p3] points to random bits
+//*********************************************************************** 
+.global helprec_asm
+helprec_asm:  
+  vmovdqu    ymm8, ONE8x                             // ymm8 = 1 in every 32-bit element (bit mask for rbits)
+  movq       r11, 256                                // loop2 ends once rax reaches this bound
+  movq       r10, 8                                  // index step per loop2 pass
+  xor        rax, rax                                // rax = element index j
+  vmovdqu    ymm4, YMMWORD PTR [reg_p3]              // rbits
+loop2:
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]        // x
+  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*rax+4*256]  // x+256
+  vmovdqu    ymm2, YMMWORD PTR [reg_p1+4*rax+4*512]  // x+512
+  vmovdqu    ymm3, YMMWORD PTR [reg_p1+4*rax+4*768]  // x+768
+
+  vpand      ymm5, ymm4, ymm8                        // Collecting 8 random bits. NOTE(review): ymm8 is overwritten below (vmovdqu ymm8, ymm7) and not reloaded, so it holds ONE8x only on the first pass - confirm
+  vpslld     ymm0, ymm0, 1                           // 2*x - rbits
+  vpslld     ymm1, ymm1, 1 
+  vpslld     ymm2, ymm2, 1 
+  vpslld     ymm3, ymm3, 1 
+  vpsubd     ymm0, ymm0, ymm5
+  vpsubd     ymm1, ymm1, ymm5
+  vpsubd     ymm2, ymm2, ymm5
+  vpsubd     ymm3, ymm3, ymm5
+    
+  vmovdqu    ymm15, PARAM_Q4x8                       // first of four thresholds used to build v0
+  vmovdqu    ymm7, FOUR8x                            // v0 accumulators (ymm7..ymm10) start at 4
+  vmovdqu    ymm8, ymm7
+  vmovdqu    ymm9, ymm7
+  vmovdqu    ymm10, ymm7
+  vpsubd     ymm6, ymm0, ymm15                       // value - threshold
+  vpsrld     ymm6, ymm6, 31                          // keep only the sign bit of the difference (0 or 1)
+  vpsubd     ymm7, ymm7, ymm6                        // accumulator -= sign bit; same pattern repeats below
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm8, ymm8, ymm6
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm9, ymm9, ymm6
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm10, ymm10, ymm6
+  vmovdqu    ymm15, PARAM_3Q4x8 
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm7, ymm7, ymm6
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm8, ymm8, ymm6
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm9, ymm9, ymm6
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm10, ymm10, ymm6
+  vmovdqu    ymm15, PARAM_5Q4x8 
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm7, ymm7, ymm6
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm8, ymm8, ymm6
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm9, ymm9, ymm6
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm10, ymm10, ymm6
+  vmovdqu    ymm15, PARAM_7Q4x8 
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm7, ymm7, ymm6                        // v0[0]
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm8, ymm8, ymm6                        // v0[1]
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm9, ymm9, ymm6                        // v0[2]
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm10, ymm10, ymm6                      // v0[3]  
+    
+  vmovdqu    ymm15, PARAM_Q2x8                       // first of three thresholds used to build v1
+  vmovdqu    ymm11, THREE8x                          // v1 accumulators (ymm11..ymm14) start at 3
+  vmovdqu    ymm12, ymm11
+  vmovdqu    ymm13, ymm11
+  vmovdqu    ymm14, ymm11
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm11, ymm11, ymm6
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm12, ymm12, ymm6
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm13, ymm13, ymm6
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm14, ymm14, ymm6
+  vmovdqu    ymm15, PARAM_3Q2x8 
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm11, ymm11, ymm6
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm12, ymm12, ymm6
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm13, ymm13, ymm6
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm14, ymm14, ymm6
+  vmovdqu    ymm15, PRIME8x                          // ymm15 = q; kept for the multiplications and norm test below
+  vpsubd     ymm6, ymm0, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm11, ymm11, ymm6                      // v1[0]
+  vpsubd     ymm6, ymm1, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm12, ymm12, ymm6                      // v1[1]
+  vpsubd     ymm6, ymm2, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm13, ymm13, ymm6                      // v1[2]
+  vpsubd     ymm6, ymm3, ymm15
+  vpsrld     ymm6, ymm6, 31 
+  vpsubd     ymm14, ymm14, ymm6                      // v1[3]
+
+  vpmulld    ymm6, ymm7, ymm15                       // v0[0] * q
+  vpslld     ymm0, ymm0, 1                           // double the working value
+  vpsubd     ymm0, ymm0, ymm6
+  vpabsd     ymm0, ymm0                              // absolute value of (doubled value - v0[0]*q)
+  vpmulld    ymm6, ymm8, ymm15 
+  vpslld     ymm1, ymm1, 1 
+  vpsubd     ymm1, ymm1, ymm6
+  vpabsd     ymm1, ymm1
+  vpaddd     ymm0, ymm0, ymm1
+  vpmulld    ymm6, ymm9, ymm15 
+  vpslld     ymm2, ymm2, 1 
+  vpsubd     ymm2, ymm2, ymm6
+  vpabsd     ymm2, ymm2
+  vpaddd     ymm0, ymm0, ymm2
+  vpmulld    ymm6, ymm10, ymm15 
+  vpslld     ymm3, ymm3, 1 
+  vpsubd     ymm3, ymm3, ymm6
+  vpabsd     ymm3, ymm3
+  vpaddd     ymm0, ymm0, ymm3                        // norm
+  vpsubd     ymm0, ymm0, ymm15
+  vpsrad     ymm0, ymm0, 31                          // If norm < q then norm = 0xff...ff, else norm = 0
+  
+  vpxor      ymm7, ymm7, ymm11                       // v0[i] = (norm & (v0[i] ^ v1[i])) ^ v1[i]
+  vpand      ymm7, ymm7, ymm0
+  vpxor      ymm7, ymm7, ymm11
+  vpxor      ymm8, ymm8, ymm12
+  vpand      ymm8, ymm8, ymm0
+  vpxor      ymm8, ymm8, ymm12
+  vpxor      ymm9, ymm9, ymm13
+  vpand      ymm9, ymm9, ymm0
+  vpxor      ymm9, ymm9, ymm13
+  vpxor      ymm10, ymm10, ymm14
+  vpand      ymm10, ymm10, ymm0
+  vpxor      ymm10, ymm10, ymm14
+  
+  vmovdqu    ymm15, THREE8x                          // mask keeping the low two bits of each result
+  vmovdqu    ymm14, ONE8x
+  vpsubd     ymm7, ymm7, ymm10                       // outputs 0..2 = (selected v[i] - selected v[3]) & 3
+  vpand      ymm7, ymm7, ymm15
+  vpsubd     ymm8, ymm8, ymm10
+  vpand      ymm8, ymm8, ymm15
+  vpsubd     ymm9, ymm9, ymm10
+  vpand      ymm9, ymm9, ymm15 
+  vpslld     ymm10, ymm10, 1                         // output 3 = (2*selected v[3] + (1 when the norm mask is clear)) & 3
+  vpxor      ymm0, ymm0, ymm14
+  vpand      ymm0, ymm0, ymm14
+  vpaddd     ymm10, ymm0, ymm10
+  vpand      ymm10, ymm10, ymm15 
+  
+  vpsrld     ymm4, ymm4, 1                           // shift rbits right by one for the next pass
+  vmovdqu    YMMWORD PTR [reg_p2+4*rax], ymm7
+  vmovdqu    YMMWORD PTR [reg_p2+4*rax+4*256], ymm8
+  vmovdqu    YMMWORD PTR [reg_p2+4*rax+4*512], ymm9
+  vmovdqu    YMMWORD PTR [reg_p2+4*rax+4*768], ymm10
+
+  add        rax, r10             // j+8 
+  add        rcx, r9              // NOTE(review): rcx and r9 are neither initialised nor read elsewhere in helprec_asm; looks like a leftover - confirm
+  cmp        rax, r11             
+  jl         loop2
+  ret
+
+
+//***********************************************************************
+//  Reconciliation function: stores one byte (8 result bits) at [reg_p3] per group of 8 indices j, for j = 0..255
+//  Operation: c [reg_p3] <- function(a [reg_p1], b [reg_p2]); a and b are each read at dword offsets j, j+256, j+512, j+768
+//*********************************************************************** 
+.global rec_asm
+rec_asm:  
+  vpxor      ymm12, ymm12, ymm12                     // 0
+  vmovdqu    ymm15, PRIME8x                          // "Q" in the comments below is the PRIME8x constant (defined outside this view)
+  vpslld     ymm14, ymm15, 2                         // 4*Q  
+  vpslld     ymm13, ymm15, 3                         // 8*Q
+  vpsubd     ymm12, ymm12, ymm13                     // -8*Q
+  vpxor      ymm11, ymm12, ymm13                     // 8*Q ^ -8*Q
+  vmovdqu    ymm10, ONE8x                            // ONE8x constant (defined outside this view)
+  movq       r11, 256                                // loop bound for j (rax)
+  movq       r10, 8                                  // j step: 8 dwords (one ymm register) per iteration
+  xor        rax, rax                                // j = 0
+  xor        rcx, rcx                                // output byte index = 0
+loop3:
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]        // x
+  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*rax+4*256]  // x+256
+  vmovdqu    ymm2, YMMWORD PTR [reg_p1+4*rax+4*512]  // x+512
+  vmovdqu    ymm3, YMMWORD PTR [reg_p1+4*rax+4*768]  // x+768
+  vmovdqu    ymm4, YMMWORD PTR [reg_p2+4*rax]        // rvec
+  vmovdqu    ymm5, YMMWORD PTR [reg_p2+4*rax+4*256]  // rvec+256
+  vmovdqu    ymm6, YMMWORD PTR [reg_p2+4*rax+4*512]  // rvec+512
+  vmovdqu    ymm7, YMMWORD PTR [reg_p2+4*rax+4*768]  // rvec+768
+  
+  vpslld     ymm8, ymm4, 1                           // 2*rvec[j]
+  vpaddd     ymm4, ymm7, ymm8                        // 2*rvec[j] + rvec[j+768]
+  vpslld     ymm8, ymm5, 1                           // 2*rvec[j+256]
+  vpaddd     ymm5, ymm7, ymm8                        // 2*rvec[j+256] + rvec[j+768]
+  vpslld     ymm8, ymm6, 1                           // 2*rvec[j+512]
+  vpaddd     ymm6, ymm7, ymm8                        // 2*rvec[j+512] + rvec[j+768]
+  vpmulld    ymm4, ymm4, ymm15                       // (2*rvec[j] + rvec[j+768])*Q
+  vpmulld    ymm5, ymm5, ymm15                       // (2*rvec[j+256] + rvec[j+768])*Q
+  vpmulld    ymm6, ymm6, ymm15                       // (2*rvec[j+512] + rvec[j+768])*Q
+  vpmulld    ymm7, ymm7, ymm15                       // rvec[j+768]*Q
+  vpslld     ymm0, ymm0, 3                           // 8*x
+  vpslld     ymm1, ymm1, 3                           // 8*x[j+256]
+  vpslld     ymm2, ymm2, 3                           // 8*x[j+512]
+  vpslld     ymm3, ymm3, 3                           // 8*x[j+768]
+  vpsubd     ymm0, ymm0, ymm4                        // t[0] = 8*x[j] - (2*rvec[j] + rvec[j+768])*Q
+  vpsubd     ymm1, ymm1, ymm5                        // t[1] = 8*x[j+256] - (2*rvec[j+256] + rvec[j+768])*Q
+  vpsubd     ymm2, ymm2, ymm6                        // t[2] = 8*x[j+512] - (2*rvec[j+512] + rvec[j+768])*Q
+  vpsubd     ymm3, ymm3, ymm7                        // t[3] = 8*x[j+768] - rvec[j+768]*Q
+  
+  vpsrad     ymm8, ymm0, 31                          // mask1 = all ones where t[0] < 0
+  vpabsd     ymm4, ymm0                              // |t[0]|
+  vpsubd     ymm4, ymm14, ymm4                       // 4*Q - |t[0]|
+  vpsrad     ymm4, ymm4, 31                          // mask2 = all ones where |t[0]| > 4*Q
+  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
+  vpxor      ymm8, ymm8, ymm12                       // = 8*Q where t[0] < 0, else -8*Q
+  vpand      ymm4, ymm4, ymm8                        // keep that correction only where mask2 is set
+  vpaddd     ymm0, ymm0, ymm4                        // t[0] + correction
+  vpabsd     ymm0, ymm0                              // absolute value; starts the norm accumulator in ymm0
+  vpsrad     ymm8, ymm1, 31                          // mask1 = all ones where t[1] < 0
+  vpabsd     ymm4, ymm1                              // |t[1]|
+  vpsubd     ymm4, ymm14, ymm4                       // 4*Q - |t[1]|
+  vpsrad     ymm4, ymm4, 31                          // mask2 = all ones where |t[1]| > 4*Q
+  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
+  vpxor      ymm8, ymm8, ymm12                       // = 8*Q where t[1] < 0, else -8*Q
+  vpand      ymm4, ymm4, ymm8                        // keep that correction only where mask2 is set
+  vpaddd     ymm1, ymm1, ymm4                        // t[1] + correction
+  vpabsd     ymm1, ymm1                              // absolute value
+  vpaddd     ymm0, ymm0, ymm1                        // accumulate into norm
+  vpsrad     ymm8, ymm2, 31                          // mask1 = all ones where t[2] < 0
+  vpabsd     ymm4, ymm2                              // |t[2]|
+  vpsubd     ymm4, ymm14, ymm4                       // 4*Q - |t[2]|
+  vpsrad     ymm4, ymm4, 31                          // mask2 = all ones where |t[2]| > 4*Q
+  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
+  vpxor      ymm8, ymm8, ymm12                       // = 8*Q where t[2] < 0, else -8*Q
+  vpand      ymm4, ymm4, ymm8                        // keep that correction only where mask2 is set
+  vpaddd     ymm2, ymm2, ymm4                        // t[2] + correction
+  vpabsd     ymm2, ymm2                              // absolute value
+  vpaddd     ymm0, ymm0, ymm2                        // accumulate into norm
+  vpsrad     ymm8, ymm3, 31                          // mask1 = all ones where t[3] < 0
+  vpabsd     ymm4, ymm3                              // |t[3]|
+  vpsubd     ymm4, ymm14, ymm4                       // 4*Q - |t[3]|
+  vpsrad     ymm4, ymm4, 31                          // mask2 = all ones where |t[3]| > 4*Q
+  vpand      ymm8, ymm8, ymm11                       // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q
+  vpxor      ymm8, ymm8, ymm12                       // = 8*Q where t[3] < 0, else -8*Q
+  vpand      ymm4, ymm4, ymm8                        // keep that correction only where mask2 is set
+  vpaddd     ymm3, ymm3, ymm4                        // t[3] + correction
+  vpabsd     ymm3, ymm3                              // absolute value
+  vpaddd     ymm0, ymm0, ymm3                        // norm
+
+  vpsubd     ymm0, ymm13, ymm0                       // 8*Q - norm: negative where norm > 8*Q (the bound used is ymm13 = 8*Q)
+  vpsrld     ymm0, ymm0, 31                          // sign bit per dword: 1 where norm > 8*Q, else 0
+  vpxor      ymm0, ymm0, ymm10                       // XOR with ONE8x; presumably flips this to result = 1 where norm <= 8*Q (ONE8x value not visible here - confirm)
+
+  vpsrlq     ymm1, ymm0, 31                          // per 64-bit lane: bring the odd dword's result bit down to bit 1
+  vpor       ymm1, ymm0, ymm1                        // per 64-bit lane: bit 0 = even dword, bit 1 = odd dword
+  vpsllq     ymm2, ymm1, 2                           // shifted copy with that bit pair at bits 2..3
+  vpsrldq    ymm2, ymm2, 8                           // per 128-bit lane: upper qword moved down into the lower qword
+  vpor       ymm1, ymm2, ymm1                        // lower qword of each 128-bit lane: bits 0..3 = its four dword results
+  vpsllq     ymm2, ymm1, 4                           // shifted copy with those four bits at bits 4..7
+  vpermq     ymm2, ymm2, 0x56                        // 0x56 selects qword 2 into qword 0: upper 128-bit lane's four bits, now at bits 4..7
+  vpor       ymm0, ymm1, ymm2                        // low byte of ymm0 = the 8 result bits, dword 0 in bit 0
+  vmovq      r9, xmm0                                // low 64 bits of ymm0 into r9
+  
+  mov        BYTE PTR [reg_p3+rcx], r9b              // store the packed byte at [reg_p3 + rcx]
+
+  add        rax, r10             // j+8 
+  inc        rcx                  // next output byte
+  cmp        rax, r11             // loop while j < 256
+  jl         loop3
+  ret
diff --git a/dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c b/dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c
index ef846a4841..d39e95e779 100755
--- a/dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c
+++ b/dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c
@@ -1,65 +1,65 @@
-/****************************************************************************************
-* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
-*
-*    Copyright (c) Microsoft Corporation. All rights reserved.
-*
-*
-* Abstract: NTT functions and other low-level operations
-*
-*****************************************************************************************/
-
-#include "../LatticeCrypto_priv.h"
-    
-
-void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N)
-{
-    NTT_CT_std2rev_12289_asm(a, psi_rev, N);
-}
-
-
-void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N)
-{
-    INTT_GS_rev2std_12289_asm(a, omegainv_rev, omegainv1N_rev, Ninv, N);
-}
-
-
-void two_reduce12289(int32_t* a, unsigned int N)
-{
-    two_reduce12289_asm(a, N);
-}
-
-
-void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N)
-{
-    pmul_asm(a, b, c, N);
-}
-
-
-void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N)
-{
-    pmuladd_asm(a, b, c, d, N);
-}
-
-
-void smul(int32_t* a, int32_t scalar, unsigned int N)
-{
-    unsigned int i; 
-
-    for (i = 0; i < N; i++) {
-        a[i] = a[i]*scalar;
-    }
-}
-
-
-void correction(int32_t* a, int32_t p, unsigned int N)
-{  
-    unsigned int i; 
-    int32_t mask;
-
-    for (i = 0; i < N; i++) {
-        mask = a[i] >> (4*sizeof(int32_t) - 1);
-        a[i] += (p & mask) - p;
-        mask = a[i] >> (4*sizeof(int32_t) - 1);
-        a[i] += (p & mask);
-    }
-}
+/****************************************************************************************
+* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
+*
+*    Copyright (c) Microsoft Corporation. All rights reserved.
+*
+*
+* Abstract: NTT functions and other low-level operations
+*
+*****************************************************************************************/
+
+#include "../LatticeCrypto_priv.h"
+    
+
+void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N)
+{ // Thin C entry point: passes a, psi_rev and N unchanged to NTT_CT_std2rev_12289_asm, which is defined outside this file.
+    NTT_CT_std2rev_12289_asm(a, psi_rev, N);
+}
+
+
+void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N)
+{ // Thin C entry point: passes all five arguments unchanged, in the same order, to INTT_GS_rev2std_12289_asm (defined outside this file).
+    INTT_GS_rev2std_12289_asm(a, omegainv_rev, omegainv1N_rev, Ninv, N);
+}
+
+
+void two_reduce12289(int32_t* a, unsigned int N)
+{ // Thin C entry point: passes a and N unchanged to two_reduce12289_asm (defined outside this file).
+    two_reduce12289_asm(a, N);
+}
+
+
+void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N)
+{ // Thin C entry point: passes a, b, c and N unchanged to pmul_asm (defined outside this file); which pointer is the output is not visible here.
+    pmul_asm(a, b, c, N);
+}
+
+
+void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N)
+{ // Thin C entry point: passes a, b, c, d and N unchanged to pmuladd_asm (defined outside this file); which pointer is the output is not visible here.
+    pmuladd_asm(a, b, c, d, N);
+}
+
+
+void smul(int32_t* a, int32_t scalar, unsigned int N)
+{ // Multiplies each of the first N elements of a by scalar, in place.
+    unsigned int i; 
+
+    for (i = 0; i < N; i++) {
+        a[i] = a[i]*scalar;  // plain int32_t product; no modular reduction is applied in this function
+    }
+}
+
+
+void correction(int32_t* a, int32_t p, unsigned int N)
+{  // Branch-free in-place adjustment of the first N elements of a using p; each step's addend is selected by a shift-derived mask.
+    unsigned int i; 
+    int32_t mask;
+
+    for (i = 0; i < N; i++) {
+        mask = a[i] >> (4*sizeof(int32_t) - 1);  // NOTE(review): shift count is 4*sizeof(int32_t)-1 = 15 (for a 4-byte int32_t), not 31; mask is 0 / all ones by sign only while -2^15 <= a[i] < 2^15 and >> is arithmetic - confirm callers guarantee that range
+        a[i] += (p & mask) - p;                  // with such a sign mask: adds 0 when a[i] is negative, subtracts p otherwise
+        mask = a[i] >> (4*sizeof(int32_t) - 1);  // same shift-by-15 mask, recomputed on the updated value
+        a[i] += (p & mask);                      // with such a sign mask: adds p back when the value went negative
+    }
+}
diff --git a/dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S b/dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S
index e44c90dce0..9e8d89660a 100755
--- a/dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S
+++ b/dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S
@@ -1,979 +1,979 @@
-//****************************************************************************************
-// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
-//
-//    Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-// Abstract: NTT functions in x64 assembly using AVX2 vector instructions for Linux 
-//
-//****************************************************************************************  
-
-.intel_syntax noprefix 
-
-// Registers that are used for parameter passing:
-#define reg_p1  rdi
-#define reg_p2  rsi
-#define reg_p3  rdx
-#define reg_p4  rcx
-#define reg_p5  r8
-
-
-.text
-//***********************************************************************
-//  Forward NTT
-//  Operation: a [reg_p1] <- NTT(a) [reg_p1], 
-//             [reg_p2] points to table and 
-//             reg_p3 contains parameter n
-//*********************************************************************** 
-.global NTT_CT_std2rev_12289_asm
-NTT_CT_std2rev_12289_asm:
-  push       r12
-  push       r13
-  push       r14
-
-// Stages m=1 -> m=32
-  mov        r9, 1            // m = 1
-  mov        rax, reg_p3 
-  mov        r12, reg_p3      
-  shr        r12, 4           // n/16
-  vmovdqu    ymm14, MASK12x8
-  vmovdqu    ymm12, PERM0246
-  mov        r14, 16
-  mov        rcx, 11
-loop1:
-  shr        rax, 1           // k = k/2
-  dec        rcx 
-  xor        rdx, rdx         // i = 0
-loop2:
-  mov        r10, rdx
-  mov        r11, rax
-  dec        r11
-  shl        r10, cl          // j1
-  add        r11, r10         // j2
-  mov        r13, r9
-  add        r13, rdx         // m+i
-  vbroadcastss ymm11, DWORD PTR [reg_p2+4*r13]   // S
-
-loop3:
-  mov        r13, r10
-  add        r13, rax         // j+k
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r13]    // a[j+k]
-  vpmovsxdq  ymm3, XMMWORD PTR [reg_p1+4*r13+16] // a[j+k]
-  vpmovsxdq  ymm5, XMMWORD PTR [reg_p1+4*r13+32] // a[j+k]
-  vpmovsxdq  ymm7, XMMWORD PTR [reg_p1+4*r13+48] // a[j+k]
-  
-  vpmuldq    ymm1, ymm1, ymm11                   // a[j+k].S
-  vpmuldq    ymm3, ymm3, ymm11                   
-  vpmuldq    ymm5, ymm5, ymm11                   
-  vpmuldq    ymm7, ymm7, ymm11   
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]
-
-  vmovdqu    ymm13, ymm1
-  vpand      ymm1, ymm14, ymm1                   // c0
-  vpsrlq     ymm13, ymm13, 12                    // c1
-  vpslld     ymm15, ymm1, 1                      // 2*c0
-  vpsubd     ymm13, ymm1, ymm13                  // c0-c1
-  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
-  vpsubd     ymm1, ymm0, ymm13                   // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm13                   // a[j] = U + V   
-  vpermd     ymm1, ymm12, ymm1 
-  vpermd     ymm0, ymm12, ymm0 
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j]
-
-  vmovdqu    ymm13, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm13, ymm13, 12                    // c1
-  vpslld     ymm15, ymm3, 1                      // 2*c0
-  vpsubd     ymm13, ymm3, ymm13                  // c0-c1
-  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
-  vpsubd     ymm3, ymm2, ymm13                   // a[j+k] = U - V
-  vpaddd     ymm2, ymm2, ymm13                   // a[j] = U + V  
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13], xmm1 
-  vpermd     ymm3, ymm12, ymm3 
-  vpermd     ymm2, ymm12, ymm2 
-  vpmovsxdq  ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]
-
-  vmovdqu    ymm13, ymm5
-  vpand      ymm5, ymm14, ymm5                   // c0
-  vpsrlq     ymm13, ymm13, 12                    // c1
-  vpslld     ymm15, ymm5, 1                      // 2*c0
-  vpsubd     ymm13, ymm5, ymm13                  // c0-c1
-  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
-  vpsubd     ymm5, ymm4, ymm13                   // a[j+k] = U - V
-  vpaddd     ymm4, ymm4, ymm13                   // a[j] = U + V  
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13+16], xmm3 
-  vpermd     ymm5, ymm12, ymm5 
-  vpermd     ymm4, ymm12, ymm4 
-  vpmovsxdq  ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j]
-
-  vmovdqu    ymm13, ymm7
-  vpand      ymm7, ymm14, ymm7                   // c0
-  vpsrlq     ymm13, ymm13, 12                    // c1
-  vpslld     ymm15, ymm7, 1                      // 2*c0
-  vpsubd     ymm13, ymm7, ymm13                  // c0-c1
-  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
-  vpsubd     ymm7, ymm6, ymm13                   // a[j+k] = U - V
-  vpaddd     ymm6, ymm6, ymm13                   // a[j] = U + V 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm4
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13+32], xmm5  
-  vpermd     ymm6, ymm12, ymm6   
-  vpermd     ymm7, ymm12, ymm7 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13+48], xmm7
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm6
-  
-  add        r10, r14
-  cmp        r10, r11
-  jl         loop3
-  inc        rdx
-  cmp        rdx, r9
-  jl         loop2
-  shl        r9, 1
-  cmp        r9, r12
-  jl         loop1
-   
-// Stage m=64
-  xor        rdx, rdx         // i = 0
-  xor        r10, r10         // j1 = 0
-loop4:
-  vbroadcastss ymm11, DWORD PTR [reg_p2+4*rdx+4*64] // S
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+32] // a[j+k]
-  vpmovsxdq  ymm3, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j]
-  vpmuldq    ymm1, ymm1, ymm11                   // a[j+k].S
-  vpmuldq    ymm3, ymm3, ymm11                   // a[j+k].S
-
-  vmovdqu    ymm13, ymm1
-  vpand      ymm1, ymm14, ymm1                   // c0
-  vpsrlq     ymm13, ymm13, 12                    // c1
-  vpslld     ymm15, ymm1, 1                      // 2*c0
-  vpsubd     ymm13, ymm1, ymm13                  // c0-c1
-  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1 
-  
-  vmovdqu    ymm10, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm10, ymm10, 12                    // c1
-  vpslld     ymm15, ymm3, 1                      // 2*c0
-  vpsubd     ymm10, ymm3, ymm10                  // c0-c1
-  vpaddd     ymm10, ymm10, ymm15                 // V = 3*c0-c1    
-  
-  vpsubd     ymm1, ymm0, ymm13                   // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm13                   // a[j] = U + V    
-  vpsubd     ymm3, ymm2, ymm10                   // a[j+k] = U - V
-  vpaddd     ymm2, ymm2, ymm10                   // a[j] = U + V 
-  
-  vpermd     ymm0, ymm12, ymm0 
-  vpermd     ymm1, ymm12, ymm1 
-  vpermd     ymm2, ymm12, ymm2 
-  vpermd     ymm3, ymm12, ymm3 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm1
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm3
-  
-  add        r10, r14        // j+16 
-  inc        rdx             // i+1
-  cmp        rdx, r9
-  jl         loop4
-   
-// Stage m=128
-  shl        r9, 1
-  xor        rdx, rdx         // i = 0
-  xor        r10, r10         // j1 = 0
-  mov        r13, 8 
-loop6:
-  vbroadcastss ymm2, DWORD PTR [reg_p2+4*rdx+4*128] // S
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]
-  vpmuldq    ymm1, ymm1, ymm2                    // a[j+k].S
-  
-  vmovdqu    ymm3, ymm0
-  vpand      ymm0, ymm14, ymm0                   // c0
-  vpsrad     ymm3, ymm3, 12                      // c1
-  vpslld     ymm4, ymm0, 1                       // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                    // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                    // U = 3*c0-c1    
-  
-  vmovdqu    ymm3, ymm1
-  vpand      ymm1, ymm14, ymm1                   // c0
-  vpsrlq     ymm4, ymm3, 24                      // c2
-  vpsrad     ymm3, ymm3, 12                      // xc1
-  vpand      ymm3, ymm14, ymm3                   // c1
-  vpslld     ymm5, ymm1, 3                       // 8*c0
-  vpaddd     ymm4, ymm1, ymm4                    // c0+c2
-  vpaddd     ymm4, ymm4, ymm5                    // 9*c0+c2
-  vpslld     ymm5, ymm3, 1                       // 2*c1
-  vpaddd     ymm1, ymm0, ymm3                    // U+c1
-  vpsubd     ymm0, ymm0, ymm3                    // U-c1
-  vpsubd     ymm4, ymm4, ymm5                    // 9*c0-2*c1+c2
-  vpaddd     ymm0, ymm0, ymm4                    // U+(9*c0-3*c1+c2)
-  vpsubd     ymm1, ymm1, ymm4                    // U-(9*c0-3*c1+c2)
-  vpermd     ymm0, ymm12, ymm0 
-  vpermd     ymm1, ymm12, ymm1 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm1
-
-  add        r10, r13        // j+8
-  inc        rdx             // i+1
-  cmp        rdx, r9
-  jl         loop6
-
-// Stage m=256 
-  vmovdqu    ymm9, PERM02134657  
-  shl        r9, 1
-  xor        rdx, rdx         // i = 0
-  xor        r10, r10         // j1 = 0
-  mov        r14, 32
-loop7:
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256]    // S = psi[m+i]->psi[m+i+3]
-  vpermq     ymm8, ymm2, 0x50   
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]->a[j+3]
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]->a[j+k+3]
-  vpermq     ymm3, ymm0, 0x4e    
-  vinserti128 ymm0, ymm0, xmm1, 1                // U
-  vpblendd   ymm1, ymm1, ymm3, 15
-  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm4, ymm4, 12                      // c1
-  vpslld     ymm5, ymm3, 1                       // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
-  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
-  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
-  vpslldq    ymm1, ymm1, 4    
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vpermd     ymm0, ymm9, ymm0 
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
-  
-  vpermq     ymm8, ymm2, 0xfa   
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+3]
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]->a[j+k+3]
-  vpermq     ymm3, ymm0, 0x4e    
-  vinserti128 ymm0, ymm0, xmm1, 1                // U
-  vpblendd   ymm1, ymm1, ymm3, 15
-  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm4, ymm4, 12                      // c1
-  vpslld     ymm5, ymm3, 1                       // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
-  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
-  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
-  vpslldq    ymm1, ymm1, 4    
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vpermd     ymm0, ymm9, ymm0 
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10+32], ymm0
-
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256+16]  // S = psi[m+i]->psi[m+i+3] 
-  vpermq     ymm8, ymm2, 0x50   
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+3]
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+80] // a[j+k]->a[j+k+3]
-  vpermq     ymm3, ymm0, 0x4e    
-  vinserti128 ymm0, ymm0, xmm1, 1                // U
-  vpblendd   ymm1, ymm1, ymm3, 15
-  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm4, ymm4, 12                      // c1
-  vpslld     ymm5, ymm3, 1                       // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
-  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
-  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
-  vpslldq    ymm1, ymm1, 4    
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vpermd     ymm0, ymm9, ymm0 
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10+64], ymm0
-          
-  vpermq     ymm8, ymm2, 0xfa   
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10+96]  // U = a[j]->a[j+3]
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+112] // a[j+k]->a[j+k+3]
-  vpermq     ymm3, ymm0, 0x4e    
-  vinserti128 ymm0, ymm0, xmm1, 1                // U
-  vpblendd   ymm1, ymm1, ymm3, 15
-  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm4, ymm4, 12                      // c1
-  vpslld     ymm5, ymm3, 1                       // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
-  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
-  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
-  vpslldq    ymm1, ymm1, 4    
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vpermd     ymm0, ymm9, ymm0 
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10+96], ymm0
-         
-  add        r10, r14        // j+32
-  add        rdx, r13        // i+8
-  cmp        rdx, r9
-  jl         loop7
-
-// Stage m=512
-  vmovdqu    ymm9, PERM00224466
-  shl        r9, 1            // m = n/2 
-  xor        rdx, rdx         // i = 0
-  xor        r10, r10         // j1 = 0
-  mov        r14, 4
-loop8:
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*rdx+4*512] // S
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10]    // U = a[j]
-  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*r10+4]  // a[j+k]
-  vpmuldq    ymm3, ymm1, ymm2                    // a[j+k].S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                   // c0
-  vpsrlq     ymm4, ymm4, 12                      // c1
-  vpslld     ymm5, ymm3, 1                       // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
-  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
-  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
-  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
-  vpermd     ymm1, ymm9, ymm1 
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
-  
-  add        r10, r13        // j+8
-  add        rdx, r14        // i+4
-  cmp        rdx, r9
-  jl         loop8
-
-  pop        r14
-  pop        r13
-  pop        r12
-  ret
-
-
-//***********************************************************************
-//  Inverse NTT
-//  Operation: a [reg_p1] <- INTT(a) [reg_p1], 
-//             [reg_p2] points to table
-//             reg_p3 and reg_p4 point to constants for scaling and
-//             reg_p5 contains parameter n
-//*********************************************************************** 
-.global INTT_GS_rev2std_12289_asm
-INTT_GS_rev2std_12289_asm:
-  push       r12
-  push       r13
-  push       r14
-  push       r15
-  push       rbx
-
-// Stage m=1024
-  vmovdqu    ymm9, PERM00224466
-  vmovdqu    ymm14, MASK12x8  
-  mov        r12, reg_p5           
-  shr        r12, 1          // n/2 = 512
-  xor        r15, r15        // i = 0
-  xor        r10, r10        // j1 = 0
-  mov        r13, 8
-  mov        r14, 4
-loop1b:
-  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*r10+4]       // V = a[j+k]    
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10]         // U = a[j]
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*r15+4*512]   // S
-  vpsubd     ymm3, ymm0, ymm1                         // U - V
-  vpaddd     ymm0, ymm0, ymm1                         // U + V 
-  vpmuldq    ymm3, ymm3, ymm2                         // (U - V).S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm4, ymm4, 12                           // c1
-  vpslld     ymm5, ymm3, 1                            // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
-  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1 
-  vpermd     ymm1, ymm9, ymm1 
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
-
-  add        r10, r13        // j+8
-  add        r15, r14        // i+4
-  cmp        r15, r12
-  jl         loop1b
-  
-// Stage m=512 
-  vmovdqu    ymm9, PERM02134657
-  vmovdqu    ymm13, PERM0145
-  vmovdqu    ymm15, PERM2367   
-  shr        r12, 1          // n/4 = 256
-  xor        r15, r15        // i = 0
-  xor        r10, r10        // j1 = 0
-  mov        r14, 32
-loop2b:
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*r15+4*256]   // S = psi[m+i]->psi[m+i+3]
-  vpermq     ymm8, ymm2, 0x50   
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10]         // U = a[j]->a[j+7]
-  vpermd     ymm1, ymm15, ymm0 
-  vpermd     ymm0, ymm13, ymm0  
-  vpsubd     ymm3, ymm0, ymm1                         // U - V
-  vpaddd     ymm0, ymm0, ymm1                         // U + V 
-  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm4, ymm4, 12                           // c1
-  vpslld     ymm5, ymm3, 1                            // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
-  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
-  vpslldq    ymm1, ymm1, 4    
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vpermd     ymm0, ymm9, ymm0 
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
-  
-  vpermq     ymm8, ymm2, 0xfa   
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10+32]      // U = a[j]->a[j+7]
-  vpermd     ymm1, ymm15, ymm0 
-  vpermd     ymm0, ymm13, ymm0  
-  vpsubd     ymm3, ymm0, ymm1                         // U - V
-  vpaddd     ymm0, ymm0, ymm1                         // U + V 
-  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm4, ymm4, 12                           // c1
-  vpslld     ymm5, ymm3, 1                            // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
-  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
-  vpslldq    ymm1, ymm1, 4    
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vpermd     ymm0, ymm9, ymm0
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10+32], ymm0
-
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*r15+4*256+16]// S = psi[m+i]->psi[m+i+3] 
-  vpermq     ymm8, ymm2, 0x50   
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10+64]      // U = a[j]->a[j+7]
-  vpermd     ymm1, ymm15, ymm0 
-  vpermd     ymm0, ymm13, ymm0  
-  vpsubd     ymm3, ymm0, ymm1                         // U - V
-  vpaddd     ymm0, ymm0, ymm1                         // U + V 
-  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm4, ymm4, 12                           // c1
-  vpslld     ymm5, ymm3, 1                            // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
-  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
-  vpslldq    ymm1, ymm1, 4    
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vpermd     ymm0, ymm9, ymm0
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10+64], ymm0
-         
-  vpermq     ymm8, ymm2, 0xfa   
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10+96]      // U = a[j]->a[j+7]
-  vpermd     ymm1, ymm15, ymm0 
-  vpermd     ymm0, ymm13, ymm0  
-  vpsubd     ymm3, ymm0, ymm1                         // U - V
-  vpaddd     ymm0, ymm0, ymm1                         // U + V 
-  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm4, ymm4, 12                           // c1
-  vpslld     ymm5, ymm3, 1                            // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
-  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
-  vpslldq    ymm1, ymm1, 4    
-  vpblendd   ymm0, ymm0, ymm1, 0xaa
-  vpermd     ymm0, ymm9, ymm0
-  vmovdqu    YMMWORD PTR [reg_p1+4*r10+96], ymm0
-         
-  add        r10, r14        // j+32
-  add        r15, r13        // i+8
-  cmp        r15, r12
-  jl         loop2b
-     
-// Stage m=256 
-  vmovdqu    ymm12, PERM0246   
-  shr        r12, 1          // n/8 = 128
-  xor        r15, r15        // i = 0
-  xor        r10, r10        // j1 = 0
-loop3b:
-  vbroadcastss ymm2, DWORD PTR [reg_p2+4*r15+4*128]   // S
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+16]      // V = a[j+k]
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
-  vpsubd     ymm3, ymm0, ymm1                         // U - V
-  vpaddd     ymm0, ymm0, ymm1                         // U + V 
-  vpmuldq    ymm3, ymm3, ymm2                         // (U - V).S
-  vmovdqu    ymm4, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm4, ymm4, 12                           // c1
-  vpslld     ymm5, ymm3, 1                            // 2*c0
-  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
-  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1 
-  vpermd     ymm0, ymm12, ymm0 
-  vpermd     ymm1, ymm12, ymm1 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm1
-  
-  add        r10, r13        // j+8
-  inc        r15             // i+1
-  cmp        r15, r12
-  jl         loop3b
-     
-// Stage m=128
-  shr        r12, 1          // n/16 = 64
-  xor        r15, r15        // i = 0
-  xor        r10, r10        // j1 = 0
-  mov        r14, 16 
-loop4b:
-  vbroadcastss ymm11, DWORD PTR [reg_p2+4*r15+4*64]   // S
-  vpmovsxdq  ymm13, XMMWORD PTR [reg_p1+4*r10+32]     // V = a[j+k]
-  vpmovsxdq  ymm15, XMMWORD PTR [reg_p1+4*r10+48]     // V = a[j+k]
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16]      // U = a[j]
-  vpsubd     ymm1, ymm0, ymm13                        // U - V
-  vpaddd     ymm0, ymm0, ymm13                        // U + V 
-  vpsubd     ymm3, ymm2, ymm15                        // U - V
-  vpaddd     ymm2, ymm2, ymm15                        // U + V   
-  vpmuldq    ymm1, ymm1, ymm11                        // (U - V).S
-  vpmuldq    ymm3, ymm3, ymm11                        // (U - V).S
-  
-  vmovdqu    ymm13, ymm1
-  vpand      ymm1, ymm14, ymm1                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm1, 1                           // 2*c0
-  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
-  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1    
-
-  vmovdqu    ymm13, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm3, 1                           // 2*c0
-  vpsubd     ymm13, ymm3, ymm13                       // c0-c1
-  vpaddd     ymm3, ymm13, ymm15                       // 3*c0-c1 
-  
-  vpermd     ymm0, ymm12, ymm0 
-  vpermd     ymm1, ymm12, ymm1 
-  vpermd     ymm2, ymm12, ymm2 
-  vpermd     ymm3, ymm12, ymm3 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm1
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm3
-  
-  add        r10, r14        // j+16 
-  inc        r15             // i+1
-  cmp        r15, r12
-  jl         loop4b
-  
-// Stages m=64 -> m=4  
-  mov        r9, 5            // 5 iterations
-  mov        rax, 8 
-loop5b:
-  shl        rax, 1          // k = 2*k
-  shr        r12, 1          // m/2
-  xor        r15, r15        // i = 0
-  xor        r8, r8        
-loop6b:
-  mov        r10, r8         // Load j1
-  mov        r11, rax
-  dec        r11
-  add        r11, r10        // j2
-  mov        r13, r12
-  add        r13, r15        // m/2+i
-  vbroadcastss ymm9, DWORD PTR [reg_p2+4*r13]         // S
-  mov        rbx, 4
-
-loop7b:
-  mov        r13, r10
-  add        r13, rax         // j+k
-  vpmovsxdq  ymm10, XMMWORD PTR [reg_p1+4*r13]        // V = a[j+k]
-  vpmovsxdq  ymm11, XMMWORD PTR [reg_p1+4*r13+16]     // V = a[j+k]
-  vpmovsxdq  ymm13, XMMWORD PTR [reg_p1+4*r13+32]     // V = a[j+k]
-  vpmovsxdq  ymm15, XMMWORD PTR [reg_p1+4*r13+48]     // V = a[j+k]
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16]      // U = a[j]
-  vpmovsxdq  ymm4, XMMWORD PTR [reg_p1+4*r10+32]      // U = a[j]
-  vpmovsxdq  ymm6, XMMWORD PTR [reg_p1+4*r10+48]      // U = a[j]
-  
-  vpsubd     ymm1, ymm0, ymm10                        // U - V
-  vpaddd     ymm0, ymm0, ymm10                        // U + V 
-  vpsubd     ymm3, ymm2, ymm11                        // U - V
-  vpaddd     ymm2, ymm2, ymm11                        // U + V 
-  vpsubd     ymm5, ymm4, ymm13                        // U - V
-  vpaddd     ymm4, ymm4, ymm13                        // U + V 
-  vpsubd     ymm7, ymm6, ymm15                        // U - V
-  vpaddd     ymm6, ymm6, ymm15                        // U + V 
-
-  vpmuldq    ymm1, ymm1, ymm9                         // (U - V).S
-  vpmuldq    ymm3, ymm3, ymm9                   
-  vpmuldq    ymm5, ymm5, ymm9                   
-  vpmuldq    ymm7, ymm7, ymm9   
-  
-  vmovdqu    ymm13, ymm1
-  vpand      ymm1, ymm14, ymm1                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm1, 1                           // 2*c0
-  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
-  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1 
-
-  cmp        r9, rbx 
-  jne        skip1
-  vmovdqu    ymm13, ymm0
-  vpand      ymm0, ymm14, ymm0                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1       
-  vpslld     ymm15, ymm0, 1                           // 2*c0
-  vpsubd     ymm13, ymm0, ymm13                       // c0-c1
-  vpaddd     ymm0, ymm13, ymm15                       // 3*c0-c1
-
-  vmovdqu    ymm13, ymm1
-  vpand      ymm1, ymm14, ymm1                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm1, 1                           // 2*c0
-  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
-  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1
-skip1:
-  vpermd     ymm1, ymm12, ymm1 
-  vpermd     ymm0, ymm12, ymm0 
-
-  vmovdqu    ymm13, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm3, 1                           // 2*c0
-  vpsubd     ymm13, ymm3, ymm13                       // c0-c1
-  vpaddd     ymm3, ymm13, ymm15                       // 3*c0-c1 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13], xmm1 
-
-  cmp        r9, rbx 
-  jne        skip2
-  vmovdqu    ymm13, ymm2
-  vpand      ymm2, ymm14, ymm2                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1       
-  vpslld     ymm15, ymm2, 1                           // 2*c0
-  vpsubd     ymm13, ymm2, ymm13                       // c0-c1
-  vpaddd     ymm2, ymm13, ymm15                       // 3*c0-c1
-
-  vmovdqu    ymm13, ymm3
-  vpand      ymm3, ymm14, ymm3                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm3, 1                           // 2*c0
-  vpsubd     ymm13, ymm3, ymm13                       // c0-c1
-  vpaddd     ymm3, ymm13, ymm15                       // 3*c0-c1
-skip2:
-  vpermd     ymm3, ymm12, ymm3 
-  vpermd     ymm2, ymm12, ymm2 
-
-  vmovdqu    ymm13, ymm5
-  vpand      ymm5, ymm14, ymm5                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm5, 1                           // 2*c0
-  vpsubd     ymm13, ymm5, ymm13                       // c0-c1
-  vpaddd     ymm5, ymm13, ymm15                       // 3*c0-c1 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13+16], xmm3 
-
-  cmp        r9, rbx 
-  jne        skip3
-  vmovdqu    ymm13, ymm4
-  vpand      ymm4, ymm14, ymm4                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1       
-  vpslld     ymm15, ymm4, 1                           // 2*c0
-  vpsubd     ymm13, ymm4, ymm13                       // c0-c1
-  vpaddd     ymm4, ymm13, ymm15                       // 3*c0-c1
-
-  vmovdqu    ymm13, ymm5
-  vpand      ymm5, ymm14, ymm5                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm5, 1                           // 2*c0
-  vpsubd     ymm13, ymm5, ymm13                       // c0-c1
-  vpaddd     ymm5, ymm13, ymm15                       // 3*c0-c1
-skip3:
-  vpermd     ymm5, ymm12, ymm5 
-  vpermd     ymm4, ymm12, ymm4 
-
-  vmovdqu    ymm13, ymm7
-  vpand      ymm7, ymm14, ymm7                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm7, 1                           // 2*c0
-  vpsubd     ymm13, ymm7, ymm13                       // c0-c1
-  vpaddd     ymm7, ymm13, ymm15                       // 3*c0-c1 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm4
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13+32], xmm5  
-
-  cmp        r9, rbx 
-  jne        skip4
-  vmovdqu    ymm13, ymm6
-  vpand      ymm6, ymm14, ymm6                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1       
-  vpslld     ymm15, ymm6, 1                           // 2*c0
-  vpsubd     ymm13, ymm6, ymm13                       // c0-c1
-  vpaddd     ymm6, ymm13, ymm15                       // 3*c0-c1
-
-  vmovdqu    ymm13, ymm7
-  vpand      ymm7, ymm14, ymm7                        // c0
-  vpsrad     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm7, 1                           // 2*c0
-  vpsubd     ymm13, ymm7, ymm13                       // c0-c1
-  vpaddd     ymm7, ymm13, ymm15                       // 3*c0-c1
-skip4:
-  vpermd     ymm7, ymm12, ymm7 
-  vpermd     ymm6, ymm12, ymm6   
-  vmovdqu    XMMWORD PTR [reg_p1+4*r13+48], xmm7
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm6
-  
-  add        r10, r14
-  cmp        r10, r11
-  jl         loop7b
-  mov        rbx, rax
-  shl        rbx, 1          // 2*k
-  add        r8, rbx         // j1+2*k
-  inc        r15
-  cmp        r15, r12
-  jl         loop6b
-  dec        r9
-  jnz        loop5b
-       
-// Scaling step
-  shl        rax, 1          // k = 2*k = 512
-  xor        r10, r10        // j = 0
-  mov        r14, 4 
-  movq       xmm0, reg_p3
-  vbroadcastsd ymm10, xmm0                            // S = omegainv1N_rev
-  movq       xmm0, reg_p4
-  vbroadcastsd ymm11, xmm0                            // T = Ninv
-loop8b:
-  vpmovsxdq  ymm13, XMMWORD PTR [reg_p1+4*r10+4*512]  // V = a[j+k]
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
-  vpsubd     ymm1, ymm0, ymm13                        // U - V
-  vpaddd     ymm0, ymm0, ymm13                        // U + V  
-  vpmuldq    ymm1, ymm1, ymm10                        // (U - V).S
-  vpmuldq    ymm0, ymm0, ymm11                        // (U + V).T
-  
-  vmovdqu    ymm13, ymm0
-  vpand      ymm0, ymm14, ymm0                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm0, 1                           // 2*c0
-  vpsubd     ymm13, ymm0, ymm13                       // c0-c1
-  vpaddd     ymm0, ymm13, ymm15                       // 3*c0-c1    
-
-  vmovdqu    ymm13, ymm1
-  vpand      ymm1, ymm14, ymm1                        // c0
-  vpsrlq     ymm13, ymm13, 12                         // c1
-  vpslld     ymm15, ymm1, 1                           // 2*c0
-  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
-  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1 
-  
-  vpermd     ymm0, ymm12, ymm0 
-  vpermd     ymm1, ymm12, ymm1 
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
-  vmovdqu    XMMWORD PTR [reg_p1+4*r10+4*512], xmm1
-  
-  add        r10, r14        // j+4 
-  cmp        r10, rax
-  jl         loop8b  
-loop9b:
-  pop        rbx
-  pop        r15
-  pop        r14
-  pop        r13
-  pop        r12
-  ret
-
-
-//***********************************************************************
-//  Component-wise multiplication and addition
-//  Operation: d [reg_p4] <- a [reg_p1] * b [reg_p2] + c [reg_p3]
-//             reg_p5 contains parameter n
-//*********************************************************************** 
-.global pmuladd_asm
-pmuladd_asm:
-  vmovdqu    ymm5, PERM0246
-  vmovdqu    ymm6, MASK12x8 
-  xor        rax, rax
-  movq       r11, 4
-lazo2:
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*rax]   // a
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p2+4*rax]   // b
-  vpmovsxdq  ymm2, XMMWORD PTR [reg_p3+4*rax]   // c
-  vpmuldq    ymm0, ymm1, ymm0 
-  vpaddq     ymm0, ymm2, ymm0                    
-
-  vmovdqu    ymm3, ymm0
-  vpand      ymm0, ymm6, ymm0                   // c0
-  vpsrlq     ymm3, ymm3, 12                     // c1
-  vpslld     ymm4, ymm0, 1                      // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1 
-
-  vmovdqu    ymm3, ymm0
-  vpand      ymm0, ymm6, ymm0                   // c0
-  vpsrad     ymm3, ymm3, 12                     // c1       
-  vpslld     ymm4, ymm0, 1                      // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1
-
-  vpermd     ymm0, ymm5, ymm0 
-  vmovdqu    XMMWORD PTR [reg_p4+4*rax], xmm0
-
-  add        rax, r11                           // j+4
-  cmp        rax, reg_p5
-  jl         lazo2
-  ret
-
-
-//***********************************************************************
-//  Component-wise multiplication
-//  Operation: c [reg_p3] <- a [reg_p1] * b [reg_p2]
-//             reg_p4 contains parameter n
-//*********************************************************************** 
-.global pmul_asm
-pmul_asm: 
-  vmovdqu    ymm5, PERM0246
-  vmovdqu    ymm6, MASK12x8 
-  xor        rax, rax
-  movq       r11, 4
-lazo3:
-  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*rax]   // a
-  vpmovsxdq  ymm1, XMMWORD PTR [reg_p2+4*rax]   // b
-  vpmuldq    ymm0, ymm1, ymm0                    
-
-  vmovdqu    ymm3, ymm0
-  vpand      ymm0, ymm6, ymm0                   // c0
-  vpsrlq     ymm3, ymm3, 12                     // c1
-  vpslld     ymm4, ymm0, 1                      // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1 
-
-  vmovdqu    ymm3, ymm0
-  vpand      ymm0, ymm6, ymm0                   // c0
-  vpsrad     ymm3, ymm3, 12                     // c1       
-  vpslld     ymm4, ymm0, 1                      // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1
-
-  vpermd     ymm0, ymm5, ymm0 
-  vmovdqu    XMMWORD PTR [reg_p3+4*rax], xmm0
-
-  add        rax, r11                           // j+4
-  cmp        rax, reg_p4
-  jl         lazo3
-  ret
-
-
-//***********************************************************************
-//  Two consecutive reductions
-//  Operation: c [reg_p1] <- a [reg_p1]
-//             reg_p2 contains parameter n
-//*********************************************************************** 
-.global two_reduce12289_asm
-two_reduce12289_asm: 
-  vmovdqu    ymm6, MASK12x8 
-  vmovdqu    ymm7, PRIME8x
-  xor        rax, rax
-  movq       r11, 8
-lazo4:
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]   // a
-
-  vmovdqu    ymm3, ymm0
-  vpand      ymm0, ymm6, ymm0                   // c0
-  vpsrad     ymm3, ymm3, 12                     // c1
-  vpslld     ymm4, ymm0, 1                      // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1 
-
-  vmovdqu    ymm3, ymm0
-  vpand      ymm0, ymm6, ymm0                   // c0
-  vpsrad     ymm3, ymm3, 12                     // c1       
-  vpslld     ymm4, ymm0, 1                      // 2*c0
-  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
-  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1
-
-  vpsrad     ymm2, ymm0, 31
-  vpand      ymm2, ymm7, ymm2
-  vpaddd     ymm2, ymm0, ymm2
-  vpsubd     ymm0, ymm2, ymm7
-
-  vpsrad     ymm2, ymm0, 31
-  vpand      ymm2, ymm7, ymm2
-  vpaddd     ymm0, ymm0, ymm2
-
-  vmovdqu    YMMWORD PTR [reg_p1+4*rax], ymm0
-
-  add        rax, r11                           // j+8
-  cmp        rax, reg_p2
-  jl         lazo4
-  ret
-
-
-//***********************************************************************
-//  Encoding
-//  Operation: c [reg_p2] <- a [reg_p1]
-//*********************************************************************** 
-.global encode_asm
-encode_asm: 
-  vmovdqu    ymm6, MASK32 
-  vmovdqu    ymm7, MASK42
-  mov        r9, 1024
-  xor        rax, rax
-  xor        r10, r10
-  mov        r11, 14
-  mov        rcx, 8
-lazo5:
-  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]   // a
-
-  vpsrlq     ymm1, ymm0, 18  
-  vpsllq     ymm2, ymm0, 4
-  vpand      ymm0, ymm0, ymm6
-  vpsrldq    ymm2, ymm2, 5   
-  vpsrlq     ymm3, ymm1, 4
-  vpand      ymm1, ymm1, ymm6
-  vpand      ymm2, ymm2, ymm7
-  vpsrldq    ymm3, ymm3, 4 
-  vpor       ymm0, ymm0, ymm1
-  vpor       ymm0, ymm0, ymm2 
-  vpor       ymm0, ymm0, ymm3 
-  vpermq     ymm1, ymm0, 0x0e   
-
-  vmovdqu    XMMWORD PTR [reg_p2+r10], xmm0
-  vmovdqu    XMMWORD PTR [reg_p2+r10+7], xmm1
-
-  add        r10, r11
-  add        rax, rcx        // j+8
-  cmp        rax, r9
-  jl         lazo5
-  ret
-
-
-//***********************************************************************
-//  Decoding
-//  Operation: c [reg_p2] <- a [reg_p1]
-//*********************************************************************** 
-.global decode_asm
-decode_asm: 
-  vmovdqu    ymm6, MASK14_1 
-  vmovdqu    ymm7, MASK14_2
-  vmovdqu    ymm8, MASK14_3
-  vmovdqu    ymm9, MASK14_4
-  mov        r9, 1024
-  xor        rax, rax
-  xor        r10, r10
-  mov        r11, 14
-  mov        rcx, 8
-lazo6:
-  vmovdqu    xmm0, XMMWORD PTR [reg_p1+r10]
-  vmovdqu    xmm1, XMMWORD PTR [reg_p1+r10+7]
-  vinserti128 ymm0, ymm0, xmm1, 1               
-
-  vpand      ymm1, ymm0, ymm6
-  vpand      ymm2, ymm0, ymm7
-  vpand      ymm3, ymm0, ymm8
-  vpand      ymm4, ymm0, ymm9
-   
-  vpsllq     ymm2, ymm2, 18 
-  vpsllq     ymm3, ymm3, 4
-  vpslldq    ymm3, ymm3, 4 
-  vpsrlq     ymm4, ymm4, 2
-  vpslldq    ymm4, ymm4, 7
-
-  vpor       ymm1, ymm1, ymm2 
-  vpor       ymm1, ymm1, ymm3 
-  vpor       ymm1, ymm1, ymm4 
-  
-  vmovdqu    YMMWORD PTR [reg_p2+4*rax], ymm1   
-
-  add        r10, r11
-  add        rax, rcx            // j+8
-  cmp        rax, r9
-  jl         lazo6
+//****************************************************************************************
+// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
+//
+//    Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+// Abstract: NTT functions in x64 assembly using AVX2 vector instructions for Linux 
+//
+//****************************************************************************************  
+
+.intel_syntax noprefix 
+
+// Registers that are used for parameter passing:
+#define reg_p1  rdi
+#define reg_p2  rsi
+#define reg_p3  rdx
+#define reg_p4  rcx
+#define reg_p5  r8
+
+
+.text
+//***********************************************************************
+//  Forward NTT
+//  Operation: a [reg_p1] <- NTT(a) [reg_p1], 
+//             [reg_p2] points to table and 
+//             reg_p3 contains parameter n
+//*********************************************************************** 
+.global NTT_CT_std2rev_12289_asm
+NTT_CT_std2rev_12289_asm:
+  push       r12              // r12-r14 serve as scratch below; popped again before ret
+  push       r13
+  push       r14
+
+// Stages m=1 -> m=32
+  mov        r9, 1            // m = 1
+  mov        rax, reg_p3      // k = n (halved at the top of every stage)
+  mov        r12, reg_p3      // r12 = n
+  shr        r12, 4           // n/16: loop1 repeats while m < n/16
+  vmovdqu    ymm14, MASK12x8  // AND-mask that isolates c0 in each reduction below
+  vmovdqu    ymm12, PERM0246  // vpermd index vector applied before each 128-bit store
+  mov        r14, 16          // j step for loop3 and loop4
+  mov        rcx, 11          // cl = shift count used to form j1 = i << cl
+loop1:
+  shr        rax, 1           // k = k/2
+  dec        rcx              // one bit less of shift per stage
+  xor        rdx, rdx         // i = 0 (rdx is reg_p3; n was already copied to rax/r12)
+loop2:
+  mov        r10, rdx
+  mov        r11, rax
+  dec        r11              // k-1
+  shl        r10, cl          // j1 = i << cl
+  add        r11, r10         // j2 = j1 + k - 1
+  mov        r13, r9
+  add        r13, rdx         // m+i
+  vbroadcastss ymm11, DWORD PTR [reg_p2+4*r13]   // S = table[m+i], replicated to every dword
+
+loop3:
+  mov        r13, r10
+  add        r13, rax         // j+k
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r13]    // a[j+k]
+  vpmovsxdq  ymm3, XMMWORD PTR [reg_p1+4*r13+16] // a[j+k]
+  vpmovsxdq  ymm5, XMMWORD PTR [reg_p1+4*r13+32] // a[j+k]
+  vpmovsxdq  ymm7, XMMWORD PTR [reg_p1+4*r13+48] // a[j+k]
+  
+  vpmuldq    ymm1, ymm1, ymm11                   // a[j+k].S
+  vpmuldq    ymm3, ymm3, ymm11                   
+  vpmuldq    ymm5, ymm5, ymm11                   
+  vpmuldq    ymm7, ymm7, ymm11   
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]
+
+  vmovdqu    ymm13, ymm1                         // keep a copy of the 64-bit product
+  vpand      ymm1, ymm14, ymm1                   // c0
+  vpsrlq     ymm13, ymm13, 12                    // c1 = product >> 12 (logical, per qword)
+  vpslld     ymm15, ymm1, 1                      // 2*c0
+  vpsubd     ymm13, ymm1, ymm13                  // c0-c1
+  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
+  vpsubd     ymm1, ymm0, ymm13                   // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm13                   // a[j] = U + V   
+  vpermd     ymm1, ymm12, ymm1                   // reorder via PERM0246 so the xmm store below holds the results
+  vpermd     ymm0, ymm12, ymm0 
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j]
+
+  vmovdqu    ymm13, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm13, ymm13, 12                    // c1
+  vpslld     ymm15, ymm3, 1                      // 2*c0
+  vpsubd     ymm13, ymm3, ymm13                  // c0-c1
+  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
+  vpsubd     ymm3, ymm2, ymm13                   // a[j+k] = U - V
+  vpaddd     ymm2, ymm2, ymm13                   // a[j] = U + V  
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0    // write back first group: a[j]
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13], xmm1    // write back first group: a[j+k]
+  vpermd     ymm3, ymm12, ymm3 
+  vpermd     ymm2, ymm12, ymm2 
+  vpmovsxdq  ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]
+
+  vmovdqu    ymm13, ymm5
+  vpand      ymm5, ymm14, ymm5                   // c0
+  vpsrlq     ymm13, ymm13, 12                    // c1
+  vpslld     ymm15, ymm5, 1                      // 2*c0
+  vpsubd     ymm13, ymm5, ymm13                  // c0-c1
+  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
+  vpsubd     ymm5, ymm4, ymm13                   // a[j+k] = U - V
+  vpaddd     ymm4, ymm4, ymm13                   // a[j] = U + V  
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2 // write back second group
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13+16], xmm3 
+  vpermd     ymm5, ymm12, ymm5 
+  vpermd     ymm4, ymm12, ymm4 
+  vpmovsxdq  ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j]
+
+  vmovdqu    ymm13, ymm7
+  vpand      ymm7, ymm14, ymm7                   // c0
+  vpsrlq     ymm13, ymm13, 12                    // c1
+  vpslld     ymm15, ymm7, 1                      // 2*c0
+  vpsubd     ymm13, ymm7, ymm13                  // c0-c1
+  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1    
+  vpsubd     ymm7, ymm6, ymm13                   // a[j+k] = U - V
+  vpaddd     ymm6, ymm6, ymm13                   // a[j] = U + V 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm4 // write back third group
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13+32], xmm5  
+  vpermd     ymm6, ymm12, ymm6   
+  vpermd     ymm7, ymm12, ymm7 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13+48], xmm7 // write back fourth group
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm6
+  
+  add        r10, r14         // j += 16
+  cmp        r10, r11
+  jl         loop3            // while j < j2
+  inc        rdx              // i++
+  cmp        rdx, r9
+  jl         loop2            // while i < m
+  shl        r9, 1            // m = 2*m
+  cmp        r9, r12
+  jl         loop1            // while m < n/16
+   
+// Stage m=64
+  xor        rdx, rdx         // i = 0
+  xor        r10, r10         // j1 = 0
+loop4:
+  vbroadcastss ymm11, DWORD PTR [reg_p2+4*rdx+4*64] // S = table[64+i]
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+32] // a[j+k]
+  vpmovsxdq  ymm3, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j]
+  vpmuldq    ymm1, ymm1, ymm11                   // a[j+k].S
+  vpmuldq    ymm3, ymm3, ymm11                   // a[j+k].S
+
+  vmovdqu    ymm13, ymm1
+  vpand      ymm1, ymm14, ymm1                   // c0
+  vpsrlq     ymm13, ymm13, 12                    // c1
+  vpslld     ymm15, ymm1, 1                      // 2*c0
+  vpsubd     ymm13, ymm1, ymm13                  // c0-c1
+  vpaddd     ymm13, ymm13, ymm15                 // V = 3*c0-c1 
+  
+  vmovdqu    ymm10, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm10, ymm10, 12                    // c1
+  vpslld     ymm15, ymm3, 1                      // 2*c0
+  vpsubd     ymm10, ymm3, ymm10                  // c0-c1
+  vpaddd     ymm10, ymm10, ymm15                 // V = 3*c0-c1    
+  
+  vpsubd     ymm1, ymm0, ymm13                   // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm13                   // a[j] = U + V    
+  vpsubd     ymm3, ymm2, ymm10                   // a[j+k] = U - V
+  vpaddd     ymm2, ymm2, ymm10                   // a[j] = U + V 
+  
+  vpermd     ymm0, ymm12, ymm0 
+  vpermd     ymm1, ymm12, ymm1 
+  vpermd     ymm2, ymm12, ymm2 
+  vpermd     ymm3, ymm12, ymm3 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm1
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm3
+  
+  add        r10, r14        // j+16 
+  inc        rdx             // i+1
+  cmp        rdx, r9
+  jl         loop4           // while i < m
+   
+// Stage m=128
+  shl        r9, 1            // m = 2*m
+  xor        rdx, rdx         // i = 0
+  xor        r10, r10         // j1 = 0
+  mov        r13, 8           // step of 8: j here, i in loop7, j in loop8
+loop6:
+  vbroadcastss ymm2, DWORD PTR [reg_p2+4*rdx+4*128] // S = table[128+i]
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]
+  vpmuldq    ymm1, ymm1, ymm2                    // a[j+k].S
+  
+  vmovdqu    ymm3, ymm0                          // this stage also reduces U itself
+  vpand      ymm0, ymm14, ymm0                   // c0
+  vpsrad     ymm3, ymm3, 12                      // c1 (arithmetic shift, per dword)
+  vpslld     ymm4, ymm0, 1                       // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                    // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                    // U = 3*c0-c1    
+  
+  vmovdqu    ymm3, ymm1                          // product is split into c0, c1, c2 here
+  vpand      ymm1, ymm14, ymm1                   // c0
+  vpsrlq     ymm4, ymm3, 24                      // c2 = product >> 24
+  vpsrad     ymm3, ymm3, 12                      // xc1
+  vpand      ymm3, ymm14, ymm3                   // c1
+  vpslld     ymm5, ymm1, 3                       // 8*c0
+  vpaddd     ymm4, ymm1, ymm4                    // c0+c2
+  vpaddd     ymm4, ymm4, ymm5                    // 9*c0+c2
+  vpslld     ymm5, ymm3, 1                       // 2*c1
+  vpaddd     ymm1, ymm0, ymm3                    // U+c1
+  vpsubd     ymm0, ymm0, ymm3                    // U-c1
+  vpsubd     ymm4, ymm4, ymm5                    // 9*c0-2*c1+c2
+  vpaddd     ymm0, ymm0, ymm4                    // U+(9*c0-3*c1+c2)
+  vpsubd     ymm1, ymm1, ymm4                    // U-(9*c0-3*c1+c2)
+  vpermd     ymm0, ymm12, ymm0 
+  vpermd     ymm1, ymm12, ymm1 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm1
+
+  add        r10, r13        // j+8
+  inc        rdx             // i+1
+  cmp        rdx, r9
+  jl         loop6           // while i < m
+
+// Stage m=256 
+  vmovdqu    ymm9, PERM02134657  // vpermd index vector applied before each 256-bit store
+  shl        r9, 1            // m = 2*m
+  xor        rdx, rdx         // i = 0
+  xor        r10, r10         // j1 = 0
+  mov        r14, 32          // j step: each pass handles four 8-element groups
+loop7:
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256]    // S = psi[m+i]->psi[m+i+3]
+  vpermq     ymm8, ymm2, 0x50                    // {S0,S0,S1,S1}
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]    // U = a[j]->a[j+3]
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]->a[j+k+3]
+  vpermq     ymm3, ymm0, 0x4e                    // swap the 128-bit halves of the first load
+  vinserti128 ymm0, ymm0, xmm1, 1                // U = {a[j],a[j+1],a[j+4],a[j+5]}
+  vpblendd   ymm1, ymm1, ymm3, 15                // {a[j+2],a[j+3],a[j+6],a[j+7]}
+  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm4, ymm4, 12                      // c1
+  vpslld     ymm5, ymm3, 1                       // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
+  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
+  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
+  vpslldq    ymm1, ymm1, 4                       // move the U-V results into the odd dwords
+  vpblendd   ymm0, ymm0, ymm1, 0xaa              // even dwords: U+V, odd dwords: U-V
+  vpermd     ymm0, ymm9, ymm0                    // reorder with PERM02134657 before the store
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
+  
+  vpermq     ymm8, ymm2, 0xfa                    // {S2,S2,S3,S3}; same steps for the next 8 elements
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+3]
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]->a[j+k+3]
+  vpermq     ymm3, ymm0, 0x4e    
+  vinserti128 ymm0, ymm0, xmm1, 1                // U
+  vpblendd   ymm1, ymm1, ymm3, 15
+  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm4, ymm4, 12                      // c1
+  vpslld     ymm5, ymm3, 1                       // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
+  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
+  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0 
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10+32], ymm0
+
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256+16]  // S = psi[m+i]->psi[m+i+3] 
+  vpermq     ymm8, ymm2, 0x50                    // first two entries of this second table load, each doubled
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+3]
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+80] // a[j+k]->a[j+k+3]
+  vpermq     ymm3, ymm0, 0x4e    
+  vinserti128 ymm0, ymm0, xmm1, 1                // U
+  vpblendd   ymm1, ymm1, ymm3, 15
+  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm4, ymm4, 12                      // c1
+  vpslld     ymm5, ymm3, 1                       // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
+  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
+  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0 
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10+64], ymm0
+          
+  vpermq     ymm8, ymm2, 0xfa                    // last two entries of the second table load, each doubled
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10+96]  // U = a[j]->a[j+3]
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+112] // a[j+k]->a[j+k+3]
+  vpermq     ymm3, ymm0, 0x4e    
+  vinserti128 ymm0, ymm0, xmm1, 1                // U
+  vpblendd   ymm1, ymm1, ymm3, 15
+  vpmuldq    ymm3, ymm1, ymm8                    // a[j+k].S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm4, ymm4, 12                      // c1
+  vpslld     ymm5, ymm3, 1                       // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
+  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
+  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0 
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10+96], ymm0
+         
+  add        r10, r14        // j+32
+  add        rdx, r13        // i+8
+  cmp        rdx, r9
+  jl         loop7           // while i < m
+
+// Stage m=512
+  vmovdqu    ymm9, PERM00224466  // vpermd index vector used to place U-V next to U+V
+  shl        r9, 1            // m = n/2 
+  xor        rdx, rdx         // i = 0
+  xor        r10, r10         // j1 = 0
+  mov        r14, 4           // i step (four table entries per pass)
+loop8:
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*rdx+4*512] // S
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10]    // U = a[j]
+  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*r10+4]  // a[j+k], k=1. NOTE(review): this 32-byte load reaches a[j+8]; on the last pass that appears to be a[n] - confirm the buffer has slack
+  vpmuldq    ymm3, ymm1, ymm2                    // a[j+k].S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                   // c0
+  vpsrlq     ymm4, ymm4, 12                      // c1
+  vpslld     ymm5, ymm3, 1                       // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                    // c0-c1
+  vpaddd     ymm4, ymm4, ymm5                    // V = 3*c0-c1     
+  vpsubd     ymm1, ymm0, ymm4                    // a[j+k] = U - V
+  vpaddd     ymm0, ymm0, ymm4                    // a[j] = U + V 
+  vpermd     ymm1, ymm9, ymm1                    // reorder U-V with PERM00224466 ahead of the blend
+  vpblendd   ymm0, ymm0, ymm1, 0xaa              // even dwords from U+V, odd dwords from permuted U-V
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
+  
+  add        r10, r13        // j+8
+  add        rdx, r14        // i+4
+  cmp        rdx, r9
+  jl         loop8           // while i < m
+
+  pop        r14              // restore in reverse order of the pushes
+  pop        r13
+  pop        r12
+  ret                         // NOTE(review): no vzeroupper before returning - confirm callers tolerate dirty upper YMM state
+
+
+//***********************************************************************
+//  Inverse NTT
+//  Operation: a [reg_p1] <- INTT(a) [reg_p1], 
+//             [reg_p2] points to table
+//             reg_p3 and reg_p4 hold the two scaling constants (passed by
+//             value, see "Scaling step"); reg_p5 contains parameter n
+//*********************************************************************** 
+.global INTT_GS_rev2std_12289_asm
+INTT_GS_rev2std_12289_asm:
+  push       r12             // r12-r15 and rbx are restored before ret
+  push       r13
+  push       r14
+  push       r15
+  push       rbx
+
+// Stage m=1024
+  vmovdqu    ymm9, PERM00224466
+  vmovdqu    ymm14, MASK12x8 // reduction mask; ymm14 is never overwritten below
+  mov        r12, reg_p5     // n
+  shr        r12, 1          // n/2 = 512
+  xor        r15, r15        // i = 0
+  xor        r10, r10        // j1 = 0
+  mov        r13, 8          // step for j
+  mov        r14, 4          // step for i
+loop1b:
+  vmovdqu    ymm1, YMMWORD PTR [reg_p1+4*r10+4]       // V = a[j+k]    
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10]         // U = a[j]
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*r15+4*512]   // S
+  vpsubd     ymm3, ymm0, ymm1                         // U - V
+  vpaddd     ymm0, ymm0, ymm1                         // U + V 
+  vpmuldq    ymm3, ymm3, ymm2                         // (U - V).S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm4, ymm4, 12                           // c1
+  vpslld     ymm5, ymm3, 1                            // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
+  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1 
+  vpermd     ymm1, ymm9, ymm1 
+  vpblendd   ymm0, ymm0, ymm1, 0xaa                   // odd dwords from ymm1, even from ymm0
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
+
+  add        r10, r13        // j+8
+  add        r15, r14        // i+4
+  cmp        r15, r12
+  jl         loop1b
+  
+// Stage m=512 
+  vmovdqu    ymm9, PERM02134657
+  vmovdqu    ymm13, PERM0145
+  vmovdqu    ymm15, PERM2367   
+  shr        r12, 1          // n/4 = 256
+  xor        r15, r15        // i = 0
+  xor        r10, r10        // j1 = 0
+  mov        r14, 32         // step for j (r13 = 8 is reused as the step for i)
+loop2b:
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*r15+4*256]   // S = psi[m+i]->psi[m+i+3]
+  vpermq     ymm8, ymm2, 0x50                         // qwords 0,0,1,1 of S
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10]         // U = a[j]->a[j+7]
+  vpermd     ymm1, ymm15, ymm0 
+  vpermd     ymm0, ymm13, ymm0  
+  vpsubd     ymm3, ymm0, ymm1                         // U - V
+  vpaddd     ymm0, ymm0, ymm1                         // U + V 
+  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm4, ymm4, 12                           // c1
+  vpslld     ymm5, ymm3, 1                            // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
+  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0 
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10], ymm0
+  
+  vpermq     ymm8, ymm2, 0xfa                         // qwords 2,2,3,3 of S
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10+32]      // U = a[j+8]->a[j+15]
+  vpermd     ymm1, ymm15, ymm0 
+  vpermd     ymm0, ymm13, ymm0  
+  vpsubd     ymm3, ymm0, ymm1                         // U - V
+  vpaddd     ymm0, ymm0, ymm1                         // U + V 
+  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm4, ymm4, 12                           // c1
+  vpslld     ymm5, ymm3, 1                            // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
+  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10+32], ymm0
+
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p2+4*r15+4*256+16]// S = psi[m+i+4]->psi[m+i+7]
+  vpermq     ymm8, ymm2, 0x50   
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10+64]      // U = a[j+16]->a[j+23]
+  vpermd     ymm1, ymm15, ymm0 
+  vpermd     ymm0, ymm13, ymm0  
+  vpsubd     ymm3, ymm0, ymm1                         // U - V
+  vpaddd     ymm0, ymm0, ymm1                         // U + V 
+  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm4, ymm4, 12                           // c1
+  vpslld     ymm5, ymm3, 1                            // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
+  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10+64], ymm0
+         
+  vpermq     ymm8, ymm2, 0xfa   
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*r10+96]      // U = a[j+24]->a[j+31]
+  vpermd     ymm1, ymm15, ymm0 
+  vpermd     ymm0, ymm13, ymm0  
+  vpsubd     ymm3, ymm0, ymm1                         // U - V
+  vpaddd     ymm0, ymm0, ymm1                         // U + V 
+  vpmuldq    ymm3, ymm3, ymm8                         // (U - V).S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm4, ymm4, 12                           // c1
+  vpslld     ymm5, ymm3, 1                            // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
+  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1
+  vpslldq    ymm1, ymm1, 4    
+  vpblendd   ymm0, ymm0, ymm1, 0xaa
+  vpermd     ymm0, ymm9, ymm0
+  vmovdqu    YMMWORD PTR [reg_p1+4*r10+96], ymm0
+         
+  add        r10, r14        // j+32
+  add        r15, r13        // i+8
+  cmp        r15, r12
+  jl         loop2b
+     
+// Stage m=256 
+  vmovdqu    ymm12, PERM0246 // kept in ymm12 through the scaling step
+  shr        r12, 1          // n/8 = 128
+  xor        r15, r15        // i = 0
+  xor        r10, r10        // j1 = 0
+loop3b:
+  vbroadcastss ymm2, DWORD PTR [reg_p2+4*r15+4*128]   // S
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p1+4*r10+16]      // V = a[j+k]
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
+  vpsubd     ymm3, ymm0, ymm1                         // U - V
+  vpaddd     ymm0, ymm0, ymm1                         // U + V 
+  vpmuldq    ymm3, ymm3, ymm2                         // (U - V).S
+  vmovdqu    ymm4, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm4, ymm4, 12                           // c1
+  vpslld     ymm5, ymm3, 1                            // 2*c0
+  vpsubd     ymm4, ymm3, ymm4                         // c0-c1
+  vpaddd     ymm1, ymm4, ymm5                         // 3*c0-c1 
+  vpermd     ymm0, ymm12, ymm0                        // permute so the low 128 bits hold the results
+  vpermd     ymm1, ymm12, ymm1 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm1
+  
+  add        r10, r13        // j+8
+  inc        r15             // i+1
+  cmp        r15, r12
+  jl         loop3b
+     
+// Stage m=128
+  shr        r12, 1          // n/16 = 64
+  xor        r15, r15        // i = 0
+  xor        r10, r10        // j1 = 0
+  mov        r14, 16         // step for j (also used by loop7b)
+loop4b:
+  vbroadcastss ymm11, DWORD PTR [reg_p2+4*r15+4*64]   // S
+  vpmovsxdq  ymm13, XMMWORD PTR [reg_p1+4*r10+32]     // V = a[j+k]
+  vpmovsxdq  ymm15, XMMWORD PTR [reg_p1+4*r10+48]     // V = a[j+k]
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16]      // U = a[j]
+  vpsubd     ymm1, ymm0, ymm13                        // U - V
+  vpaddd     ymm0, ymm0, ymm13                        // U + V 
+  vpsubd     ymm3, ymm2, ymm15                        // U - V
+  vpaddd     ymm2, ymm2, ymm15                        // U + V   
+  vpmuldq    ymm1, ymm1, ymm11                        // (U - V).S
+  vpmuldq    ymm3, ymm3, ymm11                        // (U - V).S
+  
+  vmovdqu    ymm13, ymm1
+  vpand      ymm1, ymm14, ymm1                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm1, 1                           // 2*c0
+  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
+  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1    
+
+  vmovdqu    ymm13, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm3, 1                           // 2*c0
+  vpsubd     ymm13, ymm3, ymm13                       // c0-c1
+  vpaddd     ymm3, ymm13, ymm15                       // 3*c0-c1 
+  
+  vpermd     ymm0, ymm12, ymm0 
+  vpermd     ymm1, ymm12, ymm1 
+  vpermd     ymm2, ymm12, ymm2 
+  vpermd     ymm3, ymm12, ymm3 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm1
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm3
+  
+  add        r10, r14        // j+16 
+  inc        r15             // i+1
+  cmp        r15, r12
+  jl         loop4b
+  
+// Stages m=64 -> m=4  
+  mov        r9, 5            // 5 iterations
+  mov        rax, 8           // k, doubled at the top of every pass
+loop5b:
+  shl        rax, 1          // k = 2*k
+  shr        r12, 1          // m/2
+  xor        r15, r15        // i = 0
+  xor        r8, r8          // j1 = 0
+loop6b:
+  mov        r10, r8         // Load j1
+  mov        r11, rax
+  dec        r11
+  add        r11, r10        // j2
+  mov        r13, r12        // r13 is scratch from here on (no longer the constant 8)
+  add        r13, r15        // m/2+i
+  vbroadcastss ymm9, DWORD PTR [reg_p2+4*r13]         // S
+  mov        rbx, 4          // extra reductions below run only on the pass where r9 == 4
+
+loop7b:
+  mov        r13, r10
+  add        r13, rax         // j+k
+  vpmovsxdq  ymm10, XMMWORD PTR [reg_p1+4*r13]        // V = a[j+k]
+  vpmovsxdq  ymm11, XMMWORD PTR [reg_p1+4*r13+16]     // V = a[j+k]
+  vpmovsxdq  ymm13, XMMWORD PTR [reg_p1+4*r13+32]     // V = a[j+k]
+  vpmovsxdq  ymm15, XMMWORD PTR [reg_p1+4*r13+48]     // V = a[j+k]
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p1+4*r10+16]      // U = a[j]
+  vpmovsxdq  ymm4, XMMWORD PTR [reg_p1+4*r10+32]      // U = a[j]
+  vpmovsxdq  ymm6, XMMWORD PTR [reg_p1+4*r10+48]      // U = a[j]
+  
+  vpsubd     ymm1, ymm0, ymm10                        // U - V
+  vpaddd     ymm0, ymm0, ymm10                        // U + V 
+  vpsubd     ymm3, ymm2, ymm11                        // U - V
+  vpaddd     ymm2, ymm2, ymm11                        // U + V 
+  vpsubd     ymm5, ymm4, ymm13                        // U - V
+  vpaddd     ymm4, ymm4, ymm13                        // U + V 
+  vpsubd     ymm7, ymm6, ymm15                        // U - V
+  vpaddd     ymm6, ymm6, ymm15                        // U + V 
+
+  vpmuldq    ymm1, ymm1, ymm9                         // (U - V).S
+  vpmuldq    ymm3, ymm3, ymm9                         // (U - V).S
+  vpmuldq    ymm5, ymm5, ymm9                         // (U - V).S
+  vpmuldq    ymm7, ymm7, ymm9                         // (U - V).S
+  
+  vmovdqu    ymm13, ymm1
+  vpand      ymm1, ymm14, ymm1                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm1, 1                           // 2*c0
+  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
+  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1 
+
+  cmp        r9, rbx                                  // extra reduction of U+V and (U-V).S on this pass?
+  jne        skip1
+  vmovdqu    ymm13, ymm0
+  vpand      ymm0, ymm14, ymm0                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1       
+  vpslld     ymm15, ymm0, 1                           // 2*c0
+  vpsubd     ymm13, ymm0, ymm13                       // c0-c1
+  vpaddd     ymm0, ymm13, ymm15                       // 3*c0-c1
+
+  vmovdqu    ymm13, ymm1
+  vpand      ymm1, ymm14, ymm1                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm1, 1                           // 2*c0
+  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
+  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1
+skip1:
+  vpermd     ymm1, ymm12, ymm1 
+  vpermd     ymm0, ymm12, ymm0 
+
+  vmovdqu    ymm13, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm3, 1                           // 2*c0
+  vpsubd     ymm13, ymm3, ymm13                       // c0-c1
+  vpaddd     ymm3, ymm13, ymm15                       // 3*c0-c1 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13], xmm1 
+
+  cmp        r9, rbx                                  // same gate as above, second group
+  jne        skip2
+  vmovdqu    ymm13, ymm2
+  vpand      ymm2, ymm14, ymm2                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1       
+  vpslld     ymm15, ymm2, 1                           // 2*c0
+  vpsubd     ymm13, ymm2, ymm13                       // c0-c1
+  vpaddd     ymm2, ymm13, ymm15                       // 3*c0-c1
+
+  vmovdqu    ymm13, ymm3
+  vpand      ymm3, ymm14, ymm3                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm3, 1                           // 2*c0
+  vpsubd     ymm13, ymm3, ymm13                       // c0-c1
+  vpaddd     ymm3, ymm13, ymm15                       // 3*c0-c1
+skip2:
+  vpermd     ymm3, ymm12, ymm3 
+  vpermd     ymm2, ymm12, ymm2 
+
+  vmovdqu    ymm13, ymm5
+  vpand      ymm5, ymm14, ymm5                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm5, 1                           // 2*c0
+  vpsubd     ymm13, ymm5, ymm13                       // c0-c1
+  vpaddd     ymm5, ymm13, ymm15                       // 3*c0-c1 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+16], xmm2
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13+16], xmm3 
+
+  cmp        r9, rbx                                  // same gate as above, third group
+  jne        skip3
+  vmovdqu    ymm13, ymm4
+  vpand      ymm4, ymm14, ymm4                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1       
+  vpslld     ymm15, ymm4, 1                           // 2*c0
+  vpsubd     ymm13, ymm4, ymm13                       // c0-c1
+  vpaddd     ymm4, ymm13, ymm15                       // 3*c0-c1
+
+  vmovdqu    ymm13, ymm5
+  vpand      ymm5, ymm14, ymm5                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm5, 1                           // 2*c0
+  vpsubd     ymm13, ymm5, ymm13                       // c0-c1
+  vpaddd     ymm5, ymm13, ymm15                       // 3*c0-c1
+skip3:
+  vpermd     ymm5, ymm12, ymm5 
+  vpermd     ymm4, ymm12, ymm4 
+
+  vmovdqu    ymm13, ymm7
+  vpand      ymm7, ymm14, ymm7                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm7, 1                           // 2*c0
+  vpsubd     ymm13, ymm7, ymm13                       // c0-c1
+  vpaddd     ymm7, ymm13, ymm15                       // 3*c0-c1 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+32], xmm4
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13+32], xmm5  
+
+  cmp        r9, rbx                                  // same gate as above, fourth group
+  jne        skip4
+  vmovdqu    ymm13, ymm6
+  vpand      ymm6, ymm14, ymm6                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1       
+  vpslld     ymm15, ymm6, 1                           // 2*c0
+  vpsubd     ymm13, ymm6, ymm13                       // c0-c1
+  vpaddd     ymm6, ymm13, ymm15                       // 3*c0-c1
+
+  vmovdqu    ymm13, ymm7
+  vpand      ymm7, ymm14, ymm7                        // c0
+  vpsrad     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm7, 1                           // 2*c0
+  vpsubd     ymm13, ymm7, ymm13                       // c0-c1
+  vpaddd     ymm7, ymm13, ymm15                       // 3*c0-c1
+skip4:
+  vpermd     ymm7, ymm12, ymm7 
+  vpermd     ymm6, ymm12, ymm6   
+  vmovdqu    XMMWORD PTR [reg_p1+4*r13+48], xmm7
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+48], xmm6
+  
+  add        r10, r14        // j+16 (r14 = 16 from stage m=128)
+  cmp        r10, r11        // continue while j < j2
+  jl         loop7b
+  mov        rbx, rax
+  shl        rbx, 1          // 2*k
+  add        r8, rbx         // j1+2*k
+  inc        r15             // i+1
+  cmp        r15, r12
+  jl         loop6b
+  dec        r9              // one fewer pass left
+  jnz        loop5b
+       
+// Scaling step
+  shl        rax, 1          // k = 2*k = 512
+  xor        r10, r10        // j = 0
+  mov        r14, 4          // step for j
+  movq       xmm0, reg_p3                             // register value itself, not a load
+  vbroadcastsd ymm10, xmm0                            // S = omegainv1N_rev
+  movq       xmm0, reg_p4                             // register value itself, not a load
+  vbroadcastsd ymm11, xmm0                            // T = Ninv
+loop8b:
+  vpmovsxdq  ymm13, XMMWORD PTR [reg_p1+4*r10+4*512]  // V = a[j+k]
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*r10]         // U = a[j]
+  vpsubd     ymm1, ymm0, ymm13                        // U - V
+  vpaddd     ymm0, ymm0, ymm13                        // U + V  
+  vpmuldq    ymm1, ymm1, ymm10                        // (U - V).S
+  vpmuldq    ymm0, ymm0, ymm11                        // (U + V).T
+  
+  vmovdqu    ymm13, ymm0
+  vpand      ymm0, ymm14, ymm0                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm0, 1                           // 2*c0
+  vpsubd     ymm13, ymm0, ymm13                       // c0-c1
+  vpaddd     ymm0, ymm13, ymm15                       // 3*c0-c1    
+
+  vmovdqu    ymm13, ymm1
+  vpand      ymm1, ymm14, ymm1                        // c0
+  vpsrlq     ymm13, ymm13, 12                         // c1
+  vpslld     ymm15, ymm1, 1                           // 2*c0
+  vpsubd     ymm13, ymm1, ymm13                       // c0-c1
+  vpaddd     ymm1, ymm13, ymm15                       // 3*c0-c1 
+  
+  vpermd     ymm0, ymm12, ymm0 
+  vpermd     ymm1, ymm12, ymm1 
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10], xmm0
+  vmovdqu    XMMWORD PTR [reg_p1+4*r10+4*512], xmm1
+  
+  add        r10, r14        // j+4 
+  cmp        r10, rax
+  jl         loop8b  
+loop9b:                      // not the target of any jump in this routine
+  pop        rbx
+  pop        r15
+  pop        r14
+  pop        r13
+  pop        r12
+  ret
+
+
+//***********************************************************************
+//  Component-wise multiplication and addition
+//  Operation: d [reg_p4] <- a [reg_p1] * b [reg_p2] + c [reg_p3]
+//             reg_p5 contains parameter n
+//*********************************************************************** 
+.global pmuladd_asm
+pmuladd_asm:
+  vmovdqu    ymm5, PERM0246
+  vmovdqu    ymm6, MASK12x8 
+  xor        rax, rax                           // j = 0
+  movq       r11, 4                             // 4 coefficients per pass
+lazo2:
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*rax]   // a
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p2+4*rax]   // b
+  vpmovsxdq  ymm2, XMMWORD PTR [reg_p3+4*rax]   // c
+  vpmuldq    ymm0, ymm1, ymm0                   // a*b as 64-bit products
+  vpaddq     ymm0, ymm2, ymm0                   // a*b + c
+// First reduction: applied to the 64-bit sums (logical 64-bit shift)
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm6, ymm0                   // c0
+  vpsrlq     ymm3, ymm3, 12                     // c1
+  vpslld     ymm4, ymm0, 1                      // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1 
+// Second reduction: applied to the 32-bit results (arithmetic 32-bit shift)
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm6, ymm0                   // c0
+  vpsrad     ymm3, ymm3, 12                     // c1       
+  vpslld     ymm4, ymm0, 1                      // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1
+
+  vpermd     ymm0, ymm5, ymm0                   // permute so xmm0 holds the 4 results
+  vmovdqu    XMMWORD PTR [reg_p4+4*rax], xmm0   // d[j..j+3]
+
+  add        rax, r11                           // j+4
+  cmp        rax, reg_p5
+  jl         lazo2                              // bound is tested only after each pass
+  ret
+
+
+//***********************************************************************
+//  Component-wise multiplication
+//  Operation: c [reg_p3] <- a [reg_p1] * b [reg_p2]
+//             reg_p4 contains parameter n
+//*********************************************************************** 
+.global pmul_asm
+pmul_asm: 
+  vmovdqu    ymm5, PERM0246
+  vmovdqu    ymm6, MASK12x8 
+  xor        rax, rax                           // j = 0
+  movq       r11, 4                             // 4 coefficients per pass
+lazo3:
+  vpmovsxdq  ymm0, XMMWORD PTR [reg_p1+4*rax]   // a
+  vpmovsxdq  ymm1, XMMWORD PTR [reg_p2+4*rax]   // b
+  vpmuldq    ymm0, ymm1, ymm0                   // a*b as 64-bit products
+// First reduction: applied to the 64-bit products (logical 64-bit shift)
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm6, ymm0                   // c0
+  vpsrlq     ymm3, ymm3, 12                     // c1
+  vpslld     ymm4, ymm0, 1                      // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1 
+// Second reduction: applied to the 32-bit results (arithmetic 32-bit shift)
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm6, ymm0                   // c0
+  vpsrad     ymm3, ymm3, 12                     // c1       
+  vpslld     ymm4, ymm0, 1                      // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1
+
+  vpermd     ymm0, ymm5, ymm0                   // permute so xmm0 holds the 4 results
+  vmovdqu    XMMWORD PTR [reg_p3+4*rax], xmm0   // c[j..j+3]
+
+  add        rax, r11                           // j+4
+  cmp        rax, reg_p4
+  jl         lazo3                              // bound is tested only after each pass
+  ret
+
+
+//***********************************************************************
+//  Two consecutive reductions
+//  Operation: c [reg_p1] <- a [reg_p1]
+//             reg_p2 contains parameter n
+//*********************************************************************** 
+.global two_reduce12289_asm
+two_reduce12289_asm: 
+  vmovdqu    ymm6, MASK12x8 
+  vmovdqu    ymm7, PRIME8x
+  xor        rax, rax                           // j = 0
+  movq       r11, 8                             // 8 coefficients per pass
+lazo4:
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]   // a
+// First reduction
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm6, ymm0                   // c0
+  vpsrad     ymm3, ymm3, 12                     // c1
+  vpslld     ymm4, ymm0, 1                      // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1 
+// Second reduction
+  vmovdqu    ymm3, ymm0
+  vpand      ymm0, ymm6, ymm0                   // c0
+  vpsrad     ymm3, ymm3, 12                     // c1       
+  vpslld     ymm4, ymm0, 1                      // 2*c0
+  vpsubd     ymm3, ymm0, ymm3                   // c0-c1
+  vpaddd     ymm0, ymm3, ymm4                   // 3*c0-c1
+// Correction: add PRIME8x to negative lanes, subtract it from every lane
+  vpsrad     ymm2, ymm0, 31                     // all-ones in negative lanes, else 0
+  vpand      ymm2, ymm7, ymm2                   // PRIME8x in negative lanes, else 0
+  vpaddd     ymm2, ymm0, ymm2                   // negative lanes += PRIME8x
+  vpsubd     ymm0, ymm2, ymm7                   // every lane -= PRIME8x
+// Correction: re-add PRIME8x where the subtraction went negative
+  vpsrad     ymm2, ymm0, 31                     // all-ones in negative lanes, else 0
+  vpand      ymm2, ymm7, ymm2                   // PRIME8x in negative lanes, else 0
+  vpaddd     ymm0, ymm0, ymm2                   // negative lanes += PRIME8x
+
+  vmovdqu    YMMWORD PTR [reg_p1+4*rax], ymm0   // written back in place
+
+  add        rax, r11                           // j+8
+  cmp        rax, reg_p2
+  jl         lazo4                              // bound is tested only after each pass
+  ret
+
+
+//***********************************************************************
+//  Encoding
+//  Operation: c [reg_p2] <- a [reg_p1]
+//*********************************************************************** 
+.global encode_asm
+encode_asm: 
+  vmovdqu    ymm6, MASK32 
+  vmovdqu    ymm7, MASK42
+  mov        r9, 1024        // number of input dwords, fixed
+  xor        rax, rax        // j = 0 (input index, in dwords)
+  xor        r10, r10        // output offset, in bytes
+  mov        r11, 14         // output bytes advanced per pass
+  mov        rcx, 8          // input dwords consumed per pass
+lazo5:
+  vmovdqu    ymm0, YMMWORD PTR [reg_p1+4*rax]   // a
+// Bit-packing of the 8 dwords; MASK32/MASK42 are defined outside this routine
+  vpsrlq     ymm1, ymm0, 18  
+  vpsllq     ymm2, ymm0, 4
+  vpand      ymm0, ymm0, ymm6
+  vpsrldq    ymm2, ymm2, 5   
+  vpsrlq     ymm3, ymm1, 4
+  vpand      ymm1, ymm1, ymm6
+  vpand      ymm2, ymm2, ymm7
+  vpsrldq    ymm3, ymm3, 4 
+  vpor       ymm0, ymm0, ymm1
+  vpor       ymm0, ymm0, ymm2 
+  vpor       ymm0, ymm0, ymm3 
+  vpermq     ymm1, ymm0, 0x0e                   // xmm1 = upper 128-bit lane of ymm0
+// NOTE(review): 16-byte stores with a 14-byte stride; last pass writes through byte 1800 of c - confirm buffer size
+  vmovdqu    XMMWORD PTR [reg_p2+r10], xmm0     // low lane at offset r10
+  vmovdqu    XMMWORD PTR [reg_p2+r10+7], xmm1   // high lane 7 bytes later
+
+  add        r10, r11        // output offset + 14
+  add        rax, rcx        // j+8
+  cmp        rax, r9
+  jl         lazo5
+  ret
+
+
+//***********************************************************************
+//  Decoding
+//  Operation: c [reg_p2] <- a [reg_p1]
+//*********************************************************************** 
+.global decode_asm
+decode_asm: 
+  vmovdqu    ymm6, MASK14_1 
+  vmovdqu    ymm7, MASK14_2
+  vmovdqu    ymm8, MASK14_3
+  vmovdqu    ymm9, MASK14_4
+  mov        r9, 1024        // number of output dwords, fixed
+  xor        rax, rax        // j = 0 (output index, in dwords)
+  xor        r10, r10        // input offset, in bytes
+  mov        r11, 14         // input bytes advanced per pass
+  mov        rcx, 8          // output dwords produced per pass
+lazo6:
+  vmovdqu    xmm0, XMMWORD PTR [reg_p1+r10]     // 16 bytes at offset r10
+  vmovdqu    xmm1, XMMWORD PTR [reg_p1+r10+7]   // 16 bytes starting 7 bytes later
+  vinserti128 ymm0, ymm0, xmm1, 1               // xmm1 becomes the upper 128-bit lane
+// NOTE(review): 16-byte loads with a 14-byte stride; last pass reads through byte 1800 of a - confirm buffer size
+  vpand      ymm1, ymm0, ymm6
+  vpand      ymm2, ymm0, ymm7
+  vpand      ymm3, ymm0, ymm8
+  vpand      ymm4, ymm0, ymm9
+   
+  vpsllq     ymm2, ymm2, 18 
+  vpsllq     ymm3, ymm3, 4
+  vpslldq    ymm3, ymm3, 4 
+  vpsrlq     ymm4, ymm4, 2
+  vpslldq    ymm4, ymm4, 7
+
+  vpor       ymm1, ymm1, ymm2 
+  vpor       ymm1, ymm1, ymm3 
+  vpor       ymm1, ymm1, ymm4 
+  
+  vmovdqu    YMMWORD PTR [reg_p2+4*rax], ymm1   // c[j..j+7]
+
+  add        r10, r11            // input offset + 14
+  add        rax, rcx            // j+8
+  cmp        rax, r9
+  jl         lazo6
   ret
\ No newline at end of file
diff --git a/dap-sdk/crypto/src/msrln/kex.c b/dap-sdk/crypto/src/msrln/kex.c
index e2c6b317ec..99a8db3962 100755
--- a/dap-sdk/crypto/src/msrln/kex.c
+++ b/dap-sdk/crypto/src/msrln/kex.c
@@ -1,645 +1,642 @@
-ï»¿#include "msrln_priv.h"
-#if (OS_TARGET == OS_MACOS)
-    #include <stdio.h>
-#else
-    #include <malloc.h>
-#endif
-
-#include "KeccakHash.h"
-#include "SimpleFIPS202.h"
-
-
-// N^-1 * prime_scale^-8
-const int32_t MSRLN_Ninv8_ntt1024_12289 = 8350;
-// N^-1 * prime_scale^-7 * omegainv_rev_ntt1024_12289[1]
-const int32_t MSRLN_omegainv7N_rev_ntt1024_12289 = 795;
-// N^-1 * prime_scale^-11
-const int32_t MSRLN_Ninv11_ntt1024_12289 = 2585;
-// N^-1 * prime_scale^-10 * omegainv_rev_ntt1024_12289[1]
-const int32_t MSRLN_omegainv10N_rev_ntt1024_12289 = 10953;
-
-
-// Index-reversed matrices containing powers of psi (psi_rev_nttxxx_yyy) and inverse powers of omega (omegainv_rev_nttxxx_yyy),
-// where xxx is parameter N and yyy is the prime q.
-
-const int32_t MSRLN_psi_rev_ntt1024_12289[1024] = {
-8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201,
-875, 3780, 1607, 4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859,
-7188, 1067, 2401, 11847, 390, 11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626,
-3636, 7351, 9585, 6998, 160, 3149, 4437, 12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042,
-3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178, 1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563,
-7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790, 2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266,
-5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810, 1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934,
-8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863, 10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842,
-11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893, 7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541,
-11336, 3434, 3529, 2908, 12071, 2361, 1843, 3030, 8174, 6147, 9842, 8326, 576, 10335, 10238, 10484, 9407, 11836, 5908, 418, 3772, 7515, 5429, 7552, 10996, 12133, 2767, 3969,
-8298, 6413, 10008, 2031, 5333, 10800, 9789, 10706, 5942, 1263, 49, 5915, 10806, 11939, 10777, 1815, 5383, 3202, 4493, 6920, 10232, 1975, 8532, 2925, 347, 4754, 1858, 11863,
-8974, 9551, 5868, 9634, 5735,11566, 12115, 10596, 3009, 6190, 11994, 6523, 652, 3762, 9370, 4016, 4077, 8561, 4049, 5990, 11130, 11143, 948, 325, 1404, 6992, 6119, 8333,
-10929, 1200, 5184, 2555, 6122, 1594, 10327, 7183, 5961, 2692, 12121, 4298, 3329, 5919, 4433, 8455,7032, 1747, 3123, 3054, 6803, 5782, 10723, 9341, 2503, 683, 2459, 3656,
-64, 4240, 3570, 835, 6065, 4046, 11580, 10970, 3150, 10331, 4322, 2078, 1112, 4079, 11231, 441, 922, 1050, 4536, 6844, 8429, 2683, 11099, 3818, 6171, 8500, 12142, 6833, 4449,
-4749, 6752, 7500, 7822, 8214, 6974, 7965, 7373, 2169, 522, 5079, 3262, 10316, 6715, 1278, 9945, 3514, 11248, 11271, 5925, 468, 3988, 382, 11973, 5339, 6843, 6196, 8579, 2033,
-8291, 1922, 3879, 11035, 973, 6854, 10930, 5206, 6760, 3199, 56, 3565, 654, 1702, 10302, 5862, 6153, 5415, 8646, 11889, 10561, 7341, 6152, 7232, 4698, 8844, 4780, 10240, 4912,
-1321, 12097, 7048, 2920, 3127, 4169, 11502, 3482, 11279, 5468, 5874, 11612, 6055, 8953, 52, 3174, 10966, 9523, 151, 2127, 3957, 2839, 9784, 6383, 1579, 431, 7507, 5886, 3029,
-6695, 4213, 504, 11684, 2302, 8689, 9026, 4624, 6212, 11868, 4080, 6221, 8687, 1003, 8757, 241, 58, 5009, 10333, 885, 6281, 3438, 9445, 11314, 8077, 6608, 3477, 142, 1105,
-8841, 343, 4538, 1908, 1208, 4727, 7078, 10423, 10125, 6873, 11573, 10179, 416, 814, 1705, 2450, 8700, 717, 9307, 1373, 8186, 2429, 10568, 10753, 7228, 11071, 438, 8774, 5993,
-3278, 4209, 6877, 3449, 1136, 3708, 3238, 2926, 1826, 4489, 3171, 8024, 8611, 1928, 464, 3205, 8930, 7080, 1092, 10900, 10221, 11943, 4404, 9126, 4032, 7449, 6127, 8067, 10763,
-125, 540, 8921, 8062, 612, 8051, 12229, 9572, 9089, 10754, 10029, 68, 6453, 7723, 4781, 4924, 1014, 448, 3942, 5232, 1327, 8682, 3744, 7326, 3056, 9761, 5845, 5588, 412, 7187,
-3975, 4883, 3087, 6454, 2257, 7784, 5676, 1417, 8400, 11710, 5596, 5987, 9175, 2769, 5966, 212, 6555, 11113, 5508, 11014, 1125, 4860, 10844, 1131, 4267, 6636, 2275, 9828, 5063,
-4176, 3765, 1518, 8794, 4564, 10224, 5826, 3534, 3961, 4145, 10533, 506, 11034, 6505, 10897, 2674, 10077, 3338, 9013, 3511, 6811, 11111, 2776, 1165, 2575, 8881, 10347, 377,
-4578, 11914, 10669, 10104, 392, 10453, 425, 9489, 193, 2231, 6197, 1038, 11366, 6204, 8122, 2894, 3654, 10975, 10545, 6599, 2455, 11951, 3947, 20, 5002, 5163, 4608, 8946, 8170,
-10138, 1522, 8665, 10397, 3344, 5598, 10964, 6565, 11260, 1945, 11041, 9847, 7174, 4939, 2148, 6330, 3959, 5797, 4913, 3528, 8054, 3825, 8914, 9998, 4335, 8896, 9342, 3982,
-6680, 11653, 7790, 6617, 1737, 622, 10485, 10886, 6195, 7100, 1687, 406, 12143, 5268, 9389, 12050, 994, 7735, 5464, 7383, 4670, 512, 364, 9929, 3028, 5216, 5518, 1226, 7550,
-8038, 7043, 7814, 11053, 3017, 3121, 7584, 2600, 11232, 6780, 12085, 5219, 1409, 9600, 4605, 8151, 12109, 463, 8882, 8308, 10821, 9247, 10945, 9806, 2054, 6203, 6643, 3120,
-6105, 8348, 8536, 6919, 8753, 11007, 8717, 9457, 2021, 9060, 4730, 3929, 10583, 3723, 845, 1936, 7, 5054, 3154, 3285, 4360, 3805, 11522, 2213, 4153, 12239, 12073, 5526, 769,
-4099, 3944, 5604, 5530, 11024, 9282, 2171, 3480, 7434, 8520, 3232, 11996, 9656, 1406, 2945, 5349, 7207, 4590, 11607, 11309, 5202, 844, 7082, 4050, 8016, 9068, 9694, 8452, 7000,
-5662, 567, 2941, 8619, 3808, 4987, 2373, 5135, 63, 7605, 3360, 11839, 10345, 578, 6921, 7628, 510, 5386, 2622, 7806, 5703, 10783, 9224, 11379, 5900, 4719, 11538, 3502, 5789,
-10631, 5618, 826, 5043, 3090, 10891, 9951, 7596, 2293, 11872, 6151, 3469, 4443, 8871, 1555, 1802, 5103, 1891, 1223, 2334, 7878, 1590, 881, 365, 1927, 11274, 4510, 9652, 2946,
-6828, 1280, 614, 10918, 12265, 7250, 6742, 9804, 11385, 2276, 11307, 2593, 879, 7899, 8071, 3454, 8531, 3795, 9021, 5776, 1849, 7766, 7988, 457, 8, 530, 9663, 7785, 11511, 3578,
-7592, 10588, 3466, 8972, 9757, 3332, 139, 2046, 2940, 10808, 9332, 874, 2301, 5650, 12119, 150, 648, 8000, 9982, 9416, 2827, 2434, 11498, 6481, 12268, 9754, 11169, 11823, 11259,
-3821, 10608, 2929, 6263, 4649, 6320, 9687, 10388, 502, 5118, 8496, 6226, 10716, 8443, 7624, 6883, 9269, 6616, 8620, 5287, 944, 7519, 6125, 1882, 11249, 10254, 5410, 1251, 1790,
-5275, 8449, 10447, 4113, 72, 2828, 4352, 7455, 2712, 11048, 7911, 3451, 4094, 6508, 3045, 11194, 2643, 1783, 7211, 4974, 7724, 9811, 9449, 3019, 4194, 2730, 6878, 10421, 2253,
-4518, 9195, 7469, 11129, 9173, 12100, 1763, 2209, 9617, 5170, 865, 1279, 1694, 10759, 8420, 4423, 10555, 3815, 5832, 10939
-};
-
-
-const int32_t MSRLN_omegainv_rev_ntt1024_12289[1024] = {
-8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422,
-6267, 9302, 8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270,
-2678, 8585, 10752, 12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957,
-8779, 1630, 10163, 5407, 3186, 11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372,
-10115, 2847, 4414, 9644, 4053, 7247, 9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300,
-5331, 8705, 4177, 9764, 10908, 11950, 9821, 11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534,
-145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548, 4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567,
-6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023, 2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170,
-10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255, 11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 11184, 12147, 8812, 5681, 4212, 975, 2844, 8851, 6008, 11404,
-1956, 7280, 12231, 12048, 3532, 11286, 3602, 6068, 8209, 421, 6077, 7665, 3263, 3600, 9987, 605, 11785, 8076, 5594, 9260, 6403, 4782, 11858, 10710, 5906, 2505, 9450, 8332, 10162,
-12138, 2766, 1323, 9115, 12237, 3336, 6234, 677, 6415, 6821, 1010, 8807, 787, 8120, 9162, 9369, 5241, 192, 10968, 7377, 2049, 7509, 3445, 7591, 5057, 6137, 4948, 1728, 400, 3643,
-6874, 6136, 6427, 1987, 10587, 11635, 8724, 12233, 9090, 5529, 7083, 1359, 5435, 11316, 1254, 8410, 10367, 3998, 10256, 3710, 6093, 5446, 6950, 316, 11907, 8301, 11821, 6364, 1018,
-1041, 8775, 2344, 11011, 5574, 1973, 9027, 7210, 11767, 10120, 4916, 4324, 5315, 4075, 4467, 4789, 5537, 7540, 7840, 5456, 147, 3789, 6118, 8471, 1190, 9606, 3860, 5445, 7753, 11239,
-11367, 11848, 1058, 8210, 11177, 10211, 7967, 1958, 9139, 1319, 709, 8243, 6224, 11454, 8719, 8049, 12225, 8633, 9830, 11606, 9786, 2948, 1566, 6507, 5486, 9235, 9166, 10542, 5257,
-3834, 7856, 6370, 8960, 7991, 168, 9597, 6328, 5106, 1962, 10695, 6167, 9734, 7105, 11089, 1360, 3956, 6170, 5297, 10885, 11964, 11341, 1146, 1159, 6299, 8240, 3728, 8212, 8273, 2919,
-8527, 11637, 5766, 295, 6099, 9280, 1693, 174, 723, 6554, 2655, 6421, 2738, 3315, 426, 10431, 7535, 11942, 9364, 3757, 10314, 2057, 5369, 7796, 9087, 6906, 10474, 1512, 350, 1483,
-6374, 12240, 11026, 6347, 1583, 2500, 1489, 6956, 10258, 2281, 5876, 3991, 8320, 9522, 156, 1293, 4737, 6860, 4774, 8517, 11871, 6381, 453, 2882, 1805, 2051, 1954, 11713, 3963, 2447,
-6142, 4115, 9259, 10446, 9928, 218, 9381, 8760, 8855, 1350, 6457, 8474, 1734, 7866, 3869, 1530, 10595, 11010, 11424, 7119, 2672, 10080, 10526, 189, 3116, 1160, 4820, 3094, 7771, 10036,
-1868, 5411, 9559, 8095, 9270, 2840, 2478, 4565, 7315, 5078, 10506, 9646, 1095, 9244, 5781, 8195, 8838, 4378, 1241, 9577, 4834, 7937, 9461, 12217, 8176, 1842, 3840, 7014, 10499, 11038,
-6879, 2035, 1040, 10407, 6164, 4770, 11345, 7002, 3669, 5673, 3020, 5406, 4665, 3846, 1573, 6063, 3793, 7171, 11787, 1901, 2602, 5969, 7640, 6026, 9360, 1681, 8468, 1030, 466, 1120,
-2535, 21, 5808, 791, 9855, 9462, 2873, 2307, 4289, 11641, 12139, 170, 6639, 9988, 11415, 2957, 1481, 9349, 10243, 12150, 8957, 2532, 3317, 8823, 1701, 4697, 8711, 778, 4504, 2626,
-11759, 12281, 11832, 4301, 4523, 10440, 6513, 3268, 8494, 3758, 8835, 4218, 4390, 11410, 9696, 982, 10013, 904, 2485, 5547, 5039, 24, 1371, 11675, 11009, 5461, 9343, 2637, 7779, 1015,
-10362, 11924, 11408, 10699, 4411, 9955, 11066, 10398, 7186, 10487, 10734, 3418, 7846, 8820, 6138, 417, 9996, 4693, 2338, 1398, 9199, 7246, 11463, 6671, 1658, 6500, 8787, 751, 7570,
-6389, 910, 3065, 1506, 6586, 4483, 9667, 6903, 11779, 4661, 5368, 11711, 1944, 450, 8929, 4684, 12226, 7154, 9916, 7302, 8481, 3670, 9348, 11722, 6627, 5289, 3837, 2595, 3221, 4273,
-8239, 5207, 11445, 7087, 980, 682, 7699, 5082, 6940, 9344, 10883, 2633, 293, 9057, 3769, 4855, 8809, 10118, 3007, 1265, 6759, 6685, 8345, 8190, 11520, 6763, 216, 50, 8136, 10076, 767,
-8484, 7929, 9004, 9135, 7235, 12282, 10353, 11444, 8566, 1706, 8360, 7559, 3229, 10268, 2832, 3572, 1282, 3536, 5370, 3753, 3941, 6184, 9169, 5646, 6086, 10235, 2483, 1344, 3042, 1468,
-3981, 3407, 11826, 180, 4138, 7684, 2689, 10880, 7070, 204, 5509, 1057, 9689, 4705, 9168, 9272, 1236, 4475, 5246, 4251, 4739, 11063, 6771, 7073, 9261, 2360, 11925, 11777, 7619, 4906,
-6825, 4554, 11295, 239, 2900, 7021, 146, 11883, 10602, 5189, 6094, 1403, 1804, 11667, 10552, 5672, 4499, 636, 5609, 8307, 2947, 3393, 7954, 2291, 3375, 8464, 4235, 8761, 7376, 6492,
-8330, 5959, 10141, 7350, 5115, 2442, 1248, 10344, 1029, 5724, 1325, 6691, 8945, 1892, 3624, 10767, 2151, 4119, 3343, 7681, 7126, 7287, 12269, 8342, 338, 9834, 5690, 1744, 1314, 8635,
-9395, 4167, 6085, 923, 11251, 6092, 10058, 12096, 2800, 11864, 1836, 11897, 2185, 1620, 375, 7711, 11912, 1942, 3408, 9714, 11124, 9513, 1178, 5478, 8778, 3276, 8951, 2212, 9615, 1392,
-5784, 1255, 11783, 1756, 8144, 8328, 8755, 6463, 2065, 7725, 3495, 10771, 8524, 8113, 7226, 2461, 10014, 5653, 8022, 11158, 1445, 7429, 11164, 1275, 6781, 1176, 5734, 12077, 6323, 9520,
-3114, 6302, 6693, 579, 3889, 10872, 6613, 4505, 10032, 5835, 9202, 7406, 8314, 5102, 11877, 6701, 6444, 2528, 9233, 4963, 8545, 3607, 10962, 7057, 8347, 11841, 11275, 7365, 7508, 4566,
-5836, 12221, 2260, 1535, 3200, 2717, 60, 4238, 11677, 4227, 3368, 11749, 12164, 1526, 4222, 6162, 4840, 8257, 3163, 7885, 346, 2068, 1389, 11197, 5209, 3359, 9084, 11825, 10361, 3678,
-4265, 9118, 7800, 10463, 9363, 9051, 8581, 11153, 8840, 5412, 8080, 9011, 6296, 3515, 11851, 1218, 5061, 1536, 1721, 9860, 4103, 10916, 2982, 11572, 3589, 9839, 10584, 11475, 11873,
-2110, 716, 5416, 2164, 1866, 5211, 7562, 11081, 10381, 7751, 11946, 3448
-};
-
-
-const int32_t MSRLN_psi_rev_ntt512_12289[512] = {
-8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201, 875, 3780, 1607,
-4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859, 7188, 1067, 2401, 11847, 390,
-11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626, 3636, 7351, 9585, 6998, 160, 3149, 4437,
-12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042, 3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178,
-1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563, 7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790,
-2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266, 5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810,
-1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934, 8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863,
-10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842, 11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893,
-7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541, 11336, 8855, 8760, 9381, 218, 9928, 10446, 9259, 4115, 6142, 2447, 3963, 11713, 1954, 2051, 1805, 2882, 453, 6381, 11871, 8517,
-4774, 6860, 4737, 1293, 156, 9522, 8320, 3991, 5876, 2281, 10258, 6956, 1489, 2500, 1583, 6347, 11026, 12240, 6374, 1483, 350, 1512, 10474, 6906, 9087, 7796, 5369, 2057, 10314, 3757,
-9364, 11942, 7535, 10431, 426, 3315, 2738, 6421, 2655, 6554, 723, 174, 1693, 9280, 6099, 295, 5766, 11637, 8527, 2919, 8273, 8212, 3728, 8240, 6299, 1159, 1146, 11341, 11964, 10885, 5297,
-6170, 3956, 1360, 11089, 7105, 9734, 6167, 10695, 1962, 5106, 6328, 9597, 168, 7991, 8960, 6370, 7856, 3834, 5257, 10542, 9166, 9235, 5486, 6507, 1566, 2948, 9786, 11606, 9830, 8633,
-12225, 8049, 8719, 11454, 6224, 8243, 709, 1319, 9139, 1958, 7967, 10211, 11177, 8210, 1058, 11848, 11367, 11239, 7753, 5445, 3860, 9606, 1190, 8471, 6118, 3789, 147, 5456, 7840, 7540,
-5537, 4789, 4467, 4075, 5315, 4324, 4916, 10120, 11767, 7210, 9027, 1973, 5574, 11011, 2344, 8775, 1041, 1018, 6364, 11821, 8301, 11907, 316, 6950, 5446, 6093, 3710, 10256, 3998, 10367,
-8410, 1254, 11316, 5435, 1359, 7083, 5529, 9090, 12233, 8724, 11635, 10587, 1987, 6427, 6136, 6874, 3643, 400, 1728, 4948, 6137, 5057, 7591, 3445, 7509, 2049, 7377, 10968, 192, 5241, 9369,
-9162, 8120, 787, 8807, 1010, 6821, 6415, 677, 6234, 3336, 12237, 9115, 1323, 2766, 12138, 10162, 8332, 9450, 2505, 5906, 10710, 11858, 4782, 6403, 9260, 5594, 8076, 11785, 605, 9987, 3600,
-3263, 7665, 6077, 421, 8209, 6068, 3602, 11286, 3532, 12048, 12231, 7280, 1956, 11404, 6008, 8851, 2844, 975, 4212, 5681, 8812, 12147, 11184
-};
-
-
-const int32_t MSRLN_omegainv_rev_ntt512_12289[512] = {
-8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422, 6267, 9302,
-8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270, 2678, 8585, 10752,
-12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957, 8779, 1630, 10163, 5407, 3186,
-11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372, 10115, 2847, 4414, 9644, 4053, 7247,
-9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300, 5331, 8705, 4177, 9764, 10908, 11950, 9821,
-11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534, 145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548,
-4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567, 6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023,
-2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170, 10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255,
-11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 1105, 142, 3477, 6608, 8077, 11314, 9445, 3438, 6281, 885, 10333, 5009, 58, 241, 8757, 1003, 8687, 6221, 4080, 11868, 6212, 4624,
-9026, 8689, 2302, 11684, 504, 4213, 6695, 3029, 5886, 7507, 431, 1579, 6383, 9784, 2839, 3957, 2127, 151, 9523, 10966, 3174, 52, 8953, 6055, 11612, 5874, 5468, 11279, 3482, 11502, 4169,
-3127, 2920, 7048, 12097, 1321, 4912, 10240, 4780, 8844, 4698, 7232, 6152, 7341, 10561, 11889, 8646, 5415, 6153, 5862, 10302, 1702, 654, 3565, 56, 3199, 6760, 5206, 10930, 6854, 973, 11035,
-3879, 1922, 8291, 2033, 8579, 6196, 6843, 5339, 11973, 382, 3988, 468, 5925, 11271, 11248, 3514, 9945, 1278, 6715, 10316, 3262, 5079, 522, 2169, 7373, 7965, 6974, 8214, 7822, 7500, 6752,
-4749, 4449, 6833, 12142, 8500, 6171, 3818, 11099, 2683, 8429, 6844, 4536, 1050, 922, 441, 11231, 4079, 1112, 2078, 4322, 10331, 3150, 10970, 11580, 4046, 6065, 835, 3570, 4240, 64, 3656,
-2459, 683, 2503, 9341, 10723, 5782, 6803, 3054, 3123, 1747, 7032, 8455, 4433, 5919, 3329, 4298, 12121, 2692, 5961, 7183, 10327, 1594, 6122, 2555, 5184, 1200, 10929, 8333, 6119, 6992, 1404,
-325, 948, 11143, 11130, 5990, 4049, 8561, 4077, 4016, 9370, 3762, 652, 6523, 11994, 6190, 3009, 10596, 12115, 11566, 5735, 9634, 5868, 9551, 8974, 11863, 1858, 4754, 347, 2925, 8532, 1975,
-10232, 6920, 4493, 3202, 5383, 1815, 10777, 11939, 10806, 5915, 49, 1263, 5942, 10706, 9789, 10800, 5333, 2031, 10008, 6413, 8298, 3969, 2767, 12133, 10996, 7552, 5429, 7515, 3772, 418, 5908,
-11836, 9407, 10484, 10238, 10335, 576, 8326, 9842, 6147, 8174, 3030, 1843, 2361, 12071, 2908, 3529, 3434
-};
-
-// import external code
-#ifdef RLWE_ASM_AVX2
-    #include "AMD64/consts.c"
-    #include "AMD64/ntt_x64.c"
-#else
-    #include "generic/ntt.c"
-#endif
-
-__inline void clear_words(void* mem, digit_t nwords)
-{ // Clear digits from memory. "nwords" indicates the number of digits to be zeroed.
-  // This function uses the volatile type qualifier to inform the compiler not to optimize out the memory clearing.
-    unsigned int i;
-    volatile digit_t *v = mem; 
-
-    for (i = 0; i < nwords; i++) {
-        v[i] = 0;
-    }
-}
-
-
-CRYPTO_MSRLN_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction)
-{ // Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction.
-
-    pLatticeCrypto->RandomBytesFunction = RandomBytesFunction;
-    pLatticeCrypto->ExtendableOutputFunction = ExtendableOutputFunction;
-    pLatticeCrypto->StreamOutputFunction = StreamOutputFunction;
-
-    return CRYPTO_MSRLN_SUCCESS;
-}
-
-
-PLatticeCryptoStruct LatticeCrypto_allocate()
-{ // Dynamic allocation of memory for LatticeCrypto structure. It should be called before initialization with LatticeCrypto_initialize(). 
-  // Returns NULL on error.
-    PLatticeCryptoStruct LatticeCrypto = NULL;
-
-    LatticeCrypto = (PLatticeCryptoStruct)calloc(1, sizeof(LatticeCryptoStruct));
-
-    if (LatticeCrypto == NULL) {
-        return NULL;
-    }
-    return LatticeCrypto;
-}
-
-
-const char* LatticeCrypto_get_error_message(CRYPTO_MSRLN_STATUS Status)
-{ // Output error/success message for a given CRYPTO_STATUS
-    struct error_mapping {
-        unsigned int index;
-        char*        string;
-    } mapping[CRYPTO_STATUS_TYPE_SIZE] = {
-        {CRYPTO_MSRLN_SUCCESS, CRYPTO_MSG_SUCCESS},
-        {CRYPTO_MSRLN_ERROR, CRYPTO_MSG_ERROR},
-        {CRYPTO_MSRLN_ERROR_DURING_TEST, CRYPTO_MSG_ERROR_DURING_TEST},
-        {CRYPTO_MSRLN_ERROR_UNKNOWN, CRYPTO_MSG_ERROR_UNKNOWN},
-        {CRYPTO_MSRLN_ERROR_NOT_IMPLEMENTED, CRYPTO_MSG_ERROR_NOT_IMPLEMENTED},
-        {CRYPTO_MSRLN_ERROR_NO_MEMORY, CRYPTO_MSG_ERROR_NO_MEMORY},
-        {CRYPTO_MSRLN_ERROR_INVALID_PARAMETER, CRYPTO_MSG_ERROR_INVALID_PARAMETER},
-        {CRYPTO_MSRLN_ERROR_SHARED_KEY, CRYPTO_MSG_ERROR_SHARED_KEY},
-        {CRYPTO_MSRLN_ERROR_TOO_MANY_ITERATIONS, CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS}
-    };
-
-    if (Status >= CRYPTO_STATUS_TYPE_SIZE || mapping[Status].string == NULL) {
-        return "Unrecognized CRYPTO_STATUS";
-    } else {
-        return mapping[Status].string;
-    }
-};
-
-
-void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m)
-{ // Alice's message encoding
-    unsigned int i = 0, j;
-        
-#if defined(GENERIC_IMPLEMENTATION)
-    for (j = 0; j < 1024; j += 4) {        
-        m[i]   = (unsigned char)(pk[j] & 0xFF);
-        m[i+1] = (unsigned char)((pk[j] >> 8) | ((pk[j+1] & 0x03) << 6));
-        m[i+2] = (unsigned char)((pk[j+1] >> 2) & 0xFF);
-        m[i+3] = (unsigned char)((pk[j+1] >> 10) | ((pk[j+2] & 0x0F) << 4));
-        m[i+4] = (unsigned char)((pk[j+2] >> 4) & 0xFF);
-        m[i+5] = (unsigned char)((pk[j+2] >> 12) | ((pk[j+3] & 0x3F) << 2));
-        m[i+6] = (unsigned char)(pk[j+3] >> 6);
-        i += 7;
-    }
-    
-#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
-    encode_asm(pk, m);
-    i = 1792;
-#endif
-
-    for (j = 0; j < 32; j++) {
-        m[i+j] = seed[j];
-    }
-}
-
-
-void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed)
-{ // Alice's message decoding 
-    unsigned int i = 0, j;
-    
-#if defined(GENERIC_IMPLEMENTATION)
-    for (j = 0; j < 1024; j += 4) {        
-        pk[j]   = ((uint32_t)m[i] | (((uint32_t)m[i+1] & 0x3F) << 8));
-        pk[j+1] = (((uint32_t)m[i+1] >> 6) | ((uint32_t)m[i+2] << 2) | (((uint32_t)m[i+3] & 0x0F) << 10));
-        pk[j+2] = (((uint32_t)m[i+3] >> 4) | ((uint32_t)m[i+4] << 4) | (((uint32_t)m[i+5] & 0x03) << 12));
-        pk[j+3] = (((uint32_t)m[i+5] >> 2) | ((uint32_t)m[i+6] << 6));
-        i += 7;
-    }
-    
-#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
-    decode_asm(m, pk);
-    i = 1792;
-#endif
-
-    for (j = 0; j < 32; j++) {
-        seed[j] = m[i+j];
-    }
-}
-
-
-void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m)
-{ // Bob's message encoding
-    unsigned int i = 0, j;
-    
-#if defined(GENERIC_IMPLEMENTATION) 
-    for (j = 0; j < 1024; j += 4) {        
-        m[i]   = (unsigned char)(pk[j] & 0xFF);
-        m[i+1] = (unsigned char)((pk[j] >> 8) | ((pk[j+1] & 0x03) << 6));
-        m[i+2] = (unsigned char)((pk[j+1] >> 2) & 0xFF);
-        m[i+3] = (unsigned char)((pk[j+1] >> 10) | ((pk[j+2] & 0x0F) << 4));
-        m[i+4] = (unsigned char)((pk[j+2] >> 4) & 0xFF);
-        m[i+5] = (unsigned char)((pk[j+2] >> 12) | ((pk[j+3] & 0x3F) << 2));
-        m[i+6] = (unsigned char)(pk[j+3] >> 6);
-        i += 7;
-    }
-    
-#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
-    encode_asm(pk, m);
-#endif
-
-    i = 0;
-    for (j = 0; j < 1024/4; j++) {
-        m[1792+j] = (unsigned char)(rvec[i] | (rvec[i+1] << 2) | (rvec[i+2] << 4) | (rvec[i+3] << 6));
-        i += 4;
-    }
-}
-
-
-void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec)
-{ // Bob's message decoding
-    unsigned int i = 0, j;
-    
-#if defined(GENERIC_IMPLEMENTATION) 
-    for (j = 0; j < 1024; j += 4) {        
-        pk[j]   = ((uint32_t)m[i] | (((uint32_t)m[i+1] & 0x3F) << 8));
-        pk[j+1] = (((uint32_t)m[i+1] >> 6) | ((uint32_t)m[i+2] << 2) | (((uint32_t)m[i+3] & 0x0F) << 10));
-        pk[j+2] = (((uint32_t)m[i+3] >> 4) | ((uint32_t)m[i+4] << 4) | (((uint32_t)m[i+5] & 0x03) << 12));
-        pk[j+3] = (((uint32_t)m[i+5] >> 2) | ((uint32_t)m[i+6] << 6));
-        i += 7;
-    }
-    
-#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
-    decode_asm(m, pk);
-    i = 1792;
-#endif
-    
-    i = 0;
-    for (j = 0; j < 1024/4; j++) {
-        rvec[i]   = (uint32_t)(m[1792+j] & 0x03);
-        rvec[i+1] = (uint32_t)((m[1792+j] >> 2) & 0x03);
-        rvec[i+2] = (uint32_t)((m[1792+j] >> 4) & 0x03);
-        rvec[i+3] = (uint32_t)(m[1792+j] >> 6);
-        i += 4;
-    }
-}
-
-
-static __inline uint32_t Abs(int32_t value)
-{ // Compute absolute value
-    uint32_t mask;
-
-    mask = (uint32_t)(value >> 31);
-    return ((mask ^ value) - mask);
-}
-
-
-CRYPTO_MSRLN_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction)
-{ // Reconciliation helper
-    (void)seed; (void)nonce; (void)StreamOutputFunction;
-    unsigned int i, j, norm;
-    unsigned char bit, random_bits[32];
-    uint32_t v0[4], v1[4];
-
-    randombytes( random_bits, 32);
-    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_SUCCESS;
-
-#if defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT)         
-    helprec_asm(x, rvec, random_bits);
-#else   
-
-    for (i = 0; i < 256; i++) {
-        bit = 1 & (random_bits[i >> 3] >> (i & 0x07));
-        rvec[i]     = (x[i]     << 1) - bit;  
-        rvec[i+256] = (x[i+256] << 1) - bit;
-        rvec[i+512] = (x[i+512] << 1) - bit;
-        rvec[i+768] = (x[i+768] << 1) - bit; 
-
-        norm = 0;
-        v0[0] = 4; v0[1] = 4; v0[2] = 4; v0[3] = 4;
-        v1[0] = 3; v1[1] = 3; v1[2] = 3; v1[3] = 3; 
-        for (j = 0; j < 4; j++) {
-            v0[j] -= (rvec[i+256*j] - PARAMETER_Q4 ) >> 31;
-            v0[j] -= (rvec[i+256*j] - PARAMETER_3Q4) >> 31;
-            v0[j] -= (rvec[i+256*j] - PARAMETER_5Q4) >> 31;
-            v0[j] -= (rvec[i+256*j] - PARAMETER_7Q4) >> 31;
-            v1[j] -= (rvec[i+256*j] - PARAMETER_Q2 ) >> 31;
-            v1[j] -= (rvec[i+256*j] - PARAMETER_Q  ) >> 31;
-            v1[j] -= (rvec[i+256*j] - PARAMETER_3Q2) >> 31;
-            norm += Abs(2*rvec[i+256*j] - PARAMETER_Q*v0[j]);
-        }
-
-        norm = (uint32_t)((int32_t)(norm - PARAMETER_Q) >> 31);    // If norm < q then norm = 0xff...ff, else norm = 0
-        v0[0] = (norm & (v0[0] ^ v1[0])) ^ v1[0];
-        v0[1] = (norm & (v0[1] ^ v1[1])) ^ v1[1];
-        v0[2] = (norm & (v0[2] ^ v1[2])) ^ v1[2];
-        v0[3] = (norm & (v0[3] ^ v1[3])) ^ v1[3];
-        rvec[i]     = (v0[0] - v0[3]) & 0x03;
-        rvec[i+256] = (v0[1] - v0[3]) & 0x03;
-        rvec[i+512] = (v0[2] - v0[3]) & 0x03;
-        rvec[i+768] = ((v0[3] << 1) + (1 & ~norm)) & 0x03;
-    }
-#endif
-
-    return Status;
-}
-
-
-static __inline uint32_t LDDecode(int32_t* t)
-{ // Low-density decoding
-    unsigned int i, norm = 0;
-    uint32_t mask1, mask2, value;
-    int32_t cneg = -8*PARAMETER_Q;
-    
-    for (i = 0; i < 4; i++) { 
-        mask1 = t[i] >> 31;                                    // If t[i] < 0 then mask2 = 0xff...ff, else mask2 = 0
-        mask2 = (4*PARAMETER_Q - (int32_t)Abs(t[i])) >> 31;    // If 4*PARAMETER_Q > Abs(t[i]) then mask2 = 0, else mask2 = 0xff...ff
-
-        value = ((mask1 & (8*PARAMETER_Q ^ cneg)) ^ cneg);
-        norm += Abs(t[i] + (mask2 & value));
-    }
-
-    return ((8*PARAMETER_Q - norm) >> 31) ^ 1;                 // If norm < PARAMETER_Q then return 1, else return 0
-}
-
-
-void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key)               
-{ // Reconciliation
-
-#if defined(GENERIC_IMPLEMENTATION)
-    unsigned int i;
-    uint32_t t[4];
-
-    for (i = 0; i < 32; i++) {
-        key[i] = 0;
-    }
-    for (i = 0; i < 256; i++) {        
-        t[0] = 8*x[i]     - (2*rvec[i] + rvec[i+768]) * PARAMETER_Q;
-        t[1] = 8*x[i+256] - (2*rvec[i+256] + rvec[i+768]) * PARAMETER_Q;
-        t[2] = 8*x[i+512] - (2*rvec[i+512] + rvec[i+768]) * PARAMETER_Q;
-        t[3] = 8*x[i+768] - (rvec[i+768]) * PARAMETER_Q;
-      
-        key[i >> 3] |= (unsigned char)LDDecode((int32_t*)t) << (i & 0x07);
-    }
-    
-#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
-    rec_asm(x, rvec, key);
-#endif
-}
-
-
-CRYPTO_MSRLN_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction)
-{ // Error sampling
-    (void) seed; (void) nonce; (void) StreamOutputFunction;
-    unsigned char stream[3 * PARAMETER_N];
-    uint32_t *pstream = (uint32_t *) &stream;
-    uint32_t acc1, acc2, temp;
-    uint8_t *pacc1 = (uint8_t *) &acc1, *pacc2 = (uint8_t *) &acc2;
-    unsigned int i, j;
-
-    randombytes( stream, 3 * PARAMETER_N);
-
-#if defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT)
-    error_sampling_asm(stream, e);
-#else
-    for (i = 0; i < PARAMETER_N / 4; i++) {
-        acc1 = 0;
-        acc2 = 0;
-        for (j = 0; j < 8; j++) {
-            acc1 += (pstream[i] >> j) & 0x01010101;
-            acc2 += (pstream[i + PARAMETER_N / 4] >> j) & 0x01010101;
-        }
-        for (j = 0; j < 4; j++) {
-            temp = pstream[i + 2 * PARAMETER_N / 4] >> j;
-            acc1 += temp & 0x01010101;
-            acc2 += (temp >> 4) & 0x01010101;
-        }
-        e[2 * i] = pacc1[0] - pacc1[1];
-        e[2 * i + 1] = pacc1[2] - pacc1[3];
-        e[2 * i + PARAMETER_N / 2] = pacc2[0] - pacc2[1];
-        e[2 * i + PARAMETER_N / 2 + 1] = pacc2[2] - pacc2[3];
-    }
-#endif
-
-    return CRYPTO_MSRLN_SUCCESS;    
-}
-
-
-CRYPTO_MSRLN_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction)
-{ // Generation of parameter a
-    (void)ExtendableOutputFunction;
-    unsigned int pos = 0, ctr = 0;
-    uint16_t val;
-    unsigned int nblocks = 16;
-    uint8_t buf[SHAKE128_RATE * 16]; // was * nblocks, but VS doesn't like this buf init
-    //Keccak_HashInstance ks;
-
-    uint64_t state[SHA3_STATESIZE] = {0};
-    shake128_absorb(state, seed, SEED_BYTES);
-    shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
-
-    /*#ifdef _WIN32
-        SHAKE128_InitAbsorb( &ks, seed, SEED_BYTES );
-        KECCAK_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 );
-    #else 
-        Keccak_HashInitialize_SHAKE128(&ks);
-        Keccak_HashUpdate( &ks, seed, SEED_BYTES * 8 );
-        Keccak_HashFinal( &ks, seed );
-        Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );
-    //#endif
-    */
-    while (ctr < PARAMETER_N) {
-        val = (buf[pos] | ((uint16_t) buf[pos + 1] << 8)) & 0x3fff;
-        if (val < PARAMETER_Q) {
-            a[ctr++] = val;
-        }
-        pos += 2;
-        if (pos > SHAKE128_RATE * nblocks - 2) {
-            nblocks = 1;
-          shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
-//            Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );
-            pos = 0;
-        }
-    }
-
-    return CRYPTO_MSRLN_SUCCESS;    
-}
-
-
-CRYPTO_MSRLN_STATUS MSRLN_KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto)
-{ // Alice's key generation  
-  // It produces a private key SecretKeyA and computes the public key PublicKeyA.
-  // Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
-  //          the public key PublicKeyA that occupies 1824 bytes
-  // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
-    uint32_t a[PARAMETER_N];
-    int32_t e[PARAMETER_N];
-    unsigned char seed[SEED_BYTES];
-    unsigned char error_seed[ERROR_SEED_BYTES];
-    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_ERROR_UNKNOWN;
-
-    Status = randombytes( seed, SEED_BYTES);
-
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        return Status;
-    }   
-
-    Status = generate_a(a, seed, pLatticeCrypto->ExtendableOutputFunction);
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }
-
-    Status = get_error(SecretKeyA, error_seed, 0, pLatticeCrypto->StreamOutputFunction);  
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }
-    Status = get_error(e, error_seed, 1, pLatticeCrypto->StreamOutputFunction);   
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }
-    NTT_CT_std2rev_12289(SecretKeyA, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
-    NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
-    smul(e, 3, PARAMETER_N);
-
-    pmuladd((int32_t*)a, SecretKeyA, e, (int32_t*)a, PARAMETER_N); 
-    correction((int32_t*)a, PARAMETER_Q, PARAMETER_N);
-    encode_A(a, seed, PublicKeyA);
-    
-cleanup:
-    clear_words((void*)e, NBYTES_TO_NWORDS(4*PARAMETER_N));
-    clear_words((void*)error_seed, NBYTES_TO_NWORDS(ERROR_SEED_BYTES));
-
-    return Status;
-}
-
-
-CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto)
-{ // Bob's key generation and shared secret computation  
-  // It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes 
-  // the shared secret SharedSecretB.
-  // Input:   Alice's public key PublicKeyA that consists of 1824 bytes
-  // Outputs: the public key PublicKeyB that occupies 2048 bytes.
-  //          the 256-bit shared secret SharedSecretB.
-  // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
-    uint32_t pk_A[PARAMETER_N], a[PARAMETER_N], v[PARAMETER_N], r[PARAMETER_N];
-    int32_t sk_B[PARAMETER_N], e[PARAMETER_N];
-    unsigned char seed[SEED_BYTES], error_seed[ERROR_SEED_BYTES];
-    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_ERROR_UNKNOWN;
-
-    decode_A(PublicKeyA, pk_A, seed);
-
-    Status = generate_a(a, seed, pLatticeCrypto->ExtendableOutputFunction);
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }
-
-    Status = get_error(sk_B, error_seed, 0, pLatticeCrypto->StreamOutputFunction);  
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }
-    Status = get_error(e, error_seed, 1, pLatticeCrypto->StreamOutputFunction);
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }   
-    NTT_CT_std2rev_12289(sk_B, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
-    NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
-    smul(e, 3, PARAMETER_N);
-
-    pmuladd((int32_t*)a, sk_B, e, (int32_t*)a, PARAMETER_N); 
-    correction((int32_t*)a, PARAMETER_Q, PARAMETER_N);
-     
-    Status = get_error(e, error_seed, 2, pLatticeCrypto->StreamOutputFunction);  
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }   
-    NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
-    smul(e, 81, PARAMETER_N);
-    
-    pmuladd((int32_t*)pk_A, sk_B, e, (int32_t*)v, PARAMETER_N);    
-    INTT_GS_rev2std_12289((int32_t*)v, MSRLN_omegainv_rev_ntt1024_12289, MSRLN_omegainv10N_rev_ntt1024_12289, MSRLN_Ninv11_ntt1024_12289, PARAMETER_N);
-    two_reduce12289((int32_t*)v, PARAMETER_N);
-#if defined(GENERIC_IMPLEMENTATION)
-    correction((int32_t*)v, PARAMETER_Q, PARAMETER_N); 
-#endif
-
-    Status = HelpRec(v, r, error_seed, 3, pLatticeCrypto->StreamOutputFunction); 
-    if (Status != CRYPTO_MSRLN_SUCCESS) {
-        goto cleanup;
-    }   
-    Rec(v, r, SharedSecretB);
-    encode_B(a, r, PublicKeyB);
-    
-cleanup:
-    clear_words((void*)sk_B, NBYTES_TO_NWORDS(4*PARAMETER_N));
-    clear_words((void*)e, NBYTES_TO_NWORDS(4*PARAMETER_N));
-    clear_words((void*)error_seed, NBYTES_TO_NWORDS(ERROR_SEED_BYTES));
-    clear_words((void*)a, NBYTES_TO_NWORDS(4*PARAMETER_N));
-    clear_words((void*)v, NBYTES_TO_NWORDS(4*PARAMETER_N));
-    clear_words((void*)r, NBYTES_TO_NWORDS(4*PARAMETER_N));
-
-    return Status;
-}
-
-
-CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA)
-{ // Alice's shared secret computation  
-  // It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA.
-  // Inputs: Bob's public key PublicKeyB that consists of 2048 bytes
-  //         the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
-  // Output: the 256-bit shared secret SharedSecretA.
-  // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
-    uint32_t u[PARAMETER_N], r[PARAMETER_N];
-    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_SUCCESS;
-
-    decode_B(PublicKeyB, u, r);
-    
-    pmul(SecretKeyA, (int32_t*)u, (int32_t*)u, PARAMETER_N);       
-    INTT_GS_rev2std_12289((int32_t*)u, MSRLN_omegainv_rev_ntt1024_12289, MSRLN_omegainv10N_rev_ntt1024_12289, MSRLN_Ninv11_ntt1024_12289, PARAMETER_N);
-    two_reduce12289((int32_t*)u, PARAMETER_N);
-#if defined(GENERIC_IMPLEMENTATION)
-    correction((int32_t*)u, PARAMETER_Q, PARAMETER_N); 
-#endif
-
-    Rec(u, r, SharedSecretA);
-    
-// Cleanup
-    clear_words((void*)u, NBYTES_TO_NWORDS(4*PARAMETER_N));
-    clear_words((void*)r, NBYTES_TO_NWORDS(4*PARAMETER_N));
-
-    return Status;
-}
+#include <stdio.h>
+#include <stdlib.h>
+#include "msrln_priv.h"
+
+#include "KeccakHash.h"
+#include "SimpleFIPS202.h"
+
+
+// N^-1 * prime_scale^-8
+const int32_t MSRLN_Ninv8_ntt1024_12289 = 8350;
+// N^-1 * prime_scale^-7 * omegainv_rev_ntt1024_12289[1]
+const int32_t MSRLN_omegainv7N_rev_ntt1024_12289 = 795;
+// N^-1 * prime_scale^-11
+const int32_t MSRLN_Ninv11_ntt1024_12289 = 2585;
+// N^-1 * prime_scale^-10 * omegainv_rev_ntt1024_12289[1]
+const int32_t MSRLN_omegainv10N_rev_ntt1024_12289 = 10953;
+
+
+// Index-reversed matrices containing powers of psi (psi_rev_nttxxx_yyy) and inverse powers of omega (omegainv_rev_nttxxx_yyy),
+// where xxx is parameter N and yyy is the prime q.
+
+const int32_t MSRLN_psi_rev_ntt1024_12289[1024] = {
+8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201,
+875, 3780, 1607, 4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859,
+7188, 1067, 2401, 11847, 390, 11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626,
+3636, 7351, 9585, 6998, 160, 3149, 4437, 12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042,
+3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178, 1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563,
+7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790, 2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266,
+5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810, 1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934,
+8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863, 10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842,
+11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893, 7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541,
+11336, 3434, 3529, 2908, 12071, 2361, 1843, 3030, 8174, 6147, 9842, 8326, 576, 10335, 10238, 10484, 9407, 11836, 5908, 418, 3772, 7515, 5429, 7552, 10996, 12133, 2767, 3969,
+8298, 6413, 10008, 2031, 5333, 10800, 9789, 10706, 5942, 1263, 49, 5915, 10806, 11939, 10777, 1815, 5383, 3202, 4493, 6920, 10232, 1975, 8532, 2925, 347, 4754, 1858, 11863,
+8974, 9551, 5868, 9634, 5735,11566, 12115, 10596, 3009, 6190, 11994, 6523, 652, 3762, 9370, 4016, 4077, 8561, 4049, 5990, 11130, 11143, 948, 325, 1404, 6992, 6119, 8333,
+10929, 1200, 5184, 2555, 6122, 1594, 10327, 7183, 5961, 2692, 12121, 4298, 3329, 5919, 4433, 8455,7032, 1747, 3123, 3054, 6803, 5782, 10723, 9341, 2503, 683, 2459, 3656,
+64, 4240, 3570, 835, 6065, 4046, 11580, 10970, 3150, 10331, 4322, 2078, 1112, 4079, 11231, 441, 922, 1050, 4536, 6844, 8429, 2683, 11099, 3818, 6171, 8500, 12142, 6833, 4449,
+4749, 6752, 7500, 7822, 8214, 6974, 7965, 7373, 2169, 522, 5079, 3262, 10316, 6715, 1278, 9945, 3514, 11248, 11271, 5925, 468, 3988, 382, 11973, 5339, 6843, 6196, 8579, 2033,
+8291, 1922, 3879, 11035, 973, 6854, 10930, 5206, 6760, 3199, 56, 3565, 654, 1702, 10302, 5862, 6153, 5415, 8646, 11889, 10561, 7341, 6152, 7232, 4698, 8844, 4780, 10240, 4912,
+1321, 12097, 7048, 2920, 3127, 4169, 11502, 3482, 11279, 5468, 5874, 11612, 6055, 8953, 52, 3174, 10966, 9523, 151, 2127, 3957, 2839, 9784, 6383, 1579, 431, 7507, 5886, 3029,
+6695, 4213, 504, 11684, 2302, 8689, 9026, 4624, 6212, 11868, 4080, 6221, 8687, 1003, 8757, 241, 58, 5009, 10333, 885, 6281, 3438, 9445, 11314, 8077, 6608, 3477, 142, 1105,
+8841, 343, 4538, 1908, 1208, 4727, 7078, 10423, 10125, 6873, 11573, 10179, 416, 814, 1705, 2450, 8700, 717, 9307, 1373, 8186, 2429, 10568, 10753, 7228, 11071, 438, 8774, 5993,
+3278, 4209, 6877, 3449, 1136, 3708, 3238, 2926, 1826, 4489, 3171, 8024, 8611, 1928, 464, 3205, 8930, 7080, 1092, 10900, 10221, 11943, 4404, 9126, 4032, 7449, 6127, 8067, 10763,
+125, 540, 8921, 8062, 612, 8051, 12229, 9572, 9089, 10754, 10029, 68, 6453, 7723, 4781, 4924, 1014, 448, 3942, 5232, 1327, 8682, 3744, 7326, 3056, 9761, 5845, 5588, 412, 7187,
+3975, 4883, 3087, 6454, 2257, 7784, 5676, 1417, 8400, 11710, 5596, 5987, 9175, 2769, 5966, 212, 6555, 11113, 5508, 11014, 1125, 4860, 10844, 1131, 4267, 6636, 2275, 9828, 5063,
+4176, 3765, 1518, 8794, 4564, 10224, 5826, 3534, 3961, 4145, 10533, 506, 11034, 6505, 10897, 2674, 10077, 3338, 9013, 3511, 6811, 11111, 2776, 1165, 2575, 8881, 10347, 377,
+4578, 11914, 10669, 10104, 392, 10453, 425, 9489, 193, 2231, 6197, 1038, 11366, 6204, 8122, 2894, 3654, 10975, 10545, 6599, 2455, 11951, 3947, 20, 5002, 5163, 4608, 8946, 8170,
+10138, 1522, 8665, 10397, 3344, 5598, 10964, 6565, 11260, 1945, 11041, 9847, 7174, 4939, 2148, 6330, 3959, 5797, 4913, 3528, 8054, 3825, 8914, 9998, 4335, 8896, 9342, 3982,
+6680, 11653, 7790, 6617, 1737, 622, 10485, 10886, 6195, 7100, 1687, 406, 12143, 5268, 9389, 12050, 994, 7735, 5464, 7383, 4670, 512, 364, 9929, 3028, 5216, 5518, 1226, 7550,
+8038, 7043, 7814, 11053, 3017, 3121, 7584, 2600, 11232, 6780, 12085, 5219, 1409, 9600, 4605, 8151, 12109, 463, 8882, 8308, 10821, 9247, 10945, 9806, 2054, 6203, 6643, 3120,
+6105, 8348, 8536, 6919, 8753, 11007, 8717, 9457, 2021, 9060, 4730, 3929, 10583, 3723, 845, 1936, 7, 5054, 3154, 3285, 4360, 3805, 11522, 2213, 4153, 12239, 12073, 5526, 769,
+4099, 3944, 5604, 5530, 11024, 9282, 2171, 3480, 7434, 8520, 3232, 11996, 9656, 1406, 2945, 5349, 7207, 4590, 11607, 11309, 5202, 844, 7082, 4050, 8016, 9068, 9694, 8452, 7000,
+5662, 567, 2941, 8619, 3808, 4987, 2373, 5135, 63, 7605, 3360, 11839, 10345, 578, 6921, 7628, 510, 5386, 2622, 7806, 5703, 10783, 9224, 11379, 5900, 4719, 11538, 3502, 5789,
+10631, 5618, 826, 5043, 3090, 10891, 9951, 7596, 2293, 11872, 6151, 3469, 4443, 8871, 1555, 1802, 5103, 1891, 1223, 2334, 7878, 1590, 881, 365, 1927, 11274, 4510, 9652, 2946,
+6828, 1280, 614, 10918, 12265, 7250, 6742, 9804, 11385, 2276, 11307, 2593, 879, 7899, 8071, 3454, 8531, 3795, 9021, 5776, 1849, 7766, 7988, 457, 8, 530, 9663, 7785, 11511, 3578,
+7592, 10588, 3466, 8972, 9757, 3332, 139, 2046, 2940, 10808, 9332, 874, 2301, 5650, 12119, 150, 648, 8000, 9982, 9416, 2827, 2434, 11498, 6481, 12268, 9754, 11169, 11823, 11259,
+3821, 10608, 2929, 6263, 4649, 6320, 9687, 10388, 502, 5118, 8496, 6226, 10716, 8443, 7624, 6883, 9269, 6616, 8620, 5287, 944, 7519, 6125, 1882, 11249, 10254, 5410, 1251, 1790,
+5275, 8449, 10447, 4113, 72, 2828, 4352, 7455, 2712, 11048, 7911, 3451, 4094, 6508, 3045, 11194, 2643, 1783, 7211, 4974, 7724, 9811, 9449, 3019, 4194, 2730, 6878, 10421, 2253,
+4518, 9195, 7469, 11129, 9173, 12100, 1763, 2209, 9617, 5170, 865, 1279, 1694, 10759, 8420, 4423, 10555, 3815, 5832, 10939
+};
+
+
+const int32_t MSRLN_omegainv_rev_ntt1024_12289[1024] = {
+8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422,
+6267, 9302, 8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270,
+2678, 8585, 10752, 12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957,
+8779, 1630, 10163, 5407, 3186, 11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372,
+10115, 2847, 4414, 9644, 4053, 7247, 9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300,
+5331, 8705, 4177, 9764, 10908, 11950, 9821, 11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534,
+145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548, 4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567,
+6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023, 2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170,
+10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255, 11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 11184, 12147, 8812, 5681, 4212, 975, 2844, 8851, 6008, 11404,
+1956, 7280, 12231, 12048, 3532, 11286, 3602, 6068, 8209, 421, 6077, 7665, 3263, 3600, 9987, 605, 11785, 8076, 5594, 9260, 6403, 4782, 11858, 10710, 5906, 2505, 9450, 8332, 10162,
+12138, 2766, 1323, 9115, 12237, 3336, 6234, 677, 6415, 6821, 1010, 8807, 787, 8120, 9162, 9369, 5241, 192, 10968, 7377, 2049, 7509, 3445, 7591, 5057, 6137, 4948, 1728, 400, 3643,
+6874, 6136, 6427, 1987, 10587, 11635, 8724, 12233, 9090, 5529, 7083, 1359, 5435, 11316, 1254, 8410, 10367, 3998, 10256, 3710, 6093, 5446, 6950, 316, 11907, 8301, 11821, 6364, 1018,
+1041, 8775, 2344, 11011, 5574, 1973, 9027, 7210, 11767, 10120, 4916, 4324, 5315, 4075, 4467, 4789, 5537, 7540, 7840, 5456, 147, 3789, 6118, 8471, 1190, 9606, 3860, 5445, 7753, 11239,
+11367, 11848, 1058, 8210, 11177, 10211, 7967, 1958, 9139, 1319, 709, 8243, 6224, 11454, 8719, 8049, 12225, 8633, 9830, 11606, 9786, 2948, 1566, 6507, 5486, 9235, 9166, 10542, 5257,
+3834, 7856, 6370, 8960, 7991, 168, 9597, 6328, 5106, 1962, 10695, 6167, 9734, 7105, 11089, 1360, 3956, 6170, 5297, 10885, 11964, 11341, 1146, 1159, 6299, 8240, 3728, 8212, 8273, 2919,
+8527, 11637, 5766, 295, 6099, 9280, 1693, 174, 723, 6554, 2655, 6421, 2738, 3315, 426, 10431, 7535, 11942, 9364, 3757, 10314, 2057, 5369, 7796, 9087, 6906, 10474, 1512, 350, 1483,
+6374, 12240, 11026, 6347, 1583, 2500, 1489, 6956, 10258, 2281, 5876, 3991, 8320, 9522, 156, 1293, 4737, 6860, 4774, 8517, 11871, 6381, 453, 2882, 1805, 2051, 1954, 11713, 3963, 2447,
+6142, 4115, 9259, 10446, 9928, 218, 9381, 8760, 8855, 1350, 6457, 8474, 1734, 7866, 3869, 1530, 10595, 11010, 11424, 7119, 2672, 10080, 10526, 189, 3116, 1160, 4820, 3094, 7771, 10036,
+1868, 5411, 9559, 8095, 9270, 2840, 2478, 4565, 7315, 5078, 10506, 9646, 1095, 9244, 5781, 8195, 8838, 4378, 1241, 9577, 4834, 7937, 9461, 12217, 8176, 1842, 3840, 7014, 10499, 11038,
+6879, 2035, 1040, 10407, 6164, 4770, 11345, 7002, 3669, 5673, 3020, 5406, 4665, 3846, 1573, 6063, 3793, 7171, 11787, 1901, 2602, 5969, 7640, 6026, 9360, 1681, 8468, 1030, 466, 1120,
+2535, 21, 5808, 791, 9855, 9462, 2873, 2307, 4289, 11641, 12139, 170, 6639, 9988, 11415, 2957, 1481, 9349, 10243, 12150, 8957, 2532, 3317, 8823, 1701, 4697, 8711, 778, 4504, 2626,
+11759, 12281, 11832, 4301, 4523, 10440, 6513, 3268, 8494, 3758, 8835, 4218, 4390, 11410, 9696, 982, 10013, 904, 2485, 5547, 5039, 24, 1371, 11675, 11009, 5461, 9343, 2637, 7779, 1015,
+10362, 11924, 11408, 10699, 4411, 9955, 11066, 10398, 7186, 10487, 10734, 3418, 7846, 8820, 6138, 417, 9996, 4693, 2338, 1398, 9199, 7246, 11463, 6671, 1658, 6500, 8787, 751, 7570,
+6389, 910, 3065, 1506, 6586, 4483, 9667, 6903, 11779, 4661, 5368, 11711, 1944, 450, 8929, 4684, 12226, 7154, 9916, 7302, 8481, 3670, 9348, 11722, 6627, 5289, 3837, 2595, 3221, 4273,
+8239, 5207, 11445, 7087, 980, 682, 7699, 5082, 6940, 9344, 10883, 2633, 293, 9057, 3769, 4855, 8809, 10118, 3007, 1265, 6759, 6685, 8345, 8190, 11520, 6763, 216, 50, 8136, 10076, 767,
+8484, 7929, 9004, 9135, 7235, 12282, 10353, 11444, 8566, 1706, 8360, 7559, 3229, 10268, 2832, 3572, 1282, 3536, 5370, 3753, 3941, 6184, 9169, 5646, 6086, 10235, 2483, 1344, 3042, 1468,
+3981, 3407, 11826, 180, 4138, 7684, 2689, 10880, 7070, 204, 5509, 1057, 9689, 4705, 9168, 9272, 1236, 4475, 5246, 4251, 4739, 11063, 6771, 7073, 9261, 2360, 11925, 11777, 7619, 4906,
+6825, 4554, 11295, 239, 2900, 7021, 146, 11883, 10602, 5189, 6094, 1403, 1804, 11667, 10552, 5672, 4499, 636, 5609, 8307, 2947, 3393, 7954, 2291, 3375, 8464, 4235, 8761, 7376, 6492,
+8330, 5959, 10141, 7350, 5115, 2442, 1248, 10344, 1029, 5724, 1325, 6691, 8945, 1892, 3624, 10767, 2151, 4119, 3343, 7681, 7126, 7287, 12269, 8342, 338, 9834, 5690, 1744, 1314, 8635,
+9395, 4167, 6085, 923, 11251, 6092, 10058, 12096, 2800, 11864, 1836, 11897, 2185, 1620, 375, 7711, 11912, 1942, 3408, 9714, 11124, 9513, 1178, 5478, 8778, 3276, 8951, 2212, 9615, 1392,
+5784, 1255, 11783, 1756, 8144, 8328, 8755, 6463, 2065, 7725, 3495, 10771, 8524, 8113, 7226, 2461, 10014, 5653, 8022, 11158, 1445, 7429, 11164, 1275, 6781, 1176, 5734, 12077, 6323, 9520,
+3114, 6302, 6693, 579, 3889, 10872, 6613, 4505, 10032, 5835, 9202, 7406, 8314, 5102, 11877, 6701, 6444, 2528, 9233, 4963, 8545, 3607, 10962, 7057, 8347, 11841, 11275, 7365, 7508, 4566,
+5836, 12221, 2260, 1535, 3200, 2717, 60, 4238, 11677, 4227, 3368, 11749, 12164, 1526, 4222, 6162, 4840, 8257, 3163, 7885, 346, 2068, 1389, 11197, 5209, 3359, 9084, 11825, 10361, 3678,
+4265, 9118, 7800, 10463, 9363, 9051, 8581, 11153, 8840, 5412, 8080, 9011, 6296, 3515, 11851, 1218, 5061, 1536, 1721, 9860, 4103, 10916, 2982, 11572, 3589, 9839, 10584, 11475, 11873,
+2110, 716, 5416, 2164, 1866, 5211, 7562, 11081, 10381, 7751, 11946, 3448
+};
+
+
+const int32_t MSRLN_psi_rev_ntt512_12289[512] = {
+8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201, 875, 3780, 1607,
+4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859, 7188, 1067, 2401, 11847, 390,
+11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626, 3636, 7351, 9585, 6998, 160, 3149, 4437,
+12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042, 3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178,
+1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563, 7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790,
+2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266, 5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810,
+1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934, 8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863,
+10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842, 11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893,
+7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541, 11336, 8855, 8760, 9381, 218, 9928, 10446, 9259, 4115, 6142, 2447, 3963, 11713, 1954, 2051, 1805, 2882, 453, 6381, 11871, 8517,
+4774, 6860, 4737, 1293, 156, 9522, 8320, 3991, 5876, 2281, 10258, 6956, 1489, 2500, 1583, 6347, 11026, 12240, 6374, 1483, 350, 1512, 10474, 6906, 9087, 7796, 5369, 2057, 10314, 3757,
+9364, 11942, 7535, 10431, 426, 3315, 2738, 6421, 2655, 6554, 723, 174, 1693, 9280, 6099, 295, 5766, 11637, 8527, 2919, 8273, 8212, 3728, 8240, 6299, 1159, 1146, 11341, 11964, 10885, 5297,
+6170, 3956, 1360, 11089, 7105, 9734, 6167, 10695, 1962, 5106, 6328, 9597, 168, 7991, 8960, 6370, 7856, 3834, 5257, 10542, 9166, 9235, 5486, 6507, 1566, 2948, 9786, 11606, 9830, 8633,
+12225, 8049, 8719, 11454, 6224, 8243, 709, 1319, 9139, 1958, 7967, 10211, 11177, 8210, 1058, 11848, 11367, 11239, 7753, 5445, 3860, 9606, 1190, 8471, 6118, 3789, 147, 5456, 7840, 7540,
+5537, 4789, 4467, 4075, 5315, 4324, 4916, 10120, 11767, 7210, 9027, 1973, 5574, 11011, 2344, 8775, 1041, 1018, 6364, 11821, 8301, 11907, 316, 6950, 5446, 6093, 3710, 10256, 3998, 10367,
+8410, 1254, 11316, 5435, 1359, 7083, 5529, 9090, 12233, 8724, 11635, 10587, 1987, 6427, 6136, 6874, 3643, 400, 1728, 4948, 6137, 5057, 7591, 3445, 7509, 2049, 7377, 10968, 192, 5241, 9369,
+9162, 8120, 787, 8807, 1010, 6821, 6415, 677, 6234, 3336, 12237, 9115, 1323, 2766, 12138, 10162, 8332, 9450, 2505, 5906, 10710, 11858, 4782, 6403, 9260, 5594, 8076, 11785, 605, 9987, 3600,
+3263, 7665, 6077, 421, 8209, 6068, 3602, 11286, 3532, 12048, 12231, 7280, 1956, 11404, 6008, 8851, 2844, 975, 4212, 5681, 8812, 12147, 11184
+};
+
+
+const int32_t MSRLN_omegainv_rev_ntt512_12289[512] = {
+8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422, 6267, 9302,
+8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270, 2678, 8585, 10752,
+12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957, 8779, 1630, 10163, 5407, 3186,
+11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372, 10115, 2847, 4414, 9644, 4053, 7247,
+9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300, 5331, 8705, 4177, 9764, 10908, 11950, 9821,
+11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534, 145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548,
+4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567, 6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023,
+2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170, 10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255,
+11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 1105, 142, 3477, 6608, 8077, 11314, 9445, 3438, 6281, 885, 10333, 5009, 58, 241, 8757, 1003, 8687, 6221, 4080, 11868, 6212, 4624,
+9026, 8689, 2302, 11684, 504, 4213, 6695, 3029, 5886, 7507, 431, 1579, 6383, 9784, 2839, 3957, 2127, 151, 9523, 10966, 3174, 52, 8953, 6055, 11612, 5874, 5468, 11279, 3482, 11502, 4169,
+3127, 2920, 7048, 12097, 1321, 4912, 10240, 4780, 8844, 4698, 7232, 6152, 7341, 10561, 11889, 8646, 5415, 6153, 5862, 10302, 1702, 654, 3565, 56, 3199, 6760, 5206, 10930, 6854, 973, 11035,
+3879, 1922, 8291, 2033, 8579, 6196, 6843, 5339, 11973, 382, 3988, 468, 5925, 11271, 11248, 3514, 9945, 1278, 6715, 10316, 3262, 5079, 522, 2169, 7373, 7965, 6974, 8214, 7822, 7500, 6752,
+4749, 4449, 6833, 12142, 8500, 6171, 3818, 11099, 2683, 8429, 6844, 4536, 1050, 922, 441, 11231, 4079, 1112, 2078, 4322, 10331, 3150, 10970, 11580, 4046, 6065, 835, 3570, 4240, 64, 3656,
+2459, 683, 2503, 9341, 10723, 5782, 6803, 3054, 3123, 1747, 7032, 8455, 4433, 5919, 3329, 4298, 12121, 2692, 5961, 7183, 10327, 1594, 6122, 2555, 5184, 1200, 10929, 8333, 6119, 6992, 1404,
+325, 948, 11143, 11130, 5990, 4049, 8561, 4077, 4016, 9370, 3762, 652, 6523, 11994, 6190, 3009, 10596, 12115, 11566, 5735, 9634, 5868, 9551, 8974, 11863, 1858, 4754, 347, 2925, 8532, 1975,
+10232, 6920, 4493, 3202, 5383, 1815, 10777, 11939, 10806, 5915, 49, 1263, 5942, 10706, 9789, 10800, 5333, 2031, 10008, 6413, 8298, 3969, 2767, 12133, 10996, 7552, 5429, 7515, 3772, 418, 5908,
+11836, 9407, 10484, 10238, 10335, 576, 8326, 9842, 6147, 8174, 3030, 1843, 2361, 12071, 2908, 3529, 3434
+};
+
+// import external code
+#ifdef RLWE_ASM_AVX2
+    #include "AMD64/consts.c"
+    #include "AMD64/ntt_x64.c"
+#else
+    #include "generic/ntt.c"
+#endif
+
+__inline void clear_words(void* mem, digit_t nwords)
+{ // Zero the first "nwords" digit_t-sized words at "mem"; the buffer must therefore hold at least nwords * sizeof(digit_t) bytes.
+  // The stores go through a volatile-qualified pointer to inform the compiler not to optimize out the memory clearing.
+    unsigned int i;
+    volatile digit_t *v = mem; 
+
+    for (i = 0; i < nwords; i++) {
+        v[i] = 0;
+    }
+}
+
+
+CRYPTO_MSRLN_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction)
+{ // Store the user-provided callbacks RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction in pLatticeCrypto and return CRYPTO_MSRLN_SUCCESS. pLatticeCrypto is dereferenced without a NULL check, so it must point to a valid structure (e.g. one obtained from LatticeCrypto_allocate()).
+
+    pLatticeCrypto->RandomBytesFunction = RandomBytesFunction;
+    pLatticeCrypto->ExtendableOutputFunction = ExtendableOutputFunction;
+    pLatticeCrypto->StreamOutputFunction = StreamOutputFunction;
+
+    return CRYPTO_MSRLN_SUCCESS;
+}
+
+
+PLatticeCryptoStruct LatticeCrypto_allocate()
+{ // Allocate one zero-initialized LatticeCryptoStruct on the heap with calloc(). It should be called before initialization with LatticeCrypto_initialize().
+  // Returns the new structure, or NULL if calloc() fails. NOTE(review): presumably the caller releases it with free() - confirm.
+    PLatticeCryptoStruct LatticeCrypto = NULL;
+
+    LatticeCrypto = (PLatticeCryptoStruct)calloc(1, sizeof(LatticeCryptoStruct));
+
+    if (LatticeCrypto == NULL) {
+        return NULL;
+    }
+    return LatticeCrypto;
+}
+
+
+const char* LatticeCrypto_get_error_message(CRYPTO_MSRLN_STATUS Status)
+{ // Return the message string for Status, or "Unrecognized CRYPTO_STATUS" when Status is out of range (including negative) or has no table entry.
+    const struct error_mapping {   // The table is looked up directly by status value; "index" only records which status each row belongs to.
+        unsigned int index;
+        const char*  string;
+    } mapping[CRYPTO_STATUS_TYPE_SIZE] = {
+        {CRYPTO_MSRLN_SUCCESS, CRYPTO_MSG_SUCCESS},
+        {CRYPTO_MSRLN_ERROR, CRYPTO_MSG_ERROR},
+        {CRYPTO_MSRLN_ERROR_DURING_TEST, CRYPTO_MSG_ERROR_DURING_TEST},
+        {CRYPTO_MSRLN_ERROR_UNKNOWN, CRYPTO_MSG_ERROR_UNKNOWN},
+        {CRYPTO_MSRLN_ERROR_NOT_IMPLEMENTED, CRYPTO_MSG_ERROR_NOT_IMPLEMENTED},
+        {CRYPTO_MSRLN_ERROR_NO_MEMORY, CRYPTO_MSG_ERROR_NO_MEMORY},
+        {CRYPTO_MSRLN_ERROR_INVALID_PARAMETER, CRYPTO_MSG_ERROR_INVALID_PARAMETER},
+        {CRYPTO_MSRLN_ERROR_SHARED_KEY, CRYPTO_MSG_ERROR_SHARED_KEY},
+        {CRYPTO_MSRLN_ERROR_TOO_MANY_ITERATIONS, CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS}
+    };
+
+    if ((unsigned int)Status >= (unsigned int)CRYPTO_STATUS_TYPE_SIZE || mapping[Status].string == NULL) {   // The unsigned cast also rejects negative values when the enum type is signed.
+        return "Unrecognized CRYPTO_STATUS";
+    } else {
+        return mapping[Status].string;
+    }
+}
+
+
+void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m)
+{ // Alice's message encoding: writes the packed form of the 1024 entries of pk to m[0..1791] (generic path: 4 entries per 7 bytes, i.e. 14 bits per entry, so entries are expected to fit in 14 bits), then copies seed[0..31] to m[1792..1823]. If neither implementation macro is defined, i stays 0 and only the seed is written, at m[0..31].
+    unsigned int i = 0, j;
+        
+#if defined(GENERIC_IMPLEMENTATION)
+    for (j = 0; j < 1024; j += 4) {        
+        m[i]   = (unsigned char)(pk[j] & 0xFF);
+        m[i+1] = (unsigned char)((pk[j] >> 8) | ((pk[j+1] & 0x03) << 6));
+        m[i+2] = (unsigned char)((pk[j+1] >> 2) & 0xFF);
+        m[i+3] = (unsigned char)((pk[j+1] >> 10) | ((pk[j+2] & 0x0F) << 4));
+        m[i+4] = (unsigned char)((pk[j+2] >> 4) & 0xFF);
+        m[i+5] = (unsigned char)((pk[j+2] >> 12) | ((pk[j+3] & 0x3F) << 2));
+        m[i+6] = (unsigned char)(pk[j+3] >> 6);
+        i += 7;
+    }
+    
+#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
+    encode_asm(pk, m);
+    i = 1792;
+#endif
+
+    for (j = 0; j < 32; j++) {
+        m[i+j] = seed[j];
+    }
+}
+
+
+void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed)
+{ // Alice's message decoding: unpacks m[0..1791] into the 1024 entries of pk (generic path: 14 bits per entry, 4 entries per 7 bytes, mirroring the layout written by encode_A), then copies m[1792..1823] to seed[0..31]. If neither implementation macro is defined, i stays 0, pk is left untouched and the seed is read from m[0..31].
+    unsigned int i = 0, j;
+    
+#if defined(GENERIC_IMPLEMENTATION)
+    for (j = 0; j < 1024; j += 4) {        
+        pk[j]   = ((uint32_t)m[i] | (((uint32_t)m[i+1] & 0x3F) << 8));
+        pk[j+1] = (((uint32_t)m[i+1] >> 6) | ((uint32_t)m[i+2] << 2) | (((uint32_t)m[i+3] & 0x0F) << 10));
+        pk[j+2] = (((uint32_t)m[i+3] >> 4) | ((uint32_t)m[i+4] << 4) | (((uint32_t)m[i+5] & 0x03) << 12));
+        pk[j+3] = (((uint32_t)m[i+5] >> 2) | ((uint32_t)m[i+6] << 6));
+        i += 7;
+    }
+    
+#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
+    decode_asm(m, pk);
+    i = 1792;
+#endif
+
+    for (j = 0; j < 32; j++) {
+        seed[j] = m[i+j];
+    }
+}
+
+
+void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m)
+{ // Bob's message encoding: writes the packed form of the 1024 entries of pk to m[0..1791] (generic path: 14 bits per entry, 4 entries per 7 bytes), then packs the 1024 entries of rvec four per byte (2 bits each, lowest index in the lowest bits) into m[1792..2047]. The rvec entries are not masked before packing, so each is presumably already in the range 0..3 - TODO confirm against the caller.
+    unsigned int i = 0, j;
+    
+#if defined(GENERIC_IMPLEMENTATION) 
+    for (j = 0; j < 1024; j += 4) {        
+        m[i]   = (unsigned char)(pk[j] & 0xFF);
+        m[i+1] = (unsigned char)((pk[j] >> 8) | ((pk[j+1] & 0x03) << 6));
+        m[i+2] = (unsigned char)((pk[j+1] >> 2) & 0xFF);
+        m[i+3] = (unsigned char)((pk[j+1] >> 10) | ((pk[j+2] & 0x0F) << 4));
+        m[i+4] = (unsigned char)((pk[j+2] >> 4) & 0xFF);
+        m[i+5] = (unsigned char)((pk[j+2] >> 12) | ((pk[j+3] & 0x3F) << 2));
+        m[i+6] = (unsigned char)(pk[j+3] >> 6);
+        i += 7;
+    }
+    
+#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
+    encode_asm(pk, m);
+#endif
+
+    i = 0;
+    for (j = 0; j < 1024/4; j++) {
+        m[1792+j] = (unsigned char)(rvec[i] | (rvec[i+1] << 2) | (rvec[i+2] << 4) | (rvec[i+3] << 6));
+        i += 4;
+    }
+}
+
+
+void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec)
+{ // Bob's message decoding: unpacks m[0..1791] into the 1024 entries of pk (generic path: 14 bits per entry, 4 entries per 7 bytes), then splits each byte of m[1792..2047] into four 2-bit values stored in consecutive rvec entries, lowest bits first. The "i = 1792" in the AVX2 branch has no effect because i is reset to 0 before the rvec loop.
+    unsigned int i = 0, j;
+    
+#if defined(GENERIC_IMPLEMENTATION) 
+    for (j = 0; j < 1024; j += 4) {        
+        pk[j]   = ((uint32_t)m[i] | (((uint32_t)m[i+1] & 0x3F) << 8));
+        pk[j+1] = (((uint32_t)m[i+1] >> 6) | ((uint32_t)m[i+2] << 2) | (((uint32_t)m[i+3] & 0x0F) << 10));
+        pk[j+2] = (((uint32_t)m[i+3] >> 4) | ((uint32_t)m[i+4] << 4) | (((uint32_t)m[i+5] & 0x03) << 12));
+        pk[j+3] = (((uint32_t)m[i+5] >> 2) | ((uint32_t)m[i+6] << 6));
+        i += 7;
+    }
+    
+#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
+    decode_asm(m, pk);
+    i = 1792;
+#endif
+    
+    i = 0;
+    for (j = 0; j < 1024/4; j++) {
+        rvec[i]   = (uint32_t)(m[1792+j] & 0x03);
+        rvec[i+1] = (uint32_t)((m[1792+j] >> 2) & 0x03);
+        rvec[i+2] = (uint32_t)((m[1792+j] >> 4) & 0x03);
+        rvec[i+3] = (uint32_t)(m[1792+j] >> 6);
+        i += 4;
+    }
+}
+
+
+static __inline uint32_t Abs(int32_t value)
+{ // Absolute value of a 32-bit signed integer, computed without branching
+    // sign is all ones for a negative input and zero otherwise (relies on >> of a negative value keeping the sign bit)
+    const uint32_t sign = (uint32_t)(value >> 31);
+    const uint32_t flipped = sign ^ (uint32_t)value;    // bitwise complement when negative, unchanged otherwise
+    return (flipped - sign);                             // subtracting all-ones adds 1, completing the negation
+}
+
+
+CRYPTO_MSRLN_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction)
+{ // Reconciliation helper: fills rvec[0..1023] with 2-bit values computed from x[0..1023] and 256 fresh random bits
+    (void)seed; (void)nonce; (void)StreamOutputFunction;    // unused: the random bits come from randombytes()
+    unsigned int i, j, norm;
+    unsigned char bit, random_bits[32];
+    uint32_t v0[4], v1[4];
+    // Returns the randombytes() status on failure (rvec is then left untouched), CRYPTO_MSRLN_SUCCESS otherwise
+    CRYPTO_MSRLN_STATUS Status = randombytes(random_bits, 32);
+    if (Status != CRYPTO_MSRLN_SUCCESS) return Status;    // same check as MSRLN_KeyGeneration_A: never use an unfilled random buffer
+
+#if defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT)         
+    helprec_asm(x, rvec, random_bits);
+#else   
+    // Coefficients are handled in groups of four (i, i+256, i+512, i+768); each group consumes one random bit
+    for (i = 0; i < 256; i++) {
+        bit = 1 & (random_bits[i >> 3] >> (i & 0x07));    // i-th bit of the random buffer
+        rvec[i]     = (x[i]     << 1) - bit;  
+        rvec[i+256] = (x[i+256] << 1) - bit;
+        rvec[i+512] = (x[i+512] << 1) - bit;
+        rvec[i+768] = (x[i+768] << 1) - bit; 
+
+        norm = 0;
+        v0[0] = 4; v0[1] = 4; v0[2] = 4; v0[3] = 4;
+        v1[0] = 3; v1[1] = 3; v1[2] = 3; v1[3] = 3; 
+        for (j = 0; j < 4; j++) {                         // each line subtracts 1 when the unsigned difference has its top bit set
+            v0[j] -= (rvec[i+256*j] - PARAMETER_Q4 ) >> 31;
+            v0[j] -= (rvec[i+256*j] - PARAMETER_3Q4) >> 31;
+            v0[j] -= (rvec[i+256*j] - PARAMETER_5Q4) >> 31;
+            v0[j] -= (rvec[i+256*j] - PARAMETER_7Q4) >> 31;
+            v1[j] -= (rvec[i+256*j] - PARAMETER_Q2 ) >> 31;
+            v1[j] -= (rvec[i+256*j] - PARAMETER_Q  ) >> 31;
+            v1[j] -= (rvec[i+256*j] - PARAMETER_3Q2) >> 31;
+            norm += Abs(2*rvec[i+256*j] - PARAMETER_Q*v0[j]);
+        }
+
+        norm = (uint32_t)((int32_t)(norm - PARAMETER_Q) >> 31);    // If norm < q then norm = 0xff...ff, else norm = 0
+        v0[0] = (norm & (v0[0] ^ v1[0])) ^ v1[0];         // branch-free select: keeps v0 when norm is all ones, takes v1 when it is 0
+        v0[1] = (norm & (v0[1] ^ v1[1])) ^ v1[1];
+        v0[2] = (norm & (v0[2] ^ v1[2])) ^ v1[2];
+        v0[3] = (norm & (v0[3] ^ v1[3])) ^ v1[3];
+        rvec[i]     = (v0[0] - v0[3]) & 0x03;
+        rvec[i+256] = (v0[1] - v0[3]) & 0x03;
+        rvec[i+512] = (v0[2] - v0[3]) & 0x03;
+        rvec[i+768] = ((v0[3] << 1) + (1 & ~norm)) & 0x03;
+    }
+#endif
+
+    return Status;
+}
+
+
+static __inline uint32_t LDDecode(int32_t* t)
+{ // Low-density decoding: reads t[0..3] and returns a single bit (1 or 0)
+    unsigned int i, norm = 0;
+    uint32_t mask1, mask2, value;
+    int32_t cneg = -8*PARAMETER_Q;
+    // norm accumulates |t[i]| after folding values whose magnitude exceeds 4*PARAMETER_Q by 8*PARAMETER_Q towards zero
+    for (i = 0; i < 4; i++) { 
+        mask1 = t[i] >> 31;                                    // If t[i] < 0 then mask1 = 0xff...ff, else mask1 = 0
+        mask2 = (4*PARAMETER_Q - (int32_t)Abs(t[i])) >> 31;    // If Abs(t[i]) <= 4*PARAMETER_Q then mask2 = 0, else mask2 = 0xff...ff
+        // value = +8*PARAMETER_Q when t[i] is negative, -8*PARAMETER_Q otherwise
+        value = ((mask1 & (8*PARAMETER_Q ^ cneg)) ^ cneg);
+        norm += Abs(t[i] + (mask2 & value));
+    }
+    // The subtraction below is unsigned: its top bit is set exactly when norm exceeds 8*PARAMETER_Q (for norm < 2^31)
+    return ((8*PARAMETER_Q - norm) >> 31) ^ 1;                 // If norm <= 8*PARAMETER_Q then return 1, else return 0
+}
+
+
+void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key)               
+{ // Reconciliation: derives the 32-byte (256-bit) key from x[0..1023] and the reconciliation data rvec[0..1023]
+    // Generic path: key bit i is decoded from the four coefficients i, i+256, i+512 and i+768
+#if defined(GENERIC_IMPLEMENTATION)
+    unsigned int i;
+    uint32_t t[4];
+    // The key is built with |= below, so it has to start out zeroed
+    for (i = 0; i < 32; i++) {
+        key[i] = 0;
+    }
+    for (i = 0; i < 256; i++) {        
+        t[0] = 8*x[i]     - (2*rvec[i] + rvec[i+768]) * PARAMETER_Q;
+        t[1] = 8*x[i+256] - (2*rvec[i+256] + rvec[i+768]) * PARAMETER_Q;
+        t[2] = 8*x[i+512] - (2*rvec[i+512] + rvec[i+768]) * PARAMETER_Q;
+        t[3] = 8*x[i+768] - (rvec[i+768]) * PARAMETER_Q;
+        // t holds unsigned wrap-around results that LDDecode reinterprets as signed; the decoded bit lands in bit (i & 7) of byte (i >> 3)
+        key[i >> 3] |= (unsigned char)LDDecode((int32_t*)t) << (i & 0x07);
+    }
+    
+#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) 
+    rec_asm(x, rvec, key);
+#endif
+}
+
+
+CRYPTO_MSRLN_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction)
+{ // Error sampling: fills e[0..PARAMETER_N-1] with small signed values, each the difference of two 12-bit popcounts of random data
+    (void) seed; (void) nonce; (void) StreamOutputFunction;    // unused: the randomness comes from randombytes()
+    unsigned char stream[3 * PARAMETER_N];
+    uint32_t *pstream = (uint32_t *) &stream;    // NOTE(review): word access to a byte array assumes suitable alignment - confirm for all targets
+    uint32_t acc1, acc2, temp;
+    uint8_t *pacc1 = (uint8_t *) &acc1, *pacc2 = (uint8_t *) &acc2;    // byte-lane views of the two accumulators
+    unsigned int i, j;
+    // Returns the randombytes() status (e left untouched) when the random stream cannot be produced, CRYPTO_MSRLN_SUCCESS otherwise
+    CRYPTO_MSRLN_STATUS Status = randombytes(stream, 3 * PARAMETER_N);
+    if (Status != CRYPTO_MSRLN_SUCCESS) return Status;
+#if defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT)
+    error_sampling_asm(stream, e);
+#else
+    for (i = 0; i < PARAMETER_N / 4; i++) {
+        acc1 = 0;
+        acc2 = 0;
+        for (j = 0; j < 8; j++) {    // per-byte popcount of one word from the first and from the second third of the stream
+            acc1 += (pstream[i] >> j) & 0x01010101;
+            acc2 += (pstream[i + PARAMETER_N / 4] >> j) & 0x01010101;
+        }
+        for (j = 0; j < 4; j++) {    // the last third adds 4 more bits per byte lane: low nibble to acc1, high nibble to acc2
+            temp = pstream[i + 2 * PARAMETER_N / 4] >> j;
+            acc1 += temp & 0x01010101;
+            acc2 += (temp >> 4) & 0x01010101;
+        }
+        e[2 * i] = pacc1[0] - pacc1[1];    // each byte lane now counts 12 random bits (at most 12, so lanes never carry)
+        e[2 * i + 1] = pacc1[2] - pacc1[3];
+        e[2 * i + PARAMETER_N / 2] = pacc2[0] - pacc2[1];
+        e[2 * i + PARAMETER_N / 2 + 1] = pacc2[2] - pacc2[3];
+    }
+#endif
+    clear_words((void*)stream, NBYTES_TO_NWORDS(3 * PARAMETER_N));    // the stream determines the sampled values; do not leave it on the stack
+    return CRYPTO_MSRLN_SUCCESS;    
+}
+
+
+CRYPTO_MSRLN_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction)
+{ // Generation of parameter a: fills a[0..PARAMETER_N-1] with values below PARAMETER_Q derived from SHAKE128 output of the seed
+    (void)ExtendableOutputFunction;    // unused: the SHAKE128 helpers below are called directly
+    unsigned int pos = 0, ctr = 0;     // pos: read offset into buf, ctr: number of coefficients accepted so far
+    uint16_t val;
+    unsigned int nblocks = 16;
+    uint8_t buf[SHAKE128_RATE * 16]; // was * nblocks, but VS doesn't like this buf init
+    //Keccak_HashInstance ks;
+    // Absorb SEED_BYTES bytes of seed into a zeroed state and squeeze the first 16 blocks into buf
+    uint64_t state[SHA3_STATESIZE] = {0};
+    shake128_absorb(state, seed, SEED_BYTES);
+    shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
+    // Disabled earlier variant based on the Keccak_Hash* API, kept as in the original:
+    /*#ifdef _WIN32
+        SHAKE128_InitAbsorb( &ks, seed, SEED_BYTES );
+        KECCAK_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 );
+    #else 
+        Keccak_HashInitialize_SHAKE128(&ks);
+        Keccak_HashUpdate( &ks, seed, SEED_BYTES * 8 );
+        Keccak_HashFinal( &ks, seed );
+        Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );
+    //#endif
+    */
+    while (ctr < PARAMETER_N) {
+        val = (buf[pos] | ((uint16_t) buf[pos + 1] << 8)) & 0x3fff;    // 14-bit candidate from two bytes, low byte first
+        if (val < PARAMETER_Q) {                                       // candidates >= PARAMETER_Q are discarded
+            a[ctr++] = val;
+        }
+        pos += 2;
+        if (pos > SHAKE128_RATE * nblocks - 2) {                       // fewer than two unread bytes left in the valid part of buf
+            nblocks = 1;                                               // from now on refill one block at a time
+          shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
+//            Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );
+            pos = 0;
+        }
+    }
+
+    return CRYPTO_MSRLN_SUCCESS;    
+}
+
+
+CRYPTO_MSRLN_STATUS MSRLN_KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto)
+{ // Alice's key generation  
+  // It produces a private key SecretKeyA and computes the public key PublicKeyA.
+  // Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
+  //          the public key PublicKeyA that occupies 1824 bytes
+  // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
+    uint32_t a[PARAMETER_N];
+    int32_t e[PARAMETER_N];
+    unsigned char seed[SEED_BYTES];
+    unsigned char error_seed[ERROR_SEED_BYTES];    // never filled here: get_error() ignores its seed argument
+    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_ERROR_UNKNOWN;
+    // seed is the value generate_a() expands into a; encode_A() receives it together with a
+    Status = randombytes( seed, SEED_BYTES);
+    // e and error_seed have not been written yet, so this early return skips the cleanup label
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        return Status;
+    }   
+    // generate_a() ignores the ExtendableOutputFunction callback and uses SHAKE128 directly
+    Status = generate_a(a, seed, pLatticeCrypto->ExtendableOutputFunction);
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }
+    // Sample the secret (into SecretKeyA) and the error e; get_error() ignores error_seed, the nonce and the callback
+    Status = get_error(SecretKeyA, error_seed, 0, pLatticeCrypto->StreamOutputFunction);  
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }
+    Status = get_error(e, error_seed, 1, pLatticeCrypto->StreamOutputFunction);   
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }
+    NTT_CT_std2rev_12289(SecretKeyA, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);    // in place: the caller receives SecretKeyA in this transformed form
+    NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
+    smul(e, 3, PARAMETER_N);
+    // presumably a = a*SecretKeyA + e coefficient-wise, then corrected into [0, q) - TODO confirm pmuladd/correction semantics
+    pmuladd((int32_t*)a, SecretKeyA, e, (int32_t*)a, PARAMETER_N); 
+    correction((int32_t*)a, PARAMETER_Q, PARAMETER_N);
+    encode_A(a, seed, PublicKeyA);
+    // Reached on success and on failure: wipe the error polynomial and the (unused) error seed
+cleanup:
+    clear_words((void*)e, NBYTES_TO_NWORDS(4*PARAMETER_N));
+    clear_words((void*)error_seed, NBYTES_TO_NWORDS(ERROR_SEED_BYTES));
+
+    return Status;
+}
+
+
+CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto)
+{ // Bob's key generation and shared secret computation  
+  // It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes 
+  // the shared secret SharedSecretB.
+  // Input:   Alice's public key PublicKeyA that consists of 1824 bytes
+  // Outputs: the public key PublicKeyB that occupies 2048 bytes.
+  //          the 256-bit shared secret SharedSecretB.
+  // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
+    uint32_t pk_A[PARAMETER_N], a[PARAMETER_N], v[PARAMETER_N], r[PARAMETER_N];
+    int32_t sk_B[PARAMETER_N], e[PARAMETER_N];
+    unsigned char seed[SEED_BYTES], error_seed[ERROR_SEED_BYTES];    // error_seed is never filled: get_error()/HelpRec() ignore it
+    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_ERROR_UNKNOWN;
+    // NOTE(review): the decoded pk_A values are 14-bit and are not range-checked against PARAMETER_Q here - confirm that is acceptable
+    decode_A(PublicKeyA, pk_A, seed);
+    // Rebuild the same a as Alice from the seed carried in her message
+    Status = generate_a(a, seed, pLatticeCrypto->ExtendableOutputFunction);
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }
+    // Sample Bob's secret sk_B and a first error e
+    Status = get_error(sk_B, error_seed, 0, pLatticeCrypto->StreamOutputFunction);  
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }
+    Status = get_error(e, error_seed, 1, pLatticeCrypto->StreamOutputFunction);
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }   
+    NTT_CT_std2rev_12289(sk_B, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
+    NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
+    smul(e, 3, PARAMETER_N);
+    // a is overwritten with the value that encode_B() later packs into PublicKeyB (presumably a*sk_B + e - TODO confirm pmuladd semantics)
+    pmuladd((int32_t*)a, sk_B, e, (int32_t*)a, PARAMETER_N); 
+    correction((int32_t*)a, PARAMETER_Q, PARAMETER_N);
+    // e is reused for a second, freshly sampled error that goes into v
+    Status = get_error(e, error_seed, 2, pLatticeCrypto->StreamOutputFunction);  
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }   
+    NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N);
+    smul(e, 81, PARAMETER_N);
+    // v is computed from pk_A, sk_B and e, then taken back through the inverse transform and reduced
+    pmuladd((int32_t*)pk_A, sk_B, e, (int32_t*)v, PARAMETER_N);    
+    INTT_GS_rev2std_12289((int32_t*)v, MSRLN_omegainv_rev_ntt1024_12289, MSRLN_omegainv10N_rev_ntt1024_12289, MSRLN_Ninv11_ntt1024_12289, PARAMETER_N);
+    two_reduce12289((int32_t*)v, PARAMETER_N);
+#if defined(GENERIC_IMPLEMENTATION)
+    correction((int32_t*)v, PARAMETER_Q, PARAMETER_N); 
+#endif
+    // r = reconciliation data for v; the shared secret is derived from v and r, and r is sent along with a in PublicKeyB
+    Status = HelpRec(v, r, error_seed, 3, pLatticeCrypto->StreamOutputFunction); 
+    if (Status != CRYPTO_MSRLN_SUCCESS) {
+        goto cleanup;
+    }   
+    Rec(v, r, SharedSecretB);
+    encode_B(a, r, PublicKeyB);
+    // Reached on success and on failure; on failure SharedSecretB and PublicKeyB have not been written
+cleanup:
+    clear_words((void*)sk_B, NBYTES_TO_NWORDS(4*PARAMETER_N));
+    clear_words((void*)e, NBYTES_TO_NWORDS(4*PARAMETER_N));
+    clear_words((void*)error_seed, NBYTES_TO_NWORDS(ERROR_SEED_BYTES));
+    clear_words((void*)a, NBYTES_TO_NWORDS(4*PARAMETER_N));
+    clear_words((void*)v, NBYTES_TO_NWORDS(4*PARAMETER_N));
+    clear_words((void*)r, NBYTES_TO_NWORDS(4*PARAMETER_N));
+
+    return Status;
+}
+
+
+CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA)
+{ // Alice's shared secret computation  
+  // It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA.
+  // Inputs: Bob's public key PublicKeyB that consists of 2048 bytes
+  //         the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
+  // Output: the 256-bit shared secret SharedSecretA.
+  // This function takes no pLatticeCrypto argument and always returns CRYPTO_MSRLN_SUCCESS (Status is never reassigned).
+    uint32_t u[PARAMETER_N], r[PARAMETER_N];
+    CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_SUCCESS;
+    // u = the 14-bit values from Bob's message, r = his 2-bit reconciliation data
+    decode_B(PublicKeyB, u, r);
+    // u is combined with SecretKeyA in place, taken back through the inverse transform and reduced (TODO confirm pmul semantics)
+    pmul(SecretKeyA, (int32_t*)u, (int32_t*)u, PARAMETER_N);       
+    INTT_GS_rev2std_12289((int32_t*)u, MSRLN_omegainv_rev_ntt1024_12289, MSRLN_omegainv10N_rev_ntt1024_12289, MSRLN_Ninv11_ntt1024_12289, PARAMETER_N);
+    two_reduce12289((int32_t*)u, PARAMETER_N);
+#if defined(GENERIC_IMPLEMENTATION)
+    correction((int32_t*)u, PARAMETER_Q, PARAMETER_N); 
+#endif
+    // Derive the 32-byte key from u using Bob's reconciliation data
+    Rec(u, r, SharedSecretA);
+    
+// Cleanup
+    clear_words((void*)u, NBYTES_TO_NWORDS(4*PARAMETER_N));
+    clear_words((void*)r, NBYTES_TO_NWORDS(4*PARAMETER_N));
+
+    return Status;
+}
diff --git a/dap-sdk/crypto/src/msrln/makefile b/dap-sdk/crypto/src/msrln/makefile
index ab4cb800cc..d017a0e2bb 100755
--- a/dap-sdk/crypto/src/msrln/makefile
+++ b/dap-sdk/crypto/src/msrln/makefile
@@ -1,94 +1,94 @@
-####  Makefile for compilation on Linux  ####
-
-OPT=-O3     # Optimization option by default
-
-ifeq "$(CC)" "gcc"
-    COMPILER=gcc
-else ifeq "$(CC)" "clang"
-    COMPILER=clang
-endif
-
-ifeq "$(ARCH)" "x64"
-    ARCHITECTURE=_AMD64_
-else ifeq "$(ARCH)" "x86"
-    ARCHITECTURE=_X86_
-else ifeq "$(ARCH)" "ARM"
-    ARCHITECTURE=_ARM_
-endif
-
-ADDITIONAL_SETTINGS=
-ifeq "$(SET)" "EXTENDED"
-    ADDITIONAL_SETTINGS=-fwrapv -fomit-frame-pointer -march=native
-endif
-
-ifeq "$(ASM)" "TRUE"
-    USE_ASM=-D _ASM_
-endif
-
-ifeq "$(GENERIC)" "TRUE"
-    USE_GENERIC=-D _GENERIC_
-endif
-
-ifeq "$(AVX2)" "TRUE"
-    USE_AVX2=-D _AVX2_
-    SIMD=-mavx2
-endif
-
-ifeq "$(ARCH)" "ARM"
-    ARM_SETTING=-lrt
-endif
-
-cc=$(COMPILER)
-CFLAGS=-c $(OPT) $(ADDITIONAL_SETTINGS) $(SIMD) -D $(ARCHITECTURE) -D __LINUX__ $(USE_AVX2) $(USE_ASM) $(USE_GENERIC)
-LDFLAGS=
-ifeq "$(GENERIC)" "TRUE"
-    OTHER_OBJECTS=ntt.o
-else
-ifeq "$(ASM)" "TRUE"
-    OTHER_OBJECTS=ntt_x64.o consts.o
-    ASM_OBJECTS=ntt_x64_asm.o error_asm.o
-endif 
-endif
-OBJECTS=kex.o random.o ntt_constants.o $(ASM_OBJECTS) $(OTHER_OBJECTS)
-OBJECTS_TEST=tests.o test_extras.o $(OBJECTS)
-OBJECTS_ALL=$(OBJECTS) $(OBJECTS_TEST)
-
-test: $(OBJECTS_TEST)
-	$(CC) -o test $(OBJECTS_TEST) $(ARM_SETTING)
-
-kex.o: kex.c LatticeCrypto_priv.h
-	$(CC) $(CFLAGS) kex.c
-
-random.o: random.c LatticeCrypto_priv.h
-	$(CC) $(CFLAGS) random.c
-
-ntt_constants.o: ntt_constants.c LatticeCrypto_priv.h
-	$(CC) $(CFLAGS) ntt_constants.c
-    
-ifeq "$(GENERIC)" "TRUE"
-    ntt.o: generic/ntt.c LatticeCrypto_priv.h
-	    $(CC) $(CFLAGS) generic/ntt.c 
-else   
-ifeq "$(ASM)" "TRUE"
-    ntt_x64.o: AMD64/ntt_x64.c
-	    $(CC) $(CFLAGS) AMD64/ntt_x64.c
-    ntt_x64_asm.o: AMD64/ntt_x64_asm.S
-	    $(CC) $(CFLAGS) AMD64/ntt_x64_asm.S
-    error_asm.o: AMD64/error_asm.S
-	    $(CC) $(CFLAGS) AMD64/error_asm.S
-    consts.o: AMD64/consts.c
-	    $(CC) $(CFLAGS) AMD64/consts.c
-endif
-endif
-
-test_extras.o: tests/test_extras.c tests/test_extras.h LatticeCrypto_priv.h
-	$(CC) $(CFLAGS) tests/test_extras.c
-
-tests.o: tests/tests.c LatticeCrypto_priv.h
-	$(CC) $(CFLAGS) tests/tests.c
-
-.PHONY: clean
-
-clean:
-	rm -f test ntt.o ntt_x64.o ntt_x64_asm.o error_asm.o consts.o $(OBJECTS_ALL)
-
+####  Makefile for compilation on Linux  ####
+# Builds the "test" executable from kex.o, random.o, ntt_constants.o, the selected NTT backend objects and the test objects.
+OPT=-O3     # Optimization option by default
+# Compiler selection: COMPILER is set only when CC is exactly "gcc" or "clang"
+ifeq "$(CC)" "gcc"
+    COMPILER=gcc
+else ifeq "$(CC)" "clang"
+    COMPILER=clang
+endif
+# Target architecture macro, passed to the compiler with -D (ARCH=x64, x86 or ARM)
+ifeq "$(ARCH)" "x64"
+    ARCHITECTURE=_AMD64_
+else ifeq "$(ARCH)" "x86"
+    ARCHITECTURE=_X86_
+else ifeq "$(ARCH)" "ARM"
+    ARCHITECTURE=_ARM_
+endif
+# SET=EXTENDED adds -fwrapv -fomit-frame-pointer -march=native
+ADDITIONAL_SETTINGS=
+ifeq "$(SET)" "EXTENDED"
+    ADDITIONAL_SETTINGS=-fwrapv -fomit-frame-pointer -march=native
+endif
+# Backend switches: ASM=TRUE, GENERIC=TRUE and AVX2=TRUE each add a -D macro (AVX2 also adds -mavx2)
+ifeq "$(ASM)" "TRUE"
+    USE_ASM=-D _ASM_
+endif
+
+ifeq "$(GENERIC)" "TRUE"
+    USE_GENERIC=-D _GENERIC_
+endif
+
+ifeq "$(AVX2)" "TRUE"
+    USE_AVX2=-D _AVX2_
+    SIMD=-mavx2
+endif
+# ARM builds link the test binary with -lrt
+ifeq "$(ARCH)" "ARM"
+    ARM_SETTING=-lrt
+endif
+# NOTE(review): this sets lowercase "cc", but every rule below invokes $(CC) - confirm COMPILER/cc are meant to be unused
+cc=$(COMPILER)
+CFLAGS=-c $(OPT) $(ADDITIONAL_SETTINGS) $(SIMD) -D $(ARCHITECTURE) -D __LINUX__ $(USE_AVX2) $(USE_ASM) $(USE_GENERIC)
+LDFLAGS=
+ifeq "$(GENERIC)" "TRUE"
+    OTHER_OBJECTS=ntt.o
+else
+ifeq "$(ASM)" "TRUE"
+    OTHER_OBJECTS=ntt_x64.o consts.o
+    ASM_OBJECTS=ntt_x64_asm.o error_asm.o
+endif 
+endif
+OBJECTS=kex.o random.o ntt_constants.o $(ASM_OBJECTS) $(OTHER_OBJECTS)
+OBJECTS_TEST=tests.o test_extras.o $(OBJECTS)
+OBJECTS_ALL=$(OBJECTS) $(OBJECTS_TEST)
+# Link the test executable (first target in the file, so it is the default goal)
+test: $(OBJECTS_TEST)
+	$(CC) -o test $(OBJECTS_TEST) $(ARM_SETTING)
+# NOTE(review): the rules below list LatticeCrypto_priv.h as a prerequisite; the private header in this directory appears to be msrln_priv.h - confirm
+kex.o: kex.c LatticeCrypto_priv.h
+	$(CC) $(CFLAGS) kex.c
+
+random.o: random.c LatticeCrypto_priv.h
+	$(CC) $(CFLAGS) random.c
+
+ntt_constants.o: ntt_constants.c LatticeCrypto_priv.h
+	$(CC) $(CFLAGS) ntt_constants.c
+    
+ifeq "$(GENERIC)" "TRUE"
+    ntt.o: generic/ntt.c LatticeCrypto_priv.h
+	    $(CC) $(CFLAGS) generic/ntt.c 
+else   
+ifeq "$(ASM)" "TRUE"
+    ntt_x64.o: AMD64/ntt_x64.c
+	    $(CC) $(CFLAGS) AMD64/ntt_x64.c
+    ntt_x64_asm.o: AMD64/ntt_x64_asm.S
+	    $(CC) $(CFLAGS) AMD64/ntt_x64_asm.S
+    error_asm.o: AMD64/error_asm.S
+	    $(CC) $(CFLAGS) AMD64/error_asm.S
+    consts.o: AMD64/consts.c
+	    $(CC) $(CFLAGS) AMD64/consts.c
+endif
+endif
+# Test sources are taken from the tests/ subdirectory
+test_extras.o: tests/test_extras.c tests/test_extras.h LatticeCrypto_priv.h
+	$(CC) $(CFLAGS) tests/test_extras.c
+
+tests.o: tests/tests.c LatticeCrypto_priv.h
+	$(CC) $(CFLAGS) tests/tests.c
+
+.PHONY: clean
+
+clean:
+	rm -f test ntt.o ntt_x64.o ntt_x64_asm.o error_asm.o consts.o $(OBJECTS_ALL)
+
diff --git a/dap-sdk/crypto/src/msrln/msrln.h b/dap-sdk/crypto/src/msrln/msrln.h
index 5b54822603..b789d0209a 100755
--- a/dap-sdk/crypto/src/msrln/msrln.h
+++ b/dap-sdk/crypto/src/msrln/msrln.h
@@ -1,136 +1,136 @@
-#ifndef __MSRLN_H__
-#define __MSRLN_H__
-
-
-// For C++
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stdint.h>
-#include <stdbool.h>
-#include <stddef.h>
-#include "dap_crypto_common.h"
-
-// Definitions of the error-handling type and error codes
-
-typedef enum {
-    CRYPTO_MSRLN_SUCCESS,                          // 0x00
-    CRYPTO_MSRLN_ERROR,                            // 0x01
-    CRYPTO_MSRLN_ERROR_DURING_TEST,                // 0x02
-    CRYPTO_MSRLN_ERROR_UNKNOWN,                    // 0x03
-    CRYPTO_MSRLN_ERROR_NOT_IMPLEMENTED,            // 0x04
-    CRYPTO_MSRLN_ERROR_NO_MEMORY,                  // 0x05
-    CRYPTO_MSRLN_ERROR_INVALID_PARAMETER,          // 0x06
-    CRYPTO_MSRLN_ERROR_SHARED_KEY,                 // 0x07
-    CRYPTO_MSRLN_ERROR_TOO_MANY_ITERATIONS,        // 0x08
-    CRYPTO_MSRLN_ERROR_END_OF_LIST
-} CRYPTO_MSRLN_STATUS;
-
-#define CRYPTO_STATUS_TYPE_SIZE (CRYPTO_MSRLN_ERROR_END_OF_LIST)
-
-
-// Definitions of the error messages
-// NOTE: they must match the error codes above
-
-#define CRYPTO_MSG_SUCCESS                                "CRYPTO_SUCCESS"
-#define CRYPTO_MSG_ERROR                                  "CRYPTO_ERROR"
-#define CRYPTO_MSG_ERROR_DURING_TEST                      "CRYPTO_ERROR_DURING_TEST"
-#define CRYPTO_MSG_ERROR_UNKNOWN                          "CRYPTO_ERROR_UNKNOWN"
-#define CRYPTO_MSG_ERROR_NOT_IMPLEMENTED                  "CRYPTO_ERROR_NOT_IMPLEMENTED"
-#define CRYPTO_MSG_ERROR_NO_MEMORY                        "CRYPTO_ERROR_NO_MEMORY"
-#define CRYPTO_MSG_ERROR_INVALID_PARAMETER                "CRYPTO_ERROR_INVALID_PARAMETER"
-#define CRYPTO_MSG_ERROR_SHARED_KEY                       "CRYPTO_ERROR_SHARED_KEY"
-#define CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS              "CRYPTO_ERROR_TOO_MANY_ITERATIONS"                                                            
-
-
-// Definition of type "RandomBytes" to implement callback function outputting "nbytes" of random values to "random_array"
-typedef CRYPTO_MSRLN_STATUS (*RandomBytes)(unsigned char* random_array, unsigned int nbytes);
-
-// Definition of type "ExtendableOutput" to implement callback function outputting 32-bit "array_ndigits" of values to "extended_array"
-typedef CRYPTO_MSRLN_STATUS (*ExtendableOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array);
-
-// Definition of type "StreamOutput" to implement callback function outputting 32-bit "array_ndigits" of values to "stream_array"
-typedef CRYPTO_MSRLN_STATUS (*StreamOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array);
-
-
-// Basic key-exchange constants  
-#define MSRLN_PKA_BYTES           1824      // Alice's public key size
-#define MSRLN_PKB_BYTES           2048      // Bob's public key size
-#define MSRLN_SHAREDKEY_BYTES     32        // Shared key size
-
-
-// This data struct is initialized during setup with user-provided functions
-typedef struct
-{
-    RandomBytes      RandomBytesFunction;               // Function providing random bytes
-    ExtendableOutput ExtendableOutputFunction;          // Extendable output function
-    StreamOutput     StreamOutputFunction;              // Stream cipher function
-} LatticeCryptoStruct, *PLatticeCryptoStruct;
-
-
-/******************** Function prototypes *******************/
-/*********************** Auxiliary API **********************/ 
-
-// Clear digits from memory. "nwords" indicates the number of digits to be zeroed.
-extern void clear_words(void* mem, digit_t nwords);
-CRYPTO_MSRLN_STATUS MSRLN_get_error(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array);
-CRYPTO_MSRLN_STATUS MSRLN_generate_a(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* a);
-
-// Output "nbytes" of random values.
-// It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array".
-// The caller is responsible for providing the "RandomBytesFunction" function passing random value as octets.
-CRYPTO_MSRLN_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction);
-
-// Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes".   
-// It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array".
-// The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits. 
-CRYPTO_MSRLN_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction);
-
-// Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes".  
-// It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array".
-// The caller is responsible for providing the "StreamOutputFunction" function passing values as octets.  
-CRYPTO_MSRLN_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction);
-
-// Dynamic allocation of memory for LatticeCrypto structure. It should be called before initialization with LatticeCrypto_initialize(). Returns NULL on error.
-PLatticeCryptoStruct LatticeCrypto_allocate(void); 
-
-// Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction.
-CRYPTO_MSRLN_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction);
-
-// Output error/success message for a given CRYPTO_STATUS
-const char* LatticeCrypto_get_error_message(CRYPTO_MSRLN_STATUS Status);
-
-/*********************** Key exchange API ***********************/ 
-
-// Alice's key generation 
-// It produces a private key SecretKeyA and computes the public key PublicKeyA.
-// Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
-//          the public key PublicKeyA that occupies 1824 bytes
-// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
-CRYPTO_MSRLN_STATUS MSRLN_KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto);
-
-// Bob's key generation and shared secret computation
-// It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes 
-// the shared secret SharedSecretB.
-// Input:   Alice's public key PublicKeyA that consists of 1824 bytes
-// Outputs: the public key PublicKeyB that occupies 2048 bytes.
-//          the 256-bit shared secret SharedSecretB.
-// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
-CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto);
-
-// Alice's shared secret computation 
-// It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA.
-// Inputs: Bob's public key PublicKeyB that consists of 2048 bytes
-//         the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
-// Output: the 256-bit shared secret SharedSecretA.
-// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
-CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-
-#endif
+#ifndef __MSRLN_H__
+#define __MSRLN_H__
+
+
+// For C++
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include "dap_crypto_common.h"
+
+// Definitions of the error-handling type and error codes
+
+typedef enum {                                     // Status codes of the MSRLN API; the enumerators take implicit consecutive values from 0
+    CRYPTO_MSRLN_SUCCESS,                          // 0x00
+    CRYPTO_MSRLN_ERROR,                            // 0x01
+    CRYPTO_MSRLN_ERROR_DURING_TEST,                // 0x02
+    CRYPTO_MSRLN_ERROR_UNKNOWN,                    // 0x03
+    CRYPTO_MSRLN_ERROR_NOT_IMPLEMENTED,            // 0x04
+    CRYPTO_MSRLN_ERROR_NO_MEMORY,                  // 0x05
+    CRYPTO_MSRLN_ERROR_INVALID_PARAMETER,          // 0x06
+    CRYPTO_MSRLN_ERROR_SHARED_KEY,                 // 0x07
+    CRYPTO_MSRLN_ERROR_TOO_MANY_ITERATIONS,        // 0x08
+    CRYPTO_MSRLN_ERROR_END_OF_LIST                 // 0x09: end marker; equals the number of codes above (used by CRYPTO_STATUS_TYPE_SIZE)
+} CRYPTO_MSRLN_STATUS;
+
+#define CRYPTO_STATUS_TYPE_SIZE (CRYPTO_MSRLN_ERROR_END_OF_LIST)
+
+
+// Definitions of the error messages
+// NOTE: they must match the error codes above
+
+#define CRYPTO_MSG_SUCCESS                                "CRYPTO_SUCCESS"
+#define CRYPTO_MSG_ERROR                                  "CRYPTO_ERROR"
+#define CRYPTO_MSG_ERROR_DURING_TEST                      "CRYPTO_ERROR_DURING_TEST"
+#define CRYPTO_MSG_ERROR_UNKNOWN                          "CRYPTO_ERROR_UNKNOWN"
+#define CRYPTO_MSG_ERROR_NOT_IMPLEMENTED                  "CRYPTO_ERROR_NOT_IMPLEMENTED"
+#define CRYPTO_MSG_ERROR_NO_MEMORY                        "CRYPTO_ERROR_NO_MEMORY"
+#define CRYPTO_MSG_ERROR_INVALID_PARAMETER                "CRYPTO_ERROR_INVALID_PARAMETER"
+#define CRYPTO_MSG_ERROR_SHARED_KEY                       "CRYPTO_ERROR_SHARED_KEY"
+#define CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS              "CRYPTO_ERROR_TOO_MANY_ITERATIONS"                                                            
+
+
+// Definition of type "RandomBytes" to implement callback function outputting "nbytes" of random values to "random_array"
+typedef CRYPTO_MSRLN_STATUS (*RandomBytes)(unsigned char* random_array, unsigned int nbytes);
+
+// Definition of type "ExtendableOutput" to implement callback function outputting "array_ndigits" 32-bit values to "extended_array"
+typedef CRYPTO_MSRLN_STATUS (*ExtendableOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array);
+
+// Definition of type "StreamOutput" to implement callback function outputting "array_nbytes" bytes of values to "stream_array"
+typedef CRYPTO_MSRLN_STATUS (*StreamOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array);
+
+
+// Basic key-exchange constants  
+#define MSRLN_PKA_BYTES           1824      // Alice's public key size
+#define MSRLN_PKB_BYTES           2048      // Bob's public key size
+#define MSRLN_SHAREDKEY_BYTES     32        // Shared key size
+
+
+// This data struct is initialized during setup with user-provided functions
+typedef struct                                          // Set of user-provided callbacks, passed to the key-exchange functions as PLatticeCryptoStruct
+{                                                       // NOTE(review): the kex.c routines in this patch receive these callbacks but discard them with (void) casts - confirm they are still needed
+    RandomBytes      RandomBytesFunction;               // Function providing random bytes
+    ExtendableOutput ExtendableOutputFunction;          // Extendable output function
+    StreamOutput     StreamOutputFunction;              // Stream cipher function
+} LatticeCryptoStruct, *PLatticeCryptoStruct;
+
+
+/******************** Function prototypes *******************/
+/*********************** Auxiliary API **********************/ 
+
+// Clear digits from memory. "nwords" indicates the number of digits to be zeroed.
+extern void clear_words(void* mem, digit_t nwords);
+CRYPTO_MSRLN_STATUS MSRLN_get_error(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array);
+CRYPTO_MSRLN_STATUS MSRLN_generate_a(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* a);
+
+// Output "nbytes" of random values.
+// It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array".
+// The caller is responsible for providing the "RandomBytesFunction" function passing random value as octets.
+CRYPTO_MSRLN_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction);
+
+// Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes".   
+// It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array".
+// The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits. 
+CRYPTO_MSRLN_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction);
+
+// Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes".  
+// It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array".
+// The caller is responsible for providing the "StreamOutputFunction" function passing values as octets.  
+CRYPTO_MSRLN_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction);
+
+// Dynamic allocation of memory for LatticeCrypto structure. It should be called before initialization with LatticeCrypto_initialize(). Returns NULL on error.
+PLatticeCryptoStruct LatticeCrypto_allocate(void); 
+
+// Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction.
+CRYPTO_MSRLN_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction);
+
+// Output error/success message for a given CRYPTO_MSRLN_STATUS
+const char* LatticeCrypto_get_error_message(CRYPTO_MSRLN_STATUS Status);
+
+/*********************** Key exchange API ***********************/ 
+
+// Alice's key generation 
+// It produces a private key SecretKeyA and computes the public key PublicKeyA.
+// Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
+//          the public key PublicKeyA that occupies 1824 bytes
+// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
+CRYPTO_MSRLN_STATUS MSRLN_KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto);
+
+// Bob's key generation and shared secret computation
+// It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes 
+// the shared secret SharedSecretB.
+// Input:   Alice's public key PublicKeyA that consists of 1824 bytes
+// Outputs: the public key PublicKeyB that occupies 2048 bytes.
+//          the 256-bit shared secret SharedSecretB.
+// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
+CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto);
+
+// Alice's shared secret computation 
+// It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA.
+// Inputs: Bob's public key PublicKeyB that consists of 2048 bytes
+//         the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
+// Output: the 256-bit shared secret SharedSecretA.
+// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
+CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/dap-sdk/crypto/src/msrln/msrln.pri b/dap-sdk/crypto/src/msrln/msrln.pri
index cd4600ef3d..f42be38c96 100755
--- a/dap-sdk/crypto/src/msrln/msrln.pri
+++ b/dap-sdk/crypto/src/msrln/msrln.pri
@@ -1,6 +1,6 @@
-INCLUDEPATH += $$PWD
-
-HEADERS += $$PWD/msrln.h \
-
-SOURCES += $$PWD/kex.c \
-           $$PWD/random.c \
+INCLUDEPATH += $$PWD
+# Header and source files of this directory, added to the including project.
+HEADERS += $$PWD/msrln.h
+
+SOURCES += $$PWD/kex.c \
+           $$PWD/random.c
diff --git a/dap-sdk/crypto/src/msrln/msrln_priv.h b/dap-sdk/crypto/src/msrln/msrln_priv.h
index fdaae50ad3..cc1f198010 100755
--- a/dap-sdk/crypto/src/msrln/msrln_priv.h
+++ b/dap-sdk/crypto/src/msrln/msrln_priv.h
@@ -1,114 +1,114 @@
-#ifndef __MSRLN_priv_H__
-#define __MSRLN_priv_H__
-
-// For C++
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "msrln.h"
-
-// Basic constants            
-#define PARAMETER_N         1024 
-#define PARAMETER_Q         12289 
-#define SEED_BYTES          256/8
-#define ERROR_SEED_BYTES    256/8
-#define NONCE_SEED_BYTES    256/8
-#define PARAMETER_Q4        3073 
-#define PARAMETER_3Q4       9217 
-#define PARAMETER_5Q4       15362 
-#define PARAMETER_7Q4       21506 
-#define PARAMETER_Q2        6145 
-#define PARAMETER_3Q2       18434
-    
-
-// Macro definitions
-
-#define NBITS_TO_NWORDS(nbits)      (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8))    // Conversion macro from number of bits to number of computer words
-#define NBYTES_TO_NWORDS(nbytes)    (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t))           // Conversion macro from number of bytes to number of computer words
-
-// Macro to avoid compiler warnings when detecting unreferenced parameters
-#ifndef UNREFERENCED_PARAMETER
-#define UNREFERENCED_PARAMETER(PAR) ((void)PAR)
-#endif
-
-
-/******************** Function prototypes *******************/
-/******************* Polynomial functions *******************/
-
-// Forward NTT
-void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N);
-void NTT_CT_std2rev_12289_asm(int32_t* a, const int32_t* psi_rev, unsigned int N);
-
-// Inverse NTT
-void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N);
-void INTT_GS_rev2std_12289_asm(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N);
-
-// Reduction modulo q
-int32_t reduce12289(int64_t a);
-
-// Two merged reductions modulo q
-int32_t reduce12289_2x(int64_t a);
-
-// Two consecutive reductions modulo q
-void two_reduce12289(int32_t* a, unsigned int N);
-void two_reduce12289_asm(int32_t* a, unsigned int N);
-
-// Correction modulo q
-void correction(int32_t* a, int32_t p, unsigned int N);
-
-// Component-wise multiplication
-void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N);
-void pmul_asm(int32_t* a, int32_t* b, int32_t* c, unsigned int N);
-
-// Component-wise multiplication and addition
-void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N);
-void pmuladd_asm(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N);
-
-// Component-wise multiplication with scalar
-void smul(int32_t* a, int32_t scalar, unsigned int N);
-
-/******************* Key exchange functions *******************/
-
-// Alice's message encoding
-void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m);
-
-// Alice's message decoding
-void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed); 
-    
-// Bob's message encoding
-void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m);
-    
-// Bob's message decoding
-void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec);
-
-// Partial message encoding/decoding (assembly optimized) 
-void encode_asm(const uint32_t* pk, unsigned char* m);
-void decode_asm(const unsigned char* m, uint32_t *pk);
-
-// Reconciliation helper
-CRYPTO_MSRLN_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction);
-
-// Partial reconciliation helper (assembly optimized)        
-void helprec_asm(const uint32_t* x, uint32_t* rvec, unsigned char* random_bits);
-
-// Reconciliation
-void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key);
-void rec_asm(const uint32_t *x, const uint32_t* rvec, unsigned char *key);
-
-// Error sampling
-CRYPTO_MSRLN_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction);
-
-// Partial error sampling (assembly optimized)        
-void error_sampling_asm(unsigned char* stream, int32_t* e);
-
-// Generation of parameter a
-CRYPTO_MSRLN_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-
-#endif
+#ifndef __MSRLN_priv_H__
+#define __MSRLN_priv_H__
+
+// For C++
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "msrln.h"
+
+// Basic constants. Expression-valued macros are parenthesized so they expand safely inside larger expressions.
+#define PARAMETER_N         1024            // N
+#define PARAMETER_Q         12289           // modulus q
+#define SEED_BYTES          (256/8)         // 256 bits = 32 bytes
+#define ERROR_SEED_BYTES    (256/8)         // 256 bits = 32 bytes
+#define NONCE_SEED_BYTES    (256/8)         // 256 bits = 32 bytes
+#define PARAMETER_Q4        3073            // ceil(q/4)
+#define PARAMETER_3Q4       9217            // ceil(3q/4)
+#define PARAMETER_5Q4       15362           // ceil(5q/4)
+#define PARAMETER_7Q4       21506           // ceil(7q/4)
+#define PARAMETER_Q2        6145            // ceil(q/2)
+#define PARAMETER_3Q2       18434           // ceil(3q/2)
+    
+
+// Macro definitions
+
+#define NBITS_TO_NWORDS(nbits)      (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8))    // Conversion macro from number of bits to number of computer words
+#define NBYTES_TO_NWORDS(nbytes)    (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t))           // Conversion macro from number of bytes to number of computer words
+
+// Marks a parameter as intentionally unused to silence compiler warnings; skipped if a definition already exists.
+#ifndef UNREFERENCED_PARAMETER
+#define UNREFERENCED_PARAMETER(PAR) ((void)(PAR))
+#endif
+
+
+/******************** Function prototypes *******************/
+/******************* Polynomial functions *******************/
+
+// Forward NTT
+void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N);
+void NTT_CT_std2rev_12289_asm(int32_t* a, const int32_t* psi_rev, unsigned int N);
+
+// Inverse NTT
+void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N);
+void INTT_GS_rev2std_12289_asm(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N);
+
+// Reduction modulo q
+int32_t reduce12289(int64_t a);
+
+// Two merged reductions modulo q
+int32_t reduce12289_2x(int64_t a);
+
+// Two consecutive reductions modulo q
+void two_reduce12289(int32_t* a, unsigned int N);
+void two_reduce12289_asm(int32_t* a, unsigned int N);
+
+// Correction modulo q
+void correction(int32_t* a, int32_t p, unsigned int N);
+
+// Component-wise multiplication
+void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N);
+void pmul_asm(int32_t* a, int32_t* b, int32_t* c, unsigned int N);
+
+// Component-wise multiplication and addition
+void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N);
+void pmuladd_asm(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N);
+
+// Component-wise multiplication with scalar
+void smul(int32_t* a, int32_t scalar, unsigned int N);
+
+/******************* Key exchange functions *******************/
+
+// Alice's message encoding
+void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m);
+
+// Alice's message decoding
+void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed); 
+    
+// Bob's message encoding
+void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m);
+    
+// Bob's message decoding
+void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec);
+
+// Partial message encoding/decoding (assembly optimized) 
+void encode_asm(const uint32_t* pk, unsigned char* m);
+void decode_asm(const unsigned char* m, uint32_t *pk);
+
+// Reconciliation helper
+CRYPTO_MSRLN_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction);
+
+// Partial reconciliation helper (assembly optimized)        
+void helprec_asm(const uint32_t* x, uint32_t* rvec, unsigned char* random_bits);
+
+// Reconciliation
+void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key);
+void rec_asm(const uint32_t *x, const uint32_t* rvec, unsigned char *key);
+
+// Error sampling
+CRYPTO_MSRLN_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction);
+
+// Partial error sampling (assembly optimized)        
+void error_sampling_asm(unsigned char* stream, int32_t* e);
+
+// Generation of parameter a
+CRYPTO_MSRLN_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/dap-sdk/crypto/src/msrln/random.c b/dap-sdk/crypto/src/msrln/random.c
index ab2b129f84..eaea6a1170 100755
--- a/dap-sdk/crypto/src/msrln/random.c
+++ b/dap-sdk/crypto/src/msrln/random.c
@@ -1,90 +1,90 @@
-#include "msrln_priv.h"
-
-//#include "KeccakHash.h"
-//#include "SimpleFIPS202.h"
-
-#define LOG_TAG "RANDOM"
-
-CRYPTO_MSRLN_STATUS MSRLN_generate_a(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* a)
-{
-    // Generation of parameter a
-    unsigned int pos = 0, ctr = 0;
-    uint16_t val;
-    unsigned int nblocks = 16;
-    uint8_t buf[SHAKE128_RATE * 16]; // was * nblocks, but VS doesn't like this buf init
-    //Keccak_HashInstance ks;
-
-    uint64_t state[SHA3_STATESIZE];
-    shake128_absorb(state, seed, seed_nbytes);
-    shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
-
-    /*Keccak_HashInitialize_SHAKE128(&ks);
-    Keccak_HashUpdate( &ks, seed, seed_nbytes * 8 );
-    Keccak_HashFinal( &ks, seed );
-    Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );*/
-
-    while (ctr < array_ndigits) {
-        val = (buf[pos] | ((uint16_t) buf[pos + 1] << 8)) & 0x3fff;
-        if (val < PARAMETER_Q) {
-            a[ctr++] = val;
-        }
-        pos += 2;
-        if (pos > SHAKE128_RATE * nblocks - 2) {
-            nblocks = 1;
-          shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
-//            Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );
-            pos = 0;
-        }
-    }
-    return CRYPTO_MSRLN_SUCCESS;
-}
-
-CRYPTO_MSRLN_STATUS MSRLN_get_error(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array)
-{
-    UNREFERENCED_PARAMETER(seed);
-    UNREFERENCED_PARAMETER(seed_nbytes);
-    UNREFERENCED_PARAMETER(nonce);
-    UNREFERENCED_PARAMETER(nonce_nbytes);
-
-    randombytes( stream_array, array_nbytes);
-
-    return CRYPTO_MSRLN_SUCCESS;
-}
-
-CRYPTO_MSRLN_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction)
-{ // Output "nbytes" of random values.
-  // It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array".
-  // The caller is responsible for providing the "RandomBytesFunction" function passing random values as octets.
-
-    if (random_array == NULL || RandomBytesFunction == NULL || nbytes == 0) {
-        return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER;
-    }    
-    
-    return (RandomBytesFunction)(random_array, nbytes);
-}
-
-
-CRYPTO_MSRLN_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction)
-{ // Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes".
-  // It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array".
-  // The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits.
-
-    if (seed == NULL || extended_array == NULL || ExtendableOutputFunction == NULL || seed_nbytes == 0 || array_ndigits == 0) {
-        return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER;
-    }    
-    
-    return (ExtendableOutputFunction)(seed, seed_nbytes, array_ndigits, extended_array);
-}
-
-
-CRYPTO_MSRLN_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction)
-{ // Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes".  
-  // It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array".
-  // The caller is responsible for providing the "StreamOutputFunction" function passing values as octets.
-
-    if (seed == NULL || stream_array == NULL || StreamOutputFunction == NULL || seed_nbytes == 0 || nonce_nbytes == 0 || array_nbytes == 0) {
-        return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER;
-    }    
-    
-    return (StreamOutputFunction)(seed, seed_nbytes, nonce, nonce_nbytes, array_nbytes, stream_array);
-}
+#include "msrln_priv.h"
+
+//#include "KeccakHash.h"
+//#include "SimpleFIPS202.h"
+
+#define LOG_TAG "RANDOM"
+
+CRYPTO_MSRLN_STATUS MSRLN_generate_a(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* a)
+{
+    // Generation of parameter a: fills a[0..array_ndigits-1] with values in [0, PARAMETER_Q-1] taken from shake128 output for seed; always returns CRYPTO_MSRLN_SUCCESS.
+    unsigned int pos = 0, ctr = 0; // pos: byte offset into buf; ctr: number of values written to a so far
+    uint16_t val;
+    unsigned int nblocks = 16; // number of SHAKE128 blocks currently held in buf
+    uint8_t buf[SHAKE128_RATE * 16]; // was * nblocks, but VS doesn't like this buf init
+    // NOTE(review): seed and a are used without NULL checks here; presumably callers validate first - confirm.
+
+    uint64_t state[SHA3_STATESIZE];
+    shake128_absorb(state, seed, seed_nbytes);
+    shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
+
+    /* Rejection sampling over the squeezed bytes: each iteration reads two bytes as a
+    little-endian 16-bit word and keeps its low 14 bits (mask 0x3fff). The candidate is
+    stored only if it is below PARAMETER_Q; otherwise it is dropped. Either way the
+    read offset advances by two bytes. */
+
+    while (ctr < array_ndigits) {
+        val = (buf[pos] | ((uint16_t) buf[pos + 1] << 8)) & 0x3fff;
+        if (val < PARAMETER_Q) {
+            a[ctr++] = val;
+        }
+        pos += 2;
+        if (pos > SHAKE128_RATE * nblocks - 2) {
+            nblocks = 1;
+          shake128_squeezeblocks((unsigned char *) buf, nblocks, state);
+//            Buffer exhausted: one more block is squeezed into the start of buf and reading restarts at offset 0.
+            pos = 0;
+        }
+    }
+    return CRYPTO_MSRLN_SUCCESS;
+}
+
+CRYPTO_MSRLN_STATUS MSRLN_get_error(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array)
+{
+    UNREFERENCED_PARAMETER(seed);
+    UNREFERENCED_PARAMETER(seed_nbytes);
+    UNREFERENCED_PARAMETER(nonce);
+    UNREFERENCED_PARAMETER(nonce_nbytes);
+    // seed and nonce are ignored (marked unreferenced above): the array_nbytes output bytes come from randombytes() alone.
+    randombytes( stream_array, array_nbytes);
+    // Success is reported unconditionally; the result of the randombytes() call, if any, is not checked.
+    return CRYPTO_MSRLN_SUCCESS;
+}
+
+CRYPTO_MSRLN_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction)
+{ // Requests "nbytes" random octets for "random_array" from the caller-supplied RandomBytesFunction.
+  // Returns CRYPTO_MSRLN_ERROR_INVALID_PARAMETER if random_array or RandomBytesFunction is NULL, or if nbytes is 0.
+  // Otherwise the status returned by RandomBytesFunction(random_array, nbytes) is passed through unchanged.
+
+    if (random_array == NULL || RandomBytesFunction == NULL || nbytes == 0) {
+        return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER;
+    }    
+    
+    return (RandomBytesFunction)(random_array, nbytes);
+}
+
+
+CRYPTO_MSRLN_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction)
+{ // Requests "array_ndigits" 32-bit digits for "extended_array" from the caller-supplied ExtendableOutputFunction, using a seed of "seed_nbytes" bytes.
+  // Returns CRYPTO_MSRLN_ERROR_INVALID_PARAMETER if seed, extended_array or the callback is NULL, or if seed_nbytes or array_ndigits is 0.
+  // Otherwise the status returned by ExtendableOutputFunction is passed through unchanged; the value range of the digits is up to that callback.
+
+    if (seed == NULL || extended_array == NULL || ExtendableOutputFunction == NULL || seed_nbytes == 0 || array_ndigits == 0) {
+        return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER;
+    }    
+    
+    return (ExtendableOutputFunction)(seed, seed_nbytes, array_ndigits, extended_array);
+}
+
+
+CRYPTO_MSRLN_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction)
+{ // Requests "array_nbytes" octets for "stream_array" from the caller-supplied StreamOutputFunction, forwarding the seed and nonce with their sizes.
+  // Returns CRYPTO_MSRLN_ERROR_INVALID_PARAMETER if seed, stream_array or the callback is NULL, or if any of the three sizes is 0.
+  // NOTE(review): nonce itself is not NULL-checked here, only nonce_nbytes. Otherwise the callback status is passed through unchanged.
+
+    if (seed == NULL || stream_array == NULL || StreamOutputFunction == NULL || seed_nbytes == 0 || nonce_nbytes == 0 || array_nbytes == 0) {
+        return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER;
+    }    
+    
+    return (StreamOutputFunction)(seed, seed_nbytes, nonce, nonce_nbytes, array_nbytes, stream_array);
+}
diff --git a/dap-sdk/crypto/src/sig_picnic/picnic_impl.c b/dap-sdk/crypto/src/sig_picnic/picnic_impl.c
index 2a0e25ffe8..4e9b1a329b 100755
--- a/dap-sdk/crypto/src/sig_picnic/picnic_impl.c
+++ b/dap-sdk/crypto/src/sig_picnic/picnic_impl.c
@@ -1,998 +1,998 @@
-/*! @file picnic_impl.c
- *  @brief This is the main file of the signature scheme. All of the LowMC MPC
- *  code is here as well as lower-level versions of sign and verify that are
- *  called by the signature API.
- *
- *  This file is part of the reference implementation of the Picnic signature scheme.
- *  See the accompanying documentation for complete details.
- *
- *  The code is provided under the MIT license, see LICENSE for
- *  more details.
- *  SPDX-License-Identifier: MIT
- */
-
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#if defined (__WIN32)
-	#include <windows.h>
-	#include <bcrypt.h>
-#elif defined (__APPLE__)
-    #include "macos_specific_endian.h"
-#else
-    #include <endian.h>
-#endif
-
-#include "picnic_impl.h"
-#include "picnic.h"
-#include "platform.h"
-#include "lowmc_constants.h"
-#include "hash.h"
-#include "picnic_types.h"
-#include "dap_common.h"
-
-
-#define VIEW_OUTPUTS(i, j) viewOutputs[(i) * 3 + (j)]
-
-
-/* Helper functions */
-uint16_t toLittleEndian(uint16_t x)
-{
-#if defined(__WIN32)
-    #if BYTE_ORDER == LITTLE_ENDIAN
-		return x;
-	#else
-		return __builtin_bswap16(x);
-    #endif
-#else
-    return htole16(x);
-#endif
-}
-
-/* Get one bit from a byte array */
-uint8_t getBit(const uint8_t* array, uint32_t bitNumber)
-{
-    return (array[bitNumber / 8] >> (7 - (bitNumber % 8))) & 0x01;
-}
-
-/* Get one bit from a 32-bit int array */
-uint8_t getBitFromWordArray(const uint32_t* array, uint32_t bitNumber)
-{
-    return getBit((uint8_t*)array, bitNumber);
-}
-
-/* Set a specific bit in a byte array to a given value */
-void setBit(uint8_t* bytes, uint32_t bitNumber, uint8_t val)
-{
-    bytes[bitNumber / 8] = (bytes[bitNumber >> 3]
-                            & ~(1 << (7 - (bitNumber % 8)))) | (val << (7 - (bitNumber % 8)));
-}
-
-/* Set a specific bit in a byte array to a given value */
-void setBitInWordArray(uint32_t* array, uint32_t bitNumber, uint8_t val)
-{
-    setBit((uint8_t*)array, bitNumber, val);
-}
-
-static uint8_t parity(uint32_t* data, size_t len)
-{
-    uint32_t x = data[0];
-    size_t i;
-    for (i = 1; i < len; i++) {
-        x ^= data[i];
-    }
-
-    /* Compute parity of x using code from Section 5-2 of
-     * H.S. Warren, *Hacker's Delight*, Pearson Education, 2003.
-     * http://www.hackersdelight.org/hdcodetxt/parity.c.txt
-     */
-    uint32_t y = x ^ (x >> 1);
-    y ^= (y >> 2);
-    y ^= (y >> 4);
-    y ^= (y >> 8);
-    y ^= (y >> 16);
-    return y & 1;
-}
-
-uint32_t numBytes(uint32_t numBits)
-{
-    return (numBits == 0) ? 0 : ((numBits - 1) / 8 + 1);
-}
-
-static void xor_array(const uint32_t * in1, const uint32_t * in2, uint32_t * out, uint32_t numBytes)
-{
-    uint32_t i;
-    for (i = 0; i < numBytes; i++) {
-        out[i] = in1[i] ^ in2[i];
-    }
-}
-
-static void matrix_mul(
-    uint32_t* state,
-    const uint32_t* matrix,
-    uint32_t* output,
-    paramset_t* params)
-{
-    // Use temp to correctly handle the case when state = output
-    uint32_t prod[LOWMC_MAX_STATE_SIZE];
-    uint32_t temp[LOWMC_MAX_STATE_SIZE];
-
-    uint32_t i, j;
-    for (i = 0; i < params->stateSizeBits; i++) {
-        for (j = 0; j < params->stateSizeWords; j++) {
-            size_t index = i * params->stateSizeWords + j;
-            prod[j] = (state[j] & matrix[index]);
-        }
-        setBit((uint8_t*)temp, i, parity(&prod[0], params->stateSizeWords));
-
-    }
-    memcpy(output, &temp, params->stateSizeWords * sizeof(uint32_t));
-}
-
-static void substitution(uint32_t* state, paramset_t* params)
-{
-    uint32_t i;
-    for (i = 0; i < params->numSboxes * 3; i += 3) {
-        uint8_t a = getBitFromWordArray(state, i + 2);
-        uint8_t b = getBitFromWordArray(state, i + 1);
-        uint8_t c = getBitFromWordArray(state, i);
-
-        setBitInWordArray(state, i + 2, a ^ (b & c));
-        setBitInWordArray(state, i + 1, a ^ b ^ (a & c));
-        setBitInWordArray(state, i, a ^ b ^ c ^ (a & b));
-    }
-}
-
-void LowMCEnc(const uint32_t* plaintext, uint32_t* output, uint32_t* key, paramset_t* params)
-{
-    uint32_t roundKey[LOWMC_MAX_STATE_SIZE / sizeof(uint32_t)];
-
-    if (plaintext != output) {
-        /* output will hold the intermediate state */
-        memcpy(output, plaintext, params->stateSizeBytes);
-    }
-
-    matrix_mul(key, KMatrix(0, params), roundKey, params);
-    xor_array(output, roundKey, output, params->stateSizeWords);
-
-    uint32_t r;
-    for (r = 1; r <= params->numRounds; r++) {
-        matrix_mul(key, KMatrix(r, params), roundKey, params);
-        substitution(output, params);
-        matrix_mul(output, LMatrix(r - 1, params), output, params);
-        xor_array(output, RConstant(r - 1, params), output, params->stateSizeWords);
-        xor_array(output, roundKey, output, params->stateSizeWords);
-    }
-
-}
-
-bool createRandomTape(const uint8_t* seed, uint8_t* tape,
-                      uint32_t tapeLengthBytes, paramset_t* params)
-{
-    HashInstance ctx;
-
-    if (tapeLengthBytes < params->digestSizeBytes) {
-        return false;
-    }
-
-    /* Hash the seed and a constant, store the result in tape. */
-    HashInit(&ctx, params, HASH_PREFIX_2);
-    HashUpdate(&ctx, seed, params->seedSizeBytes);
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, tape, params->digestSizeBytes);
-
-    /* Expand the hashed seed and output length to create the tape. */
-    HashInit(&ctx, params, HASH_PREFIX_NONE);
-    HashUpdate(&ctx, tape, params->digestSizeBytes);
-    uint16_t outputBytesLE = toLittleEndian(tapeLengthBytes);
-    HashUpdate(&ctx, (uint8_t*)&outputBytesLE, sizeof(uint16_t));
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, tape, tapeLengthBytes);
-
-    return true;
-}
-
-void mpc_xor(uint32_t* state[3], uint32_t* in[3], uint32_t len, int players)
-{
-    uint8_t i;
-    for (i = 0; i < players; i++) {
-        xor_array(state[i], in[i], state[i], len);
-    }
-}
-
-/* Compute the XOR of in with the first state vectors. */
-void mpc_xor_constant(uint32_t* state[3], const uint32_t* in, uint32_t len)
-{
-    xor_array(state[0], in, state[0], len);
-}
-
-void mpc_xor_constant_verify(uint32_t* state[2], const uint32_t* in, uint32_t len, uint8_t challenge)
-{
-    /* During verify, where the first share is stored in state depends on the challenge */
-    if (challenge == 0) {
-        xor_array(state[0], in, state[0], len);
-    }
-    else if (challenge == 2) {
-        xor_array(state[1], in, state[1], len);
-    }
-}
-
-
-void Commit(const uint8_t* seed, const view_t view,
-            uint8_t* hash, paramset_t* params)
-{
-    HashInstance ctx;
-
-    /* Hash the seed, store result in `hash` */
-    HashInit(&ctx, params, HASH_PREFIX_4);
-    HashUpdate(&ctx, seed, params->seedSizeBytes);
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, hash, params->digestSizeBytes);
-
-    /* Compute H_0(H_4(seed), view) */
-    HashInit(&ctx, params, HASH_PREFIX_0);
-    HashUpdate(&ctx, hash, params->digestSizeBytes);
-    HashUpdate(&ctx, (uint8_t*)view.inputShare, params->stateSizeBytes);
-    HashUpdate(&ctx, (uint8_t*)view.communicatedBits, params->andSizeBytes);
-    HashUpdate(&ctx, (uint8_t*)view.outputShare, params->stateSizeBytes);
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, hash, params->digestSizeBytes);
-}
-
-/* This is the random "permuatation" function G for Unruh's transform */
-void G(uint8_t viewNumber, const uint8_t* seed, view_t* view, uint8_t* output, paramset_t* params)
-{
-    HashInstance ctx;
-    uint16_t outputBytes = params->seedSizeBytes + params->andSizeBytes;
-
-    /* Hash the seed with H_5, store digest in output */
-    HashInit(&ctx, params, HASH_PREFIX_5);
-    HashUpdate(&ctx, seed, params->seedSizeBytes);
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, output, params->digestSizeBytes);
-
-    /* Hash H_5(seed), the view, and the length */
-    HashInit(&ctx, params, HASH_PREFIX_NONE);
-    HashUpdate(&ctx, output, params->digestSizeBytes);
-    if (viewNumber == 2) {
-        HashUpdate(&ctx, (uint8_t*)view->inputShare, params->stateSizeBytes);
-        outputBytes += (uint16_t)params->stateSizeBytes;
-    }
-    HashUpdate(&ctx, view->communicatedBits, params->andSizeBytes);
-
-    uint16_t outputBytesLE = toLittleEndian(outputBytes);
-    HashUpdate(&ctx, (uint8_t*)&outputBytesLE, sizeof(uint16_t));
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, output, outputBytes);
-}
-
-void setChallenge(uint8_t* challenge, size_t round, uint8_t trit)
-{
-    /* challenge must have length numBytes(numZKBRounds*2)
-     * 0 <= index < numZKBRounds
-     * trit must be in {0,1,2} */
-    uint32_t roundU32 = (uint32_t)round;
-
-    setBit(challenge, 2 * roundU32, trit & 1);
-    setBit(challenge, 2 * roundU32 + 1, (trit >> 1) & 1);
-}
-
-uint8_t getChallenge(const uint8_t* challenge, size_t round)
-{
-    uint32_t roundU32 = (uint32_t)round;
-
-    return (getBit(challenge, 2 * roundU32 + 1) << 1) | getBit(challenge, 2 * roundU32);
-}
-
-void H3(const uint32_t* circuitOutput, const uint32_t* plaintext, uint32_t** viewOutputs,
-        commitments_t* as,
-        uint8_t* challengeBits, const uint8_t* message, size_t messageByteLength,
-        g_commitments_t* gs, paramset_t* params)
-{
-    uint8_t* hash = malloc(params->digestSizeBytes);
-
-    HashInstance ctx;
-
-    /* Depending on the number of rounds, we might not set part of the last
-     * byte, make sure it's always zero. */
-    challengeBits[numBytes(params->numZKBRounds * 2) - 1] = 0;
-
-    /* Hash input data */
-    HashInit(&ctx, params, HASH_PREFIX_1);
-
-    /* Hash the output share from each view */
-    uint32_t i;
-    int j;
-    for (i = 0; i < params->numZKBRounds; i++) {
-        for (j = 0; j < 3; j++) {
-            HashUpdate(&ctx, (uint8_t*)VIEW_OUTPUTS(i, j), params->stateSizeBytes);
-        }
-    }
-
-    /* Hash all the commitments C */
-    for (i = 0; i < params->numZKBRounds; i++) {
-        for (j = 0; j < 3; j++) {
-            HashUpdate(&ctx, as[i].hashes[j], params->digestSizeBytes);
-        }
-    }
-
-    /* Hash all the commitments G */
-    if (params->transform == TRANSFORM_UR) {
-        for (i = 0; i < params->numZKBRounds; i++) {
-            for (j = 0; j < 3; j++) {
-                size_t view3UnruhLength = (j == 2) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
-                HashUpdate(&ctx, gs[i].G[j], view3UnruhLength);
-            }
-        }
-    }
-
-    HashUpdate(&ctx, (uint8_t*)circuitOutput, params->stateSizeBytes);
-    HashUpdate(&ctx, (uint8_t*)plaintext, params->stateSizeBytes);
-    HashUpdate(&ctx, message, messageByteLength);
-
-    HashFinal(&ctx);
-    HashSqueeze(&ctx, hash, params->digestSizeBytes);
-
-    /* Convert hash to a packed string of values in {0,1,2} */
-    size_t byte_count, round = 0;
-    while (1) {
-        for (byte_count = 0; byte_count < params->digestSizeBytes; byte_count++) {
-            uint8_t byte = hash[byte_count];
-            /* iterate over each pair of bits in the byte */
-            for (j = 0; j < 8; j += 2) {
-                uint8_t bitPair = ((byte >> (6 - j)) & 0x03);
-                if (bitPair < 3) {
-                    setChallenge(challengeBits, round, bitPair);
-                    round++;
-                    if (round == params->numZKBRounds) {
-                        goto done;
-                    }
-                }
-            }
-        }
-
-        /* We need more bits; hash set hash = H_1(hash) */
-        HashInit(&ctx, params, HASH_PREFIX_1);
-        HashUpdate(&ctx, hash, params->digestSizeBytes);
-        HashFinal(&ctx);
-        HashSqueeze(&ctx, hash, params->digestSizeBytes);
-    }
-
-done:
-
-    free(hash);
-    return;
-}
-
-/* Caller must allocate the first parameter */
-void prove(proof_t* proof, uint8_t challenge, seeds_t* seeds,
-           view_t views[3], commitments_t* commitments, g_commitments_t* gs, paramset_t* params)
-{
-    if (challenge == 0) {
-        memcpy(proof->seed1, seeds->seed0, params->seedSizeBytes);
-        memcpy(proof->seed2, seeds->seed1, params->seedSizeBytes);
-    }
-    else if (challenge == 1) {
-        memcpy(proof->seed1, seeds->seed1, params->seedSizeBytes);
-        memcpy(proof->seed2, seeds->seed2, params->seedSizeBytes);
-    }
-    else if (challenge == 2) {
-        memcpy(proof->seed1, seeds->seed2, params->seedSizeBytes);
-        memcpy(proof->seed2, seeds->seed0, params->seedSizeBytes);
-    }
-    else {
-        assert(!"Invalid challenge");
-    }
-
-    if (challenge == 1 || challenge == 2) {
-        memcpy(proof->inputShare, views[2].inputShare, params->stateSizeBytes);
-    }
-    memcpy(proof->communicatedBits, views[(challenge + 1) % 3].communicatedBits, params->andSizeBytes);
-
-    memcpy(proof->view3Commitment, commitments->hashes[(challenge + 2) % 3], params->digestSizeBytes);
-    if (params->transform == TRANSFORM_UR) {
-        size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
-        memcpy(proof->view3UnruhG, gs->G[(challenge + 2) % 3], view3UnruhLength);
-    }
-}
-
-void mpc_AND_verify(uint8_t in1[2], uint8_t in2[2], uint8_t out[2],
-                    randomTape_t* rand, view_t* view1, view_t* view2)
-{
-    uint8_t r[2] = { getBit(rand->tape[0], rand->pos), getBit(rand->tape[1], rand->pos) };
-
-    out[0] = (in1[0] & in2[1]) ^ (in1[1] & in2[0]) ^ (in1[0] & in2[0]) ^ r[0] ^ r[1];
-    setBit(view1->communicatedBits, rand->pos, out[0]);
-    out[1] = getBit(view2->communicatedBits, rand->pos);
-
-    (rand->pos)++;
-}
-
-void mpc_substitution_verify(uint32_t* state[2], randomTape_t* rand, view_t* view1,
-                             view_t* view2, paramset_t* params)
-{
-    uint32_t i;
-    for (i = 0; i < params->numSboxes * 3; i += 3) {
-
-        uint8_t a[2];
-        uint8_t b[2];
-        uint8_t c[2];
-
-        uint8_t j;
-        for (j = 0; j < 2; j++) {
-            a[j] = getBitFromWordArray(state[j], i + 2);
-            b[j] = getBitFromWordArray(state[j], i + 1);
-            c[j] = getBitFromWordArray(state[j], i);
-        }
-
-        uint8_t ab[2];
-        uint8_t bc[2];
-        uint8_t ca[2];
-
-        mpc_AND_verify(a, b, ab, rand, view1, view2);
-        mpc_AND_verify(b, c, bc, rand, view1, view2);
-        mpc_AND_verify(c, a, ca, rand, view1, view2);
-
-        for (j = 0; j < 2; j++) {
-            setBitInWordArray(state[j], i + 2, a[j] ^ (bc[j]));
-            setBitInWordArray(state[j], i + 1, a[j] ^ b[j] ^ (ca[j]));
-            setBitInWordArray(state[j], i, a[j] ^ b[j] ^ c[j] ^ (ab[j]));
-        }
-    }
-}
-
-void mpc_matrix_mul(uint32_t* state[3], const uint32_t* matrix,
-                    uint32_t* output[3], paramset_t* params, size_t players)
-{
-    uint32_t player;
-    for (player = 0; player < players; player++) {
-        matrix_mul(state[player], matrix, output[player], params);
-    }
-}
-
-void mpc_LowMC_verify(view_t* view1, view_t* view2,
-                      randomTape_t* tapes, uint32_t* tmp,
-                      const uint32_t* plaintext, paramset_t* params, uint8_t challenge)
-{
-    uint32_t* state[2];
-    uint32_t* keyShares[2];
-    uint32_t* roundKey[2];
-
-    roundKey[0] = tmp;
-    roundKey[1] = roundKey[0] + params->stateSizeWords;
-    state[0] = roundKey[1] + params->stateSizeWords;
-    state[1] = state[0] + params->stateSizeWords;
-
-    // initialize both roundkeys to 0. they are contingent
-    memset(roundKey[0], 0, 2 * params->stateSizeBytes);
-
-    uint32_t i, r;
-    for (i = 0; i < 2; i++) {
-        memset(state[i], 0x00, params->stateSizeBytes);
-    }
-    mpc_xor_constant_verify(state, plaintext, params->stateSizeWords, challenge);
-
-    keyShares[0] = view1->inputShare;
-    keyShares[1] = view2->inputShare;
-
-    mpc_matrix_mul(keyShares, KMatrix(0, params), roundKey, params, 2);
-    mpc_xor(state, roundKey, params->stateSizeWords, 2);
-
-    for (r = 1; r <= params->numRounds; ++r) {
-        mpc_matrix_mul(keyShares, KMatrix(r, params), roundKey, params, 2);
-        mpc_substitution_verify(state, tapes, view1, view2, params);
-        mpc_matrix_mul(state, LMatrix(r - 1, params), state, params, 2);
-        mpc_xor_constant_verify(state, RConstant(r - 1, params), params->stateSizeWords, challenge);
-        mpc_xor(state, roundKey, params->stateSizeWords, 2);
-    }
-
-    memcpy(view1->outputShare, state[0], params->stateSizeBytes);
-    memcpy(view2->outputShare, state[1], params->stateSizeBytes);
-}
-
-void verifyProof(const proof_t* proof, view_t* view1, view_t* view2,
-                 uint8_t challenge, uint8_t* tmp,
-                 const uint32_t* plaintext, randomTape_t* tape, paramset_t* params)
-{
-    memcpy(view2->communicatedBits, proof->communicatedBits, params->andSizeBytes);
-    tape->pos = 0;
-
-    bool status = false;
-    switch (challenge) {
-    case 0:
-        // in this case, both views' inputs are derivable from the input share
-
-        status = createRandomTape(proof->seed1, tmp, params->stateSizeBytes + params->andSizeBytes, params);
-        memcpy(view1->inputShare, tmp, params->stateSizeBytes);
-        memcpy(tape->tape[0], tmp + params->stateSizeBytes, params->andSizeBytes);
-        status = status && createRandomTape(proof->seed2, tmp, params->stateSizeBytes + params->andSizeBytes, params);
-        if (!status) {
-            break;
-        }
-        memcpy(view2->inputShare, tmp, params->stateSizeBytes);
-        memcpy(tape->tape[1], tmp + params->stateSizeBytes, params->andSizeBytes);
-        break;
-
-    case 1:
-        // in this case view2's input share was already given to us explicitly as
-        // it is not computable from the seed. We just need to compute view1's input from
-        // its seed
-        status = createRandomTape(proof->seed1, tmp, params->stateSizeBytes + params->andSizeBytes, params);
-        memcpy(view1->inputShare, tmp, params->stateSizeBytes);
-        memcpy(tape->tape[0], tmp + params->stateSizeBytes, params->andSizeBytes);
-        status = status && createRandomTape(proof->seed2, tape->tape[1], params->andSizeBytes, params);
-        if (!status) {
-            break;
-        }
-        memcpy(view2->inputShare, proof->inputShare, params->stateSizeBytes);
-        break;
-
-    case 2:
-        // in this case view1's input share was already given to us explicitly as
-        // it is not computable from the seed. We just need to compute view2's input from
-        // its seed
-        status = createRandomTape(proof->seed1, tape->tape[0], params->andSizeBytes, params);
-        memcpy(view1->inputShare, proof->inputShare, params->stateSizeBytes);
-        status = status && createRandomTape(proof->seed2, tmp, params->stateSizeBytes + params->andSizeBytes, params);
-        if (!status) {
-            break;
-        }
-        memcpy(view2->inputShare, tmp, params->stateSizeBytes);
-        memcpy(tape->tape[1], tmp + params->stateSizeBytes, params->andSizeBytes);
-        break;
-
-    default:
-        break;
-    }
-
-    mpc_LowMC_verify(view1, view2, tape, (uint32_t*)tmp, plaintext, params, challenge);
-}
-
-int verify(signature_t* sig, const uint32_t* pubKey, const uint32_t* plaintext,
-           const uint8_t* message, size_t messageByteLength, paramset_t* params)
-{
-    commitments_t* as = allocateCommitments(params);
-    g_commitments_t* gs = allocateGCommitments(params);
-
-    uint32_t** viewOutputs = malloc(params->numZKBRounds * 3 * sizeof(uint32_t*));
-    const proof_t* proofs = sig->proofs;
-
-    const uint8_t* received_challengebits = sig->challengeBits;
-    int status = EXIT_SUCCESS;
-    uint8_t* computed_challengebits = NULL;
-    uint32_t* view3Slab = NULL;
-
-    uint8_t* tmp = malloc(MAX(6 * params->stateSizeBytes, params->stateSizeBytes + params->andSizeBytes));
-
-    randomTape_t* tape = (randomTape_t*)malloc(sizeof(randomTape_t));
-
-    allocateRandomTape(tape, params);
-
-    view_t* view1s = malloc(params->numZKBRounds * sizeof(view_t));
-    view_t* view2s = malloc(params->numZKBRounds * sizeof(view_t));
-
-    /* Allocate a slab of memory for the 3rd view's output in each round */
-    view3Slab = malloc(params->stateSizeBytes * params->numZKBRounds);
-    uint32_t* view3Output = view3Slab;     /* pointer into the slab to the current 3rd view */
-
-    size_t i, j;
-    for (i = 0; i < params->numZKBRounds; i++) {
-        allocateView(&view1s[i], params);
-        allocateView(&view2s[i], params);
-
-        // last bits of communicatedBits may not be set so zero them
-        view1s[i].communicatedBits[params->andSizeBytes - 1] = 0;
-
-        verifyProof(&proofs[i], &view1s[i], &view2s[i],
-                    getChallenge(received_challengebits, i),
-                    tmp, plaintext, tape, params);
-
-        // create ordered array of commitments with order computed based on the challenge
-        // check commitments of the two opened views
-        uint8_t challenge = getChallenge(received_challengebits, i);
-        Commit(proofs[i].seed1, view1s[i], as[i].hashes[challenge], params);
-        Commit(proofs[i].seed2, view2s[i], as[i].hashes[(challenge + 1) % 3], params);
-        memcpy(as[i].hashes[(challenge + 2) % 3], proofs[i].view3Commitment, params->digestSizeBytes);
-
-        if (params->transform == TRANSFORM_UR) {
-            G(challenge, proofs[i].seed1, &view1s[i], gs[i].G[challenge], params);
-            G((challenge + 1) % 3, proofs[i].seed2, &view2s[i], gs[i].G[(challenge + 1) % 3], params);
-            size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
-            memcpy(gs[i].G[(challenge + 2) % 3], proofs[i].view3UnruhG, view3UnruhLength);
-        }
-
-        VIEW_OUTPUTS(i, challenge) = view1s[i].outputShare;
-        VIEW_OUTPUTS(i, (challenge + 1) % 3) = view2s[i].outputShare;
-        for (j = 0; j < params->stateSizeWords; j++) {
-            view3Output[j] = view1s[i].outputShare[j] ^ view2s[i].outputShare[j]
-                             ^ pubKey[j];
-        }
-        VIEW_OUTPUTS(i, (challenge + 2) % 3) = view3Output;
-        view3Output += params->stateSizeWords;
-    }
-
-    computed_challengebits = malloc(numBytes(2 * params->numZKBRounds));
-
-    H3(pubKey, plaintext, viewOutputs, as,
-       computed_challengebits, message, messageByteLength, gs, params);
-
-    if (computed_challengebits != NULL &&
-        memcmp(received_challengebits, computed_challengebits,
-               numBytes(2 * params->numZKBRounds)) != 0) {
-        status = EXIT_FAILURE;
-    }
-
-    free(computed_challengebits);
-    free(view3Slab);
-
-    freeCommitments(as);
-    for (i = 0; i < params->numZKBRounds; i++) {
-        freeView(&view1s[i]);
-        freeView(&view2s[i]);
-    }
-    free(view1s);
-    free(view2s);
-    free(tmp);
-    freeRandomTape(tape);
-    free(tape);
-    freeGCommitments(gs);
-    free(viewOutputs);
-
-    return status;
-}
-
-/*** Functions implementing Sign ***/
-
-void mpc_AND(uint8_t in1[3], uint8_t in2[3], uint8_t out[3], randomTape_t* rand,
-             view_t views[3])
-{
-    uint8_t r[3] = { getBit(rand->tape[0], rand->pos), getBit(rand->tape[1], rand->pos), getBit(rand->tape[2], rand->pos) };
-
-    uint8_t i;
-    for (i = 0; i < 3; i++) {
-        out[i] = (in1[i] & in2[(i + 1) % 3]) ^ (in1[(i + 1) % 3] & in2[i])
-                 ^ (in1[i] & in2[i]) ^ r[i] ^ r[(i + 1) % 3];
-
-        setBit(views[i].communicatedBits, rand->pos, out[i]);
-    }
-
-    (rand->pos)++;
-}
-
-void mpc_substitution(uint32_t* state[3], randomTape_t* rand, view_t views[3],
-                      paramset_t* params)
-{
-    uint8_t a[3];
-    uint8_t b[3];
-    uint8_t c[3];
-
-    uint8_t ab[3];
-    uint8_t bc[3];
-    uint8_t ca[3];
-
-    uint32_t i;
-    for (i = 0; i < params->numSboxes * 3; i += 3) {
-
-        uint8_t j;
-        for (j = 0; j < 3; j++) {
-            a[j] = getBitFromWordArray(state[j], i + 2);
-            b[j] = getBitFromWordArray(state[j], i + 1);
-            c[j] = getBitFromWordArray(state[j], i);
-        }
-
-        mpc_AND(a, b, ab, rand, views);
-        mpc_AND(b, c, bc, rand, views);
-        mpc_AND(c, a, ca, rand, views);
-
-        for (j = 0; j < 3; j++) {
-            setBitInWordArray(state[j], i + 2, a[j] ^ (bc[j]));
-            setBitInWordArray(state[j], i + 1, a[j] ^ b[j] ^ (ca[j]));
-            setBitInWordArray(state[j], i, a[j] ^ b[j] ^ c[j] ^ (ab[j]));
-        }
-    }
-}
-
-void mpc_LowMC(randomTape_t* tapes, view_t views[3],
-               const uint32_t* plaintext, uint32_t* slab, paramset_t* params)
-{
-    uint32_t* keyShares[3];
-    uint32_t* state[3];
-    uint32_t* roundKey[3];
-
-    roundKey[0] = slab;
-    roundKey[1] = slab + params->stateSizeWords;
-    roundKey[2] = roundKey[1] + params->stateSizeWords;
-    state[0] = roundKey[2] + params->stateSizeWords;
-    state[1] = state[0] + params->stateSizeWords;
-    state[2] = state[1] + params->stateSizeWords;
-
-    memset(roundKey[0], 0, 3 * params->stateSizeBytes);
-    int i;
-    for (i = 0; i < 3; i++) {
-        keyShares[i] = views[i].inputShare;
-        memset(state[i], 0x00, params->stateSizeBytes);
-    }
-    mpc_xor_constant(state, plaintext, params->stateSizeWords);
-
-    mpc_matrix_mul(keyShares, KMatrix(0, params), roundKey, params, 3);
-    mpc_xor(state, roundKey, params->stateSizeWords, 3);
-
-    uint32_t r;
-    for (r = 1; r <= params->numRounds; r++) {
-        mpc_matrix_mul(keyShares, KMatrix(r, params), roundKey, params, 3);
-        mpc_substitution(state, tapes, views, params);
-        mpc_matrix_mul(state, LMatrix(r - 1, params), state, params, 3);
-        mpc_xor_constant(state, RConstant(r - 1, params), params->stateSizeWords);
-        mpc_xor(state, roundKey, params->stateSizeWords, 3);
-    }
-
-    for (i = 0; i < 3; i++) {
-        memcpy(views[i].outputShare, state[i], params->stateSizeBytes);
-    }
-
-}
-
-void runMPC(view_t views[3], randomTape_t* rand,
-            uint32_t* plaintext, uint32_t* slab, paramset_t* params)
-{
-    rand->pos = 0;
-    mpc_LowMC(rand, views, plaintext, slab, params);
-}
-
-
-seeds_t* computeSeeds(uint32_t* privateKey, uint32_t*
-                      publicKey, uint32_t* plaintext, const uint8_t* message, size_t messageByteLength, paramset_t* params)
-{
-    HashInstance ctx;
-    seeds_t* allSeeds = allocateSeeds(params);
-
-    HashInit(&ctx, params, HASH_PREFIX_NONE);
-    HashUpdate(&ctx, (uint8_t*)privateKey, params->stateSizeBytes);
-    HashUpdate(&ctx, message, messageByteLength);
-    HashUpdate(&ctx, (uint8_t*)publicKey, params->stateSizeBytes);
-    HashUpdate(&ctx, (uint8_t*)plaintext, params->stateSizeBytes);
-    uint16_t stateSizeBitsLE = toLittleEndian((uint16_t)params->stateSizeBits);
-    HashUpdate(&ctx, ((uint8_t*)&stateSizeBitsLE), sizeof(uint16_t));
-    HashFinal(&ctx);
-
-    HashSqueeze(&ctx, getSeed(allSeeds, 0, 0), params->seedSizeBytes * 3 * params->numZKBRounds);
-
-    return allSeeds;
-}
-
-int sign(uint32_t* privateKey, uint32_t* pubKey, uint32_t* plaintext, const uint8_t* message,
-         size_t messageByteLength, signature_t* sig, paramset_t* params)
-{
-    bool status;
-
-    /* Allocate views and commitments for all parallel iterations */
-    view_t** views = allocateViews(params);
-    commitments_t* as = allocateCommitments(params);
-    g_commitments_t* gs = allocateGCommitments(params);
-
-    /* Compute seeds for all parallel iterations */
-    seeds_t* seeds = computeSeeds(privateKey, pubKey, plaintext, message, messageByteLength, params);
-
-    //Allocate a random tape (re-used per parallel iteration), and a temporary buffer
-    randomTape_t tape;
-
-    allocateRandomTape(&tape, params);
-    uint8_t* tmp = malloc( MAX(9 * params->stateSizeBytes, params->stateSizeBytes + params->andSizeBytes));
-
-    uint32_t k;
-    for (k = 0; k < params->numZKBRounds; k++) {
-        // for first two players get all tape INCLUDING INPUT SHARE from seed
-        int j;
-        for (j = 0; j < 2; j++) {
-            status = createRandomTape(getSeed(seeds, k, j), tmp, params->stateSizeBytes + params->andSizeBytes, params);
-            if (!status) {
-                return EXIT_FAILURE;
-            }
-
-            memcpy(views[k][j].inputShare, tmp, params->stateSizeBytes);
-            memcpy(tape.tape[j], tmp + params->stateSizeBytes, params->andSizeBytes);
-        }
-        // Now set third party's wires. The random bits are from the seed, the input is
-        // the XOR of other two inputs and the private key
-        status = createRandomTape(getSeed(seeds, k, 2), tape.tape[2], params->andSizeBytes, params);
-        if (!status) {
-            return EXIT_FAILURE;
-        }
-        uint32_t j1;
-        for (j1 = 0; j1 < params->stateSizeWords; j1++) {
-            views[k][2].inputShare[j1] = privateKey[j1]
-                                        ^ views[k][0].inputShare[j1]
-                                        ^ views[k][1].inputShare[j1];
-        }
-
-        runMPC(views[k], &tape, plaintext, (uint32_t*)tmp, params);
-
-        //Committing
-        Commit(getSeed(seeds, k, 0), views[k][0], as[k].hashes[0], params);
-        Commit(getSeed(seeds, k, 1), views[k][1], as[k].hashes[1], params);
-        Commit(getSeed(seeds, k, 2), views[k][2], as[k].hashes[2], params);
-
-        if (params->transform == TRANSFORM_UR) {
-            G(0, getSeed(seeds, k, 0), &views[k][0], gs[k].G[0], params);
-            G(1, getSeed(seeds, k, 1), &views[k][1], gs[k].G[1], params);
-            G(2, getSeed(seeds, k, 2), &views[k][2], gs[k].G[2], params);
-        }
-    }
-
-    //Generating challenges
-    uint32_t** viewOutputs = malloc(params->numZKBRounds * 3 * sizeof(uint32_t*));
-
-    size_t ii, jj;
-    for (ii = 0; ii < params->numZKBRounds; ii++)
-        for (jj = 0; jj < 3; jj++)
-            VIEW_OUTPUTS(ii, jj) = views[ii][jj].outputShare;
-
-
-    uint32_t output[LOWMC_MAX_STATE_SIZE];
-    uint32_t j;
-    for (j = 0; j < params->stateSizeWords; j++)
-        output[j] = (VIEW_OUTPUTS(0, 0))[j] ^ (VIEW_OUTPUTS(0, 1))[j] ^ (VIEW_OUTPUTS(0, 2))[j];
-
-
-    H3(output, plaintext, viewOutputs, as,
-       sig->challengeBits, message, messageByteLength, gs, params);
-
-    //Packing Z
-    size_t i;
-    for (i = 0; i < params->numZKBRounds; i++) {
-        proof_t* proof = &sig->proofs[i];
-        prove(proof, getChallenge(sig->challengeBits, i), &seeds[i],
-              views[i], &as[i], (gs == NULL) ? NULL : &gs[i], params);
-    }
-
-    free(tmp);
-
-    freeViews(views, params);
-    freeCommitments(as);
-    freeRandomTape(&tape);
-    freeGCommitments(gs);
-    free(viewOutputs);
-    freeSeeds(seeds);
-
-    return EXIT_SUCCESS;
-}
-
-/*** Serialization functions ***/
-
-int serializeSignature(const signature_t* sig, uint8_t* sigBytes, size_t sigBytesLen, paramset_t* params)
-{
-    const proof_t* proofs = sig->proofs;
-    const uint8_t* challengeBits = sig->challengeBits;
-
-    /* Validate input buffer is large enough */
-    size_t bytesRequired = numBytes(2 * params->numZKBRounds) +
-                           params->numZKBRounds * (2 * params->seedSizeBytes + params->stateSizeBytes + params->andSizeBytes + params->digestSizeBytes);
-
-    if (params->transform == TRANSFORM_UR) {
-        bytesRequired += params->UnruhGWithoutInputBytes * params->numZKBRounds;
-    }
-
-    if (sigBytesLen < bytesRequired) {
-        return -1;
-    }
-
-    uint8_t* sigBytesBase = sigBytes;
-
-    memcpy(sigBytes, challengeBits, numBytes(2 * params->numZKBRounds));
-    sigBytes += numBytes(2 * params->numZKBRounds);
-
-    size_t i;
-    for (i = 0; i < params->numZKBRounds; i++) {
-
-        uint8_t challenge = getChallenge(challengeBits, i);
-
-        memcpy(sigBytes, proofs[i].view3Commitment, params->digestSizeBytes);
-        sigBytes += params->digestSizeBytes;
-
-        if (params->transform == TRANSFORM_UR) {
-            size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
-            memcpy(sigBytes, proofs[i].view3UnruhG, view3UnruhLength);
-            sigBytes += view3UnruhLength;
-        }
-
-        memcpy(sigBytes, proofs[i].communicatedBits, params->andSizeBytes);
-        sigBytes += params->andSizeBytes;
-
-        memcpy(sigBytes, proofs[i].seed1, params->seedSizeBytes);
-        sigBytes += params->seedSizeBytes;
-
-        memcpy(sigBytes, proofs[i].seed2, params->seedSizeBytes);
-        sigBytes += params->seedSizeBytes;
-
-        if (challenge == 1 || challenge == 2) {
-            memcpy(sigBytes, proofs[i].inputShare, params->stateSizeBytes);
-            sigBytes += params->stateSizeBytes;
-        }
-
-
-    }
-
-    return (int)(sigBytes - sigBytesBase);
-}
-
-
-static size_t computeInputShareSize(const uint8_t* challengeBits, size_t stateSizeBytes, paramset_t* params)
-{
-    /* When the FS transform is used, the input share is included in the proof
-     * only when the challenge is 1 or 2.  When dersializing, to compute the
-     * number of bytes expected, we must check how many challenge values are 1
-     * or 2. The parameter stateSizeBytes is the size of an input share. */
-    size_t inputShareSize = 0;
-
-    size_t i;
-    for (i = 0; i < params->numZKBRounds; i++) {
-        uint8_t challenge = getChallenge(challengeBits, i);
-        if (challenge == 1 || challenge == 2) {
-            inputShareSize += stateSizeBytes;
-        }
-    }
-    return inputShareSize;
-}
-
-int deserializeSignature(signature_t* sig, const uint8_t* sigBytes,
-                         size_t sigBytesLen, paramset_t* params)
-{
-    proof_t* proofs = sig->proofs;
-    uint8_t* challengeBits = sig->challengeBits;
-
-    /* Validate input buffer is large enough */
-    if (sigBytesLen < numBytes(2 * params->numZKBRounds)) {     /* ensure the input has at least the challenge */
-        return EXIT_FAILURE;
-    }
-    size_t inputShareSize = computeInputShareSize(sigBytes, params->stateSizeBytes, params);
-    size_t bytesExpected = numBytes(2 * params->numZKBRounds) +
-                           params->numZKBRounds * (2 * params->seedSizeBytes + params->andSizeBytes + params->digestSizeBytes) + inputShareSize;
-    if (params->transform == TRANSFORM_UR) {
-        bytesExpected += params->UnruhGWithoutInputBytes * params->numZKBRounds;
-    }
-    if (sigBytesLen < bytesExpected) {
-        return EXIT_FAILURE;
-    }
-
-    memcpy(challengeBits, sigBytes, numBytes(2 * params->numZKBRounds));
-    sigBytes += numBytes(2 * params->numZKBRounds);
-
-    size_t i;
-    for (i = 0; i < params->numZKBRounds; i++) {
-
-        uint8_t challenge = getChallenge(challengeBits, i);
-
-        memcpy(proofs[i].view3Commitment, sigBytes, params->digestSizeBytes);
-        sigBytes += params->digestSizeBytes;
-
-        if (params->transform == TRANSFORM_UR) {
-            size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
-            memcpy(proofs[i].view3UnruhG, sigBytes, view3UnruhLength);
-            sigBytes += view3UnruhLength;
-        }
-
-        memcpy(proofs[i].communicatedBits, sigBytes, params->andSizeBytes);
-        sigBytes += params->andSizeBytes;
-
-        memcpy(proofs[i].seed1, sigBytes, params->seedSizeBytes);
-        sigBytes += params->seedSizeBytes;
-
-        memcpy(proofs[i].seed2, sigBytes, params->seedSizeBytes);
-        sigBytes += params->seedSizeBytes;
-
-        if (challenge == 1 || challenge == 2) {
-            memcpy(proofs[i].inputShare, sigBytes, params->stateSizeBytes);
-            sigBytes += params->stateSizeBytes;
-        }
-
-    }
-
-    return EXIT_SUCCESS;
-}
-
-
-
-
+/*! @file picnic_impl.c
+ *  @brief This is the main file of the signature scheme. All of the LowMC MPC
+ *  code is here as well as lower-level versions of sign and verify that are
+ *  called by the signature API.
+ *
+ *  This file is part of the reference implementation of the Picnic signature scheme.
+ *  See the accompanying documentation for complete details.
+ *
+ *  The code is provided under the MIT license, see LICENSE for
+ *  more details.
+ *  SPDX-License-Identifier: MIT
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#if defined (__WIN32)
+	#include <windows.h>
+	#include <bcrypt.h>
+#elif defined (__APPLE__)
+    #include "macos_specific_endian.h"
+#elif defined (DAP_OS_LINUX)
+    #include <endian.h>
+#endif
+
+#include "picnic_impl.h"
+#include "picnic.h"
+#include "platform.h"
+#include "lowmc_constants.h"
+#include "hash.h"
+#include "picnic_types.h"
+#include "dap_common.h"
+
+
+#define VIEW_OUTPUTS(i, j) viewOutputs[(i) * 3 + (j)] /* entry j (0..2) of round i; needs a variable named viewOutputs in scope where it is used */
+
+
+/* Helper functions */
+uint16_t toLittleEndian(uint16_t x) /* Returns x converted from host byte order to little-endian. */
+{
+#if defined(__WIN32) /* Windows build: convert by hand, keyed on BYTE_ORDER, instead of calling htole16() */
+    #if BYTE_ORDER == LITTLE_ENDIAN
+		return x; /* host already stores x little-endian */
+	#else
+		return __builtin_bswap16(x); /* otherwise swap the two bytes */
+    #endif
+#else
+    return htole16(x); /* every other platform uses htole16() */
+#endif
+}
+
+/* Get one bit (0 or 1) from a byte array; bit 0 is the most significant bit of array[0] */
+uint8_t getBit(const uint8_t* array, uint32_t bitNumber)
+{
+    return (array[bitNumber / 8] >> (7 - (bitNumber % 8))) & 0x01;
+}
+
+/* Get one bit from a 32-bit int array, viewed as bytes and numbered exactly as in getBit() */
+uint8_t getBitFromWordArray(const uint32_t* array, uint32_t bitNumber)
+{
+    return getBit((uint8_t*)array, bitNumber);
+}
+
+/* Set bit bitNumber of a byte array (MSB-first within each byte) to val; val is not masked, so pass 0 or 1 */
+void setBit(uint8_t* bytes, uint32_t bitNumber, uint8_t val)
+{
+    bytes[bitNumber / 8] = (bytes[bitNumber >> 3]
+                            & ~(1 << (7 - (bitNumber % 8)))) | (val << (7 - (bitNumber % 8)));
+}
+
+/* Set a specific bit in a 32-bit int array to a given value (the array is addressed as bytes via setBit) */
+void setBitInWordArray(uint32_t* array, uint32_t bitNumber, uint8_t val)
+{
+    setBit((uint8_t*)array, bitNumber, val);
+}
+
+static uint8_t parity(uint32_t* data, size_t len)
+{
+    uint32_t x = data[0]; /* data[0] is read unconditionally, so len must be at least 1 */
+    size_t i;
+    for (i = 1; i < len; i++) {
+        x ^= data[i]; /* fold every word into x; the parity of x equals the parity of all input bits */
+    }
+
+    /* Compute parity of x using code from Section 5-2 of
+     * H.S. Warren, *Hacker's Delight*, Pearson Education, 2003.
+     * http://www.hackersdelight.org/hdcodetxt/parity.c.txt
+     */
+    uint32_t y = x ^ (x >> 1);
+    y ^= (y >> 2);
+    y ^= (y >> 4);
+    y ^= (y >> 8);
+    y ^= (y >> 16);
+    return y & 1; /* 1 if an odd number of bits are set in data[0..len-1], otherwise 0 */
+}
+
+uint32_t numBytes(uint32_t numBits)
+{
+    return (numBits == 0) ? 0 : ((numBits - 1) / 8 + 1); /* ceil(numBits / 8); 0 is special-cased so numBits - 1 cannot wrap */
+}
+
+static void xor_array(const uint32_t * in1, const uint32_t * in2, uint32_t * out, uint32_t numBytes)
+{
+    uint32_t i;
+    for (i = 0; i < numBytes; i++) { /* NOTE: despite its name, numBytes counts 32-bit words, not bytes (it also shadows the numBytes() function) */
+        out[i] = in1[i] ^ in2[i]; /* element-wise, so out may be the same array as in1 or in2 */
+    }
+}
+
+static void matrix_mul(
+    uint32_t* state,
+    const uint32_t* matrix,
+    uint32_t* output,
+    paramset_t* params)
+{
+    // Computes output = matrix * state over GF(2). Use temp to correctly handle the case when state = output
+    uint32_t prod[LOWMC_MAX_STATE_SIZE];
+    uint32_t temp[LOWMC_MAX_STATE_SIZE]; // NOTE(review): not zeroed; bits at positions >= stateSizeBits are never written before the memcpy below
+
+    uint32_t i, j;
+    for (i = 0; i < params->stateSizeBits; i++) { // row i of the matrix produces bit i of the result
+        for (j = 0; j < params->stateSizeWords; j++) {
+            size_t index = i * params->stateSizeWords + j; // rows are stored back to back, stateSizeWords words each
+            prod[j] = (state[j] & matrix[index]);
+        }
+        setBit((uint8_t*)temp, i, parity(&prod[0], params->stateSizeWords)); // bit i = parity of (state AND row i)
+
+    }
+    memcpy(output, &temp, params->stateSizeWords * sizeof(uint32_t));
+}
+
+static void substitution(uint32_t* state, paramset_t* params)
+{
+    uint32_t i;
+    for (i = 0; i < params->numSboxes * 3; i += 3) { /* each S-box covers 3 consecutive bits; only the first 3 * numSboxes bits are touched */
+        uint8_t a = getBitFromWordArray(state, i + 2); /* read all three inputs before overwriting any of them */
+        uint8_t b = getBitFromWordArray(state, i + 1);
+        uint8_t c = getBitFromWordArray(state, i);
+
+        setBitInWordArray(state, i + 2, a ^ (b & c));
+        setBitInWordArray(state, i + 1, a ^ b ^ (a & c));
+        setBitInWordArray(state, i, a ^ b ^ c ^ (a & b));
+    }
+}
+
+void LowMCEnc(const uint32_t* plaintext, uint32_t* output, uint32_t* key, paramset_t* params)
+{
+    uint32_t roundKey[LOWMC_MAX_STATE_SIZE / sizeof(uint32_t)]; /* NOTE(review): matrix_mul sizes its buffers as LOWMC_MAX_STATE_SIZE words; confirm this holds stateSizeWords words */
+
+    if (plaintext != output) {
+        /* output will hold the intermediate state */
+        memcpy(output, plaintext, params->stateSizeBytes);
+    }
+
+    matrix_mul(key, KMatrix(0, params), roundKey, params); /* round key 0 = KMatrix(0) * key ... */
+    xor_array(output, roundKey, output, params->stateSizeWords); /* ... XORed into the state before the first round */
+
+    uint32_t r;
+    for (r = 1; r <= params->numRounds; r++) {
+        matrix_mul(key, KMatrix(r, params), roundKey, params); /* round key r, derived from the key only */
+        substitution(output, params);
+        matrix_mul(output, LMatrix(r - 1, params), output, params); /* in place: matrix_mul handles state == output */
+        xor_array(output, RConstant(r - 1, params), output, params->stateSizeWords);
+        xor_array(output, roundKey, output, params->stateSizeWords);
+    }
+
+}
+
+bool createRandomTape(const uint8_t* seed, uint8_t* tape,
+                      uint32_t tapeLengthBytes, paramset_t* params)
+{
+    HashInstance ctx;
+
+    if (tapeLengthBytes < params->digestSizeBytes) { /* tape first holds a digestSizeBytes-long intermediate hash, so it must be at least that big */
+        return false; /* nothing has been written to tape in this case */
+    }
+
+    /* Hash the seed and a constant, store the result in tape. */
+    HashInit(&ctx, params, HASH_PREFIX_2);
+    HashUpdate(&ctx, seed, params->seedSizeBytes);
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, tape, params->digestSizeBytes);
+
+    /* Expand the hashed seed and output length to create the tape. */
+    HashInit(&ctx, params, HASH_PREFIX_NONE);
+    HashUpdate(&ctx, tape, params->digestSizeBytes);
+    uint16_t outputBytesLE = toLittleEndian(tapeLengthBytes); /* NOTE: tapeLengthBytes is narrowed to 16 bits here */
+    HashUpdate(&ctx, (uint8_t*)&outputBytesLE, sizeof(uint16_t));
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, tape, tapeLengthBytes); /* overwrites the intermediate hash with the final tape */
+
+    return true;
+}
+
+void mpc_xor(uint32_t* state[3], uint32_t* in[3], uint32_t len, int players)
+{
+    uint8_t i;
+    for (i = 0; i < players; i++) { /* only the first `players` entries of state and in are used */
+        xor_array(state[i], in[i], state[i], len); /* state[i] ^= in[i]; len is a count of 32-bit words */
+    }
+}
+
+/* XOR the constant `in` into the first state vector only (state[0]); len is in 32-bit words. */
+void mpc_xor_constant(uint32_t* state[3], const uint32_t* in, uint32_t len)
+{
+    xor_array(state[0], in, state[0], len);
+}
+
+void mpc_xor_constant_verify(uint32_t* state[2], const uint32_t* in, uint32_t len, uint8_t challenge)
+{
+    /* During verify, where the first share is stored in state depends on the challenge */
+    if (challenge == 0) {
+        xor_array(state[0], in, state[0], len);
+    }
+    else if (challenge == 2) {
+        xor_array(state[1], in, state[1], len);
+    }
+} /* for any other challenge value (e.g. 1) neither entry of state is modified */
+
+
+void Commit(const uint8_t* seed, const view_t view,
+            uint8_t* hash, paramset_t* params)
+{
+    HashInstance ctx;
+
+    /* Hash the seed, store result in `hash` (used as scratch space for H_4(seed)) */
+    HashInit(&ctx, params, HASH_PREFIX_4);
+    HashUpdate(&ctx, seed, params->seedSizeBytes);
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, hash, params->digestSizeBytes);
+
+    /* Compute H_0(H_4(seed), view) */
+    HashInit(&ctx, params, HASH_PREFIX_0);
+    HashUpdate(&ctx, hash, params->digestSizeBytes);
+    HashUpdate(&ctx, (uint8_t*)view.inputShare, params->stateSizeBytes);
+    HashUpdate(&ctx, (uint8_t*)view.communicatedBits, params->andSizeBytes);
+    HashUpdate(&ctx, (uint8_t*)view.outputShare, params->stateSizeBytes);
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, hash, params->digestSizeBytes); /* hash now holds the digestSizeBytes-long commitment */
+}
+
+/* This is the random "permutation" function G for Unruh's transform */
+void G(uint8_t viewNumber, const uint8_t* seed, view_t* view, uint8_t* output, paramset_t* params)
+{
+    HashInstance ctx;
+    uint16_t outputBytes = params->seedSizeBytes + params->andSizeBytes; /* base length; grows below when viewNumber == 2 */
+
+    /* Hash the seed with H_5, store digest in output */
+    HashInit(&ctx, params, HASH_PREFIX_5);
+    HashUpdate(&ctx, seed, params->seedSizeBytes);
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, output, params->digestSizeBytes);
+
+    /* Hash H_5(seed), the view, and the length */
+    HashInit(&ctx, params, HASH_PREFIX_NONE);
+    HashUpdate(&ctx, output, params->digestSizeBytes);
+    if (viewNumber == 2) { /* only view 2 also absorbs its input share */
+        HashUpdate(&ctx, (uint8_t*)view->inputShare, params->stateSizeBytes);
+        outputBytes += (uint16_t)params->stateSizeBytes;
+    }
+    HashUpdate(&ctx, view->communicatedBits, params->andSizeBytes);
+
+    uint16_t outputBytesLE = toLittleEndian(outputBytes);
+    HashUpdate(&ctx, (uint8_t*)&outputBytesLE, sizeof(uint16_t));
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, output, outputBytes); /* output must have room for outputBytes bytes */
+}
+
+void setChallenge(uint8_t* challenge, size_t round, uint8_t trit)
+{
+    /* challenge must have length numBytes(numZKBRounds*2)
+     * 0 <= round < numZKBRounds
+     * trit must be in {0,1,2}; it is stored as two bits, low bit first */
+    uint32_t roundU32 = (uint32_t)round;
+
+    setBit(challenge, 2 * roundU32, trit & 1);
+    setBit(challenge, 2 * roundU32 + 1, (trit >> 1) & 1);
+}
+
+uint8_t getChallenge(const uint8_t* challenge, size_t round)
+{
+    uint32_t roundU32 = (uint32_t)round;
+
+    return (getBit(challenge, 2 * roundU32 + 1) << 1) | getBit(challenge, 2 * roundU32); /* bit 2*round is the low bit, bit 2*round+1 the high bit */
+}
+
+void H3(const uint32_t* circuitOutput, const uint32_t* plaintext, uint32_t** viewOutputs,
+        commitments_t* as,
+        uint8_t* challengeBits, const uint8_t* message, size_t messageByteLength,
+        g_commitments_t* gs, paramset_t* params)
+{
+    uint8_t* hash = malloc(params->digestSizeBytes); /* NOTE(review): the result is not checked for NULL before it is written to below */
+
+    HashInstance ctx;
+
+    /* Depending on the number of rounds, we might not set part of the last
+     * byte, make sure it's always zero. */
+    challengeBits[numBytes(params->numZKBRounds * 2) - 1] = 0;
+
+    /* Hash input data */
+    HashInit(&ctx, params, HASH_PREFIX_1);
+
+    /* Hash the output share from each view */
+    uint32_t i;
+    int j;
+    for (i = 0; i < params->numZKBRounds; i++) {
+        for (j = 0; j < 3; j++) {
+            HashUpdate(&ctx, (uint8_t*)VIEW_OUTPUTS(i, j), params->stateSizeBytes); /* VIEW_OUTPUTS indexes the viewOutputs parameter */
+        }
+    }
+
+    /* Hash all the commitments C */
+    for (i = 0; i < params->numZKBRounds; i++) {
+        for (j = 0; j < 3; j++) {
+            HashUpdate(&ctx, as[i].hashes[j], params->digestSizeBytes);
+        }
+    }
+
+    /* Hash all the commitments G (gs is only dereferenced for the Unruh transform) */
+    if (params->transform == TRANSFORM_UR) {
+        for (i = 0; i < params->numZKBRounds; i++) {
+            for (j = 0; j < 3; j++) {
+                size_t view3UnruhLength = (j == 2) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes; /* length of G[j]; only G[2] uses the with-input size */
+                HashUpdate(&ctx, gs[i].G[j], view3UnruhLength);
+            }
+        }
+    }
+
+    HashUpdate(&ctx, (uint8_t*)circuitOutput, params->stateSizeBytes);
+    HashUpdate(&ctx, (uint8_t*)plaintext, params->stateSizeBytes);
+    HashUpdate(&ctx, message, messageByteLength);
+
+    HashFinal(&ctx);
+    HashSqueeze(&ctx, hash, params->digestSizeBytes);
+
+    /* Convert hash to a packed string of values in {0,1,2} */
+    size_t byte_count, round = 0;
+    while (1) {
+        for (byte_count = 0; byte_count < params->digestSizeBytes; byte_count++) {
+            uint8_t byte = hash[byte_count];
+            /* iterate over each pair of bits in the byte, most significant pair first */
+            for (j = 0; j < 8; j += 2) {
+                uint8_t bitPair = ((byte >> (6 - j)) & 0x03);
+                if (bitPair < 3) { /* the pair value 3 is discarded so every challenge is in {0,1,2} */
+                    setChallenge(challengeBits, round, bitPair);
+                    round++;
+                    if (round == params->numZKBRounds) {
+                        goto done; /* all rounds have a challenge; leave the nested loops */
+                    }
+                }
+            }
+        }
+
+        /* We need more bits; set hash = H_1(hash) */
+        HashInit(&ctx, params, HASH_PREFIX_1);
+        HashUpdate(&ctx, hash, params->digestSizeBytes);
+        HashFinal(&ctx);
+        HashSqueeze(&ctx, hash, params->digestSizeBytes);
+    }
+
+done:
+
+    free(hash);
+    return;
+}
+
+/* Fill the caller-allocated proof with the data revealed for `challenge` (expected to be 0, 1 or 2). */
+void prove(proof_t* proof, uint8_t challenge, seeds_t* seeds,
+           view_t views[3], commitments_t* commitments, g_commitments_t* gs, paramset_t* params)
+{
+    const int second = (challenge + 1) % 3;    /* view opened after view `challenge` */
+    const int hidden = (challenge + 2) % 3;    /* view represented only by its commitment(s) */
+    switch (challenge) {                       /* copy the seeds of views `challenge` and `second` */
+    case 0:
+        memcpy(proof->seed1, seeds->seed0, params->seedSizeBytes);
+        memcpy(proof->seed2, seeds->seed1, params->seedSizeBytes);
+        break;
+    case 1:
+        memcpy(proof->seed1, seeds->seed1, params->seedSizeBytes);
+        memcpy(proof->seed2, seeds->seed2, params->seedSizeBytes);
+        break;
+    case 2:
+        memcpy(proof->seed1, seeds->seed2, params->seedSizeBytes);
+        memcpy(proof->seed2, seeds->seed0, params->seedSizeBytes);
+        break;
+    default:
+        assert(!"Invalid challenge");
+    }
+    if (challenge == 1 || challenge == 2) {
+        memcpy(proof->inputShare, views[2].inputShare, params->stateSizeBytes);
+    }
+    memcpy(proof->communicatedBits, views[second].communicatedBits, params->andSizeBytes);
+    memcpy(proof->view3Commitment, commitments->hashes[hidden], params->digestSizeBytes);
+    if (params->transform == TRANSFORM_UR) {
+        memcpy(proof->view3UnruhG, gs->G[hidden], (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes);
+    }
+}
+
+/* Two-view AND gate used when verifying: out[0] is recomputed, out[1] is read from view2's recorded bits. */
+void mpc_AND_verify(uint8_t in1[2], uint8_t in2[2], uint8_t out[2],
+                    randomTape_t* rand, view_t* view1, view_t* view2)
+{
+    const uint8_t mask0 = getBit(rand->tape[0], rand->pos);
+    const uint8_t mask1 = getBit(rand->tape[1], rand->pos);
+    out[0] = (in1[0] & in2[0]) ^ (in1[0] & in2[1]) ^ (in1[1] & in2[0]) ^ mask0 ^ mask1;
+    setBit(view1->communicatedBits, rand->pos, out[0]);
+    out[1] = getBit(view2->communicatedBits, rand->pos);
+    rand->pos += 1;     /* one tape position per AND gate */
+}
+
+/* S-box layer for verification, operating on the two opened shares.
+ * Walks bit positions 0 .. 3*params->numSboxes-1 in groups of three; every group costs three
+ * mpc_AND_verify calls. */
+void mpc_substitution_verify(uint32_t* state[2], randomTape_t* rand, view_t* view1,
+                             view_t* view2, paramset_t* params)
+{
+    uint8_t a[2], b[2], c[2];       /* S-box input bits, one entry per share */
+    uint8_t ab[2], bc[2], ca[2];    /* AND-gate outputs, one entry per share */
+    uint32_t bit;                   /* position of the lowest bit of the current group */
+    uint8_t share;
+
+    for (bit = 0; bit < params->numSboxes * 3; bit += 3) {
+        /* Read the group (c, b, a) at positions bit, bit+1, bit+2 of each share. */
+        for (share = 0; share < 2; share++) {
+            c[share] = getBitFromWordArray(state[share], bit);
+            b[share] = getBitFromWordArray(state[share], bit + 1);
+            a[share] = getBitFromWordArray(state[share], bit + 2);
+        }
+
+        /* Three AND gates per group; each call advances the tape by one position. */
+        mpc_AND_verify(a, b, ab, rand, view1, view2);
+        mpc_AND_verify(b, c, bc, rand, view1, view2);
+        mpc_AND_verify(c, a, ca, rand, view1, view2);
+
+        /* Write the S-box output back in place. */
+        for (share = 0; share < 2; share++) {
+            setBitInWordArray(state[share], bit + 2, a[share] ^ bc[share]);
+            setBitInWordArray(state[share], bit + 1, a[share] ^ b[share] ^ ca[share]);
+            setBitInWordArray(state[share], bit, a[share] ^ b[share] ^ c[share] ^ ab[share]);
+        }
+    }
+}
+
+/* Call matrix_mul once per share index 0 .. players-1, passing state[p], the shared matrix and output[p]. */
+void mpc_matrix_mul(uint32_t* state[3], const uint32_t* matrix, uint32_t* output[3], paramset_t* params, size_t players)
+{
+    uint32_t p = 0;
+    for (; p < players; ++p) {
+        matrix_mul(state[p], matrix, output[p], params);
+    }
+}
+
+void mpc_LowMC_verify(view_t* view1, view_t* view2,
+                      randomTape_t* tapes, uint32_t* tmp,
+                      const uint32_t* plaintext, paramset_t* params, uint8_t challenge)
+{
+    uint32_t* state[2];
+    uint32_t* keyShares[2];
+    uint32_t* roundKey[2];
+    /* Two-share circuit evaluation for verification; tmp is carved into four stateSizeWords regions: two round keys, then two states. */
+    roundKey[0] = tmp;
+    roundKey[1] = roundKey[0] + params->stateSizeWords;
+    state[0] = roundKey[1] + params->stateSizeWords;
+    state[1] = state[0] + params->stateSizeWords;
+
+    // initialize both roundkeys to 0. they are contiguous, so a single memset covers both
+    memset(roundKey[0], 0, 2 * params->stateSizeBytes);
+
+    uint32_t i, r;
+    for (i = 0; i < 2; i++) {
+        memset(state[i], 0x00, params->stateSizeBytes);
+    }
+    mpc_xor_constant_verify(state, plaintext, params->stateSizeWords, challenge);
+    /* The key shares are the input shares of the two opened views */
+    keyShares[0] = view1->inputShare;
+    keyShares[1] = view2->inputShare;
+    /* Round key 0 is computed from the key shares and combined into the state before the first round */
+    mpc_matrix_mul(keyShares, KMatrix(0, params), roundKey, params, 2);
+    mpc_xor(state, roundKey, params->stateSizeWords, 2);
+    /* Per round: derive round key r, S-box layer, LMatrix(r-1), RConstant(r-1), then combine the round key */
+    for (r = 1; r <= params->numRounds; ++r) {
+        mpc_matrix_mul(keyShares, KMatrix(r, params), roundKey, params, 2);
+        mpc_substitution_verify(state, tapes, view1, view2, params);
+        mpc_matrix_mul(state, LMatrix(r - 1, params), state, params, 2);    /* input and output are the same buffers */
+        mpc_xor_constant_verify(state, RConstant(r - 1, params), params->stateSizeWords, challenge);
+        mpc_xor(state, roundKey, params->stateSizeWords, 2);
+    }
+    /* The final states become the two views' output shares */
+    memcpy(view1->outputShare, state[0], params->stateSizeBytes);
+    memcpy(view2->outputShare, state[1], params->stateSizeBytes);
+}
+
+void verifyProof(const proof_t* proof, view_t* view1, view_t* view2,
+                 uint8_t challenge, uint8_t* tmp,
+                 const uint32_t* plaintext, randomTape_t* tape, paramset_t* params)
+{
+    memcpy(view2->communicatedBits, proof->communicatedBits, params->andSizeBytes);    /* view2's bits come straight from the proof */
+    tape->pos = 0;
+    /* Rebuilds the input shares and random tapes of the two opened views for this challenge, then re-runs the circuit on them. */
+    bool status = false;    /* NOTE(review): records createRandomTape failures but is never returned or checked after the switch */
+    switch (challenge) {
+    case 0:
+        // in this case, both views' input shares are derived from their seeds
+
+        status = createRandomTape(proof->seed1, tmp, params->stateSizeBytes + params->andSizeBytes, params);
+        memcpy(view1->inputShare, tmp, params->stateSizeBytes);     /* tmp layout: input share first, tape bytes after it */
+        memcpy(tape->tape[0], tmp + params->stateSizeBytes, params->andSizeBytes);
+        status = status && createRandomTape(proof->seed2, tmp, params->stateSizeBytes + params->andSizeBytes, params);
+        if (!status) {
+            break;
+        }
+        memcpy(view2->inputShare, tmp, params->stateSizeBytes);
+        memcpy(tape->tape[1], tmp + params->stateSizeBytes, params->andSizeBytes);
+        break;
+
+    case 1:
+        // in this case view2's input share was already given to us explicitly as
+        // it is not computable from the seed. We just need to compute view1's input from
+        // its seed
+        status = createRandomTape(proof->seed1, tmp, params->stateSizeBytes + params->andSizeBytes, params);
+        memcpy(view1->inputShare, tmp, params->stateSizeBytes);
+        memcpy(tape->tape[0], tmp + params->stateSizeBytes, params->andSizeBytes);
+        status = status && createRandomTape(proof->seed2, tape->tape[1], params->andSizeBytes, params);  /* tape only: the share is in the proof */
+        if (!status) {
+            break;
+        }
+        memcpy(view2->inputShare, proof->inputShare, params->stateSizeBytes);
+        break;
+
+    case 2:
+        // in this case view1's input share was already given to us explicitly as
+        // it is not computable from the seed. We just need to compute view2's input from
+        // its seed
+        status = createRandomTape(proof->seed1, tape->tape[0], params->andSizeBytes, params);  /* tape only: the share is in the proof */
+        memcpy(view1->inputShare, proof->inputShare, params->stateSizeBytes);
+        status = status && createRandomTape(proof->seed2, tmp, params->stateSizeBytes + params->andSizeBytes, params);
+        if (!status) {
+            break;
+        }
+        memcpy(view2->inputShare, tmp, params->stateSizeBytes);
+        memcpy(tape->tape[1], tmp + params->stateSizeBytes, params->andSizeBytes);
+        break;
+
+    default:    /* NOTE(review): an out-of-range challenge sets up no share or tape, yet the circuit below still runs */
+        break;
+    }
+    /* tmp is reused as the scratch area of the circuit evaluation */
+    mpc_LowMC_verify(view1, view2, tape, (uint32_t*)tmp, plaintext, params, challenge);
+}
+
+/* Check sig against message: rebuild the two opened views of every round, recompute the challenge with
+ * H3 and compare it with the challenge stored in the signature. Returns EXIT_SUCCESS on a match;
+ * EXIT_FAILURE on a mismatch, an out-of-range challenge value or a failed allocation. */
+int verify(signature_t* sig, const uint32_t* pubKey, const uint32_t* plaintext,
+           const uint8_t* message, size_t messageByteLength, paramset_t* params)
+{
+    commitments_t* as = allocateCommitments(params);
+    g_commitments_t* gs = allocateGCommitments(params);
+    const proof_t* proofs = sig->proofs;
+    const uint8_t* received_challengebits = sig->challengeBits;
+    const size_t challengeLen = numBytes(2 * params->numZKBRounds);
+    int status = EXIT_SUCCESS;
+    size_t i, j, roundsAllocated = 0;       /* roundsAllocated: rounds whose views must be released */
+
+    uint32_t** viewOutputs = malloc(params->numZKBRounds * 3 * sizeof(uint32_t*));
+    uint8_t* computed_challengebits = malloc(challengeLen);
+    uint8_t* tmp = malloc(MAX(6 * params->stateSizeBytes, params->stateSizeBytes + params->andSizeBytes));
+    randomTape_t* tape = (randomTape_t*)malloc(sizeof(randomTape_t));
+    view_t* view1s = malloc(params->numZKBRounds * sizeof(view_t));
+    view_t* view2s = malloc(params->numZKBRounds * sizeof(view_t));
+    /* Allocate a slab of memory for the 3rd view's output in each round */
+    uint32_t* view3Slab = malloc(params->stateSizeBytes * params->numZKBRounds);
+    uint32_t* view3Output = view3Slab;     /* pointer into the slab to the current 3rd view */
+
+    if (viewOutputs == NULL || computed_challengebits == NULL || tmp == NULL || tape == NULL ||
+        view1s == NULL || view2s == NULL || view3Slab == NULL) {
+        status = EXIT_FAILURE;      /* never report success when the check could not be carried out */
+        goto release;
+    }
+    allocateRandomTape(tape, params);
+
+    for (i = 0; i < params->numZKBRounds; i++) {
+        /* The challenge comes from the signature, i.e. from untrusted input. Two bits can encode 3, which
+         * would index hashes[3] / G[3], past the three per-round entries used throughout this file. */
+        uint8_t challenge = getChallenge(received_challengebits, i);
+        if (challenge > 2) {
+            status = EXIT_FAILURE;
+            break;
+        }
+        allocateView(&view1s[i], params);
+        allocateView(&view2s[i], params);
+        roundsAllocated = i + 1;
+        // last bits of communicatedBits may not be set so zero them
+        view1s[i].communicatedBits[params->andSizeBytes - 1] = 0;
+
+        verifyProof(&proofs[i], &view1s[i], &view2s[i], challenge, tmp, plaintext, tape, params);
+
+        // recompute the commitments of the two opened views; slots are ordered by the challenge
+        Commit(proofs[i].seed1, view1s[i], as[i].hashes[challenge], params);
+        Commit(proofs[i].seed2, view2s[i], as[i].hashes[(challenge + 1) % 3], params);
+        memcpy(as[i].hashes[(challenge + 2) % 3], proofs[i].view3Commitment, params->digestSizeBytes);
+
+        if (params->transform == TRANSFORM_UR) {
+            G(challenge, proofs[i].seed1, &view1s[i], gs[i].G[challenge], params);
+            G((challenge + 1) % 3, proofs[i].seed2, &view2s[i], gs[i].G[(challenge + 1) % 3], params);
+            size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
+            memcpy(gs[i].G[(challenge + 2) % 3], proofs[i].view3UnruhG, view3UnruhLength);
+        }
+
+        VIEW_OUTPUTS(i, challenge) = view1s[i].outputShare;
+        VIEW_OUTPUTS(i, (challenge + 1) % 3) = view2s[i].outputShare;
+        for (j = 0; j < params->stateSizeWords; j++) {
+            view3Output[j] = view1s[i].outputShare[j] ^ view2s[i].outputShare[j] ^ pubKey[j];
+        }
+        VIEW_OUTPUTS(i, (challenge + 2) % 3) = view3Output;
+        view3Output += params->stateSizeWords;
+    }
+
+    if (status == EXIT_SUCCESS) {
+        H3(pubKey, plaintext, viewOutputs, as, computed_challengebits, message, messageByteLength, gs, params);
+        if (memcmp(received_challengebits, computed_challengebits, challengeLen) != 0) {
+            status = EXIT_FAILURE;
+        }
+    }
+
+    for (i = 0; i < roundsAllocated; i++) {     /* only rounds that were actually set up */
+        freeView(&view1s[i]);
+        freeView(&view2s[i]);
+    }
+    freeRandomTape(tape);
+release:
+    free(computed_challengebits);
+    free(view3Slab);
+    free(view1s);
+    free(view2s);
+    free(tmp);
+    free(tape);
+    free(viewOutputs);
+    freeCommitments(as);
+    freeGCommitments(gs);
+    return status;
+}
+
+/*** Functions implementing Sign ***/
+
+/* Three-party AND gate: computes each party's output bit and records it in that party's view. */
+void mpc_AND(uint8_t in1[3], uint8_t in2[3], uint8_t out[3], randomTape_t* rand, view_t views[3])
+{
+    uint8_t mask[3];        /* current tape bit of each party */
+    uint8_t p, next;
+    for (p = 0; p < 3; p++) {
+        mask[p] = getBit(rand->tape[p], rand->pos);
+    }
+    for (p = 0; p < 3; p++) {
+        next = (p + 1) % 3;
+        out[p] = (in1[p] & in2[p]) ^ (in1[p] & in2[next]) ^ (in1[next] & in2[p]) ^ mask[p] ^ mask[next];
+        setBit(views[p].communicatedBits, rand->pos, out[p]);
+    }
+    rand->pos += 1;         /* one tape position per AND gate */
+}
+
+/* S-box layer for signing, operating on all three shares.
+ * Walks bit positions 0 .. 3*params->numSboxes-1 in groups of three; every group costs three
+ * mpc_AND calls. */
+void mpc_substitution(uint32_t* state[3], randomTape_t* rand, view_t views[3],
+                      paramset_t* params)
+{
+    uint8_t a[3], b[3], c[3];       /* S-box input bits, one entry per share */
+    uint8_t ab[3], bc[3], ca[3];    /* AND-gate outputs, one entry per share */
+    uint32_t bit;                   /* position of the lowest bit of the current group */
+    uint8_t share;
+
+    for (bit = 0; bit < params->numSboxes * 3; bit += 3) {
+        /* Read the group (c, b, a) at positions bit, bit+1, bit+2 of each share. */
+        for (share = 0; share < 3; share++) {
+            c[share] = getBitFromWordArray(state[share], bit);
+            b[share] = getBitFromWordArray(state[share], bit + 1);
+            a[share] = getBitFromWordArray(state[share], bit + 2);
+        }
+
+        /* Three AND gates per group; each call advances the tape by one position. */
+        mpc_AND(a, b, ab, rand, views);
+        mpc_AND(b, c, bc, rand, views);
+        mpc_AND(c, a, ca, rand, views);
+
+        /* Write the S-box output back in place. */
+        for (share = 0; share < 3; share++) {
+            setBitInWordArray(state[share], bit + 2, a[share] ^ bc[share]);
+            setBitInWordArray(state[share], bit + 1, a[share] ^ b[share] ^ ca[share]);
+            setBitInWordArray(state[share], bit, a[share] ^ b[share] ^ c[share] ^ ab[share]);
+        }
+    }
+}
+
+void mpc_LowMC(randomTape_t* tapes, view_t views[3],
+               const uint32_t* plaintext, uint32_t* slab, paramset_t* params)
+{
+    uint32_t* keyShares[3];
+    uint32_t* state[3];
+    uint32_t* roundKey[3];
+    /* Three-share circuit evaluation for signing; slab is carved into six stateSizeWords regions: three round keys, then three states. */
+    roundKey[0] = slab;
+    roundKey[1] = slab + params->stateSizeWords;
+    roundKey[2] = roundKey[1] + params->stateSizeWords;
+    state[0] = roundKey[2] + params->stateSizeWords;
+    state[1] = state[0] + params->stateSizeWords;
+    state[2] = state[1] + params->stateSizeWords;
+
+    memset(roundKey[0], 0, 3 * params->stateSizeBytes);    /* the three round keys are contiguous, so one memset clears them all */
+    int i;
+    for (i = 0; i < 3; i++) {
+        keyShares[i] = views[i].inputShare;     /* each party's key share is its view's input share */
+        memset(state[i], 0x00, params->stateSizeBytes);
+    }
+    mpc_xor_constant(state, plaintext, params->stateSizeWords);
+    /* Round key 0 is computed from the key shares and combined into the state before the first round */
+    mpc_matrix_mul(keyShares, KMatrix(0, params), roundKey, params, 3);
+    mpc_xor(state, roundKey, params->stateSizeWords, 3);
+    /* Per round: derive round key r, S-box layer, LMatrix(r-1), RConstant(r-1), then combine the round key */
+    uint32_t r;
+    for (r = 1; r <= params->numRounds; r++) {
+        mpc_matrix_mul(keyShares, KMatrix(r, params), roundKey, params, 3);
+        mpc_substitution(state, tapes, views, params);
+        mpc_matrix_mul(state, LMatrix(r - 1, params), state, params, 3);    /* input and output are the same buffers */
+        mpc_xor_constant(state, RConstant(r - 1, params), params->stateSizeWords);
+        mpc_xor(state, roundKey, params->stateSizeWords, 3);
+    }
+    /* The final states become the three views' output shares */
+    for (i = 0; i < 3; i++) {
+        memcpy(views[i].outputShare, state[i], params->stateSizeBytes);
+    }
+
+}
+
+void runMPC(view_t views[3], randomTape_t* rand,
+            uint32_t* plaintext, uint32_t* slab, paramset_t* params)
+{   /* Thin entry point: rewinds the tape, then runs mpc_LowMC on the three views. */
+    rand->pos = 0;      /* every run reads the tape from position 0 */
+    mpc_LowMC(rand, views, plaintext, slab, params);
+}
+
+
+seeds_t* computeSeeds(uint32_t* privateKey, uint32_t*
+                      publicKey, uint32_t* plaintext, const uint8_t* message, size_t messageByteLength, paramset_t* params)
+{
+    HashInstance ctx;
+    seeds_t* allSeeds = allocateSeeds(params);      /* NOTE(review): result is used without a NULL check */
+    /* Derives every seed deterministically by hashing: private key, message, public key, plaintext, state size in bits. */
+    HashInit(&ctx, params, HASH_PREFIX_NONE);
+    HashUpdate(&ctx, (uint8_t*)privateKey, params->stateSizeBytes);
+    HashUpdate(&ctx, message, messageByteLength);
+    HashUpdate(&ctx, (uint8_t*)publicKey, params->stateSizeBytes);
+    HashUpdate(&ctx, (uint8_t*)plaintext, params->stateSizeBytes);
+    uint16_t stateSizeBitsLE = toLittleEndian((uint16_t)params->stateSizeBits);     /* fixed byte order so the hash input is platform independent */
+    HashUpdate(&ctx, ((uint8_t*)&stateSizeBitsLE), sizeof(uint16_t));
+    HashFinal(&ctx);
+    /* One squeeze writes 3 seeds per round starting at seed (0,0); presumably all seeds are contiguous - TODO confirm against allocateSeeds */
+    HashSqueeze(&ctx, getSeed(allSeeds, 0, 0), params->seedSizeBytes * 3 * params->numZKBRounds);
+
+    return allSeeds;
+}
+
+/* Produce a signature over message into the caller-allocated sig: run the three-party circuit once per
+ * round, commit to every view, derive the challenge with H3 and pack one proof per round.
+ * Returns EXIT_SUCCESS, or EXIT_FAILURE if a tape or buffer cannot be created; temporaries are freed on both paths. */
+int sign(uint32_t* privateKey, uint32_t* pubKey, uint32_t* plaintext, const uint8_t* message,
+         size_t messageByteLength, signature_t* sig, paramset_t* params)
+{
+    int ret = EXIT_FAILURE;                 /* switched to success only after every proof is packed */
+    uint32_t** viewOutputs = NULL;          /* allocated late; free(NULL) is harmless on early exit */
+    uint32_t output[LOWMC_MAX_STATE_SIZE];
+    uint32_t k, w;
+    int j;
+    size_t i, jj;
+
+    /* Allocate views and commitments for all parallel iterations */
+    view_t** views = allocateViews(params);
+    commitments_t* as = allocateCommitments(params);
+    g_commitments_t* gs = allocateGCommitments(params);
+
+    /* Compute seeds for all parallel iterations */
+    seeds_t* seeds = computeSeeds(privateKey, pubKey, plaintext, message, messageByteLength, params);
+
+    //Allocate a random tape (re-used per parallel iteration), and a temporary buffer
+    randomTape_t tape;
+    uint8_t* tmp = malloc(MAX(9 * params->stateSizeBytes, params->stateSizeBytes + params->andSizeBytes));
+
+    allocateRandomTape(&tape, params);
+    if (tmp == NULL) {
+        goto cleanup;
+    }
+
+    for (k = 0; k < params->numZKBRounds; k++) {
+        // for first two players get all tape INCLUDING INPUT SHARE from seed
+        for (j = 0; j < 2; j++) {
+            if (!createRandomTape(getSeed(seeds, k, j), tmp, params->stateSizeBytes + params->andSizeBytes, params)) {
+                goto cleanup;       /* previously returned here and leaked every allocation above */
+            }
+            memcpy(views[k][j].inputShare, tmp, params->stateSizeBytes);
+            memcpy(tape.tape[j], tmp + params->stateSizeBytes, params->andSizeBytes);
+        }
+        // Now set third party's wires. The random bits are from the seed, the input is
+        // the XOR of other two inputs and the private key
+        if (!createRandomTape(getSeed(seeds, k, 2), tape.tape[2], params->andSizeBytes, params)) {
+            goto cleanup;
+        }
+        for (w = 0; w < params->stateSizeWords; w++) {
+            views[k][2].inputShare[w] = privateKey[w] ^ views[k][0].inputShare[w] ^ views[k][1].inputShare[w];
+        }
+
+        runMPC(views[k], &tape, plaintext, (uint32_t*)tmp, params);
+
+        //Committing
+        Commit(getSeed(seeds, k, 0), views[k][0], as[k].hashes[0], params);
+        Commit(getSeed(seeds, k, 1), views[k][1], as[k].hashes[1], params);
+        Commit(getSeed(seeds, k, 2), views[k][2], as[k].hashes[2], params);
+        if (params->transform == TRANSFORM_UR) {
+            G(0, getSeed(seeds, k, 0), &views[k][0], gs[k].G[0], params);
+            G(1, getSeed(seeds, k, 1), &views[k][1], gs[k].G[1], params);
+            G(2, getSeed(seeds, k, 2), &views[k][2], gs[k].G[2], params);
+        }
+    }
+
+    //Generating challenges
+    viewOutputs = malloc(params->numZKBRounds * 3 * sizeof(uint32_t*));
+    if (viewOutputs == NULL) {
+        goto cleanup;
+    }
+    for (i = 0; i < params->numZKBRounds; i++) {
+        for (jj = 0; jj < 3; jj++) {
+            VIEW_OUTPUTS(i, jj) = views[i][jj].outputShare;
+        }
+    }
+
+    /* The circuit output is the XOR of the three output shares of round 0 */
+    for (w = 0; w < params->stateSizeWords; w++) {
+        output[w] = (VIEW_OUTPUTS(0, 0))[w] ^ (VIEW_OUTPUTS(0, 1))[w] ^ (VIEW_OUTPUTS(0, 2))[w];
+    }
+    H3(output, plaintext, viewOutputs, as,
+       sig->challengeBits, message, messageByteLength, gs, params);
+
+    //Packing Z
+    for (i = 0; i < params->numZKBRounds; i++) {
+        prove(&sig->proofs[i], getChallenge(sig->challengeBits, i), &seeds[i],
+              views[i], &as[i], (gs == NULL) ? NULL : &gs[i], params);
+    }
+    ret = EXIT_SUCCESS;
+
+cleanup:
+    free(tmp);
+    freeViews(views, params);
+    freeCommitments(as);
+    freeRandomTape(&tape);
+    freeGCommitments(gs);
+    free(viewOutputs);
+    freeSeeds(seeds);
+
+    return ret;
+}
+
+/*** Serialization functions ***/
+
+/* Pack sig into sigBytes: the challenge bits, followed by one variable-length record per round.
+ * Returns the number of bytes written, or -1 when sigBytesLen is smaller than the size bound below. */
+int serializeSignature(const signature_t* sig, uint8_t* sigBytes, size_t sigBytesLen, paramset_t* params)
+{
+    const proof_t* proofs = sig->proofs;
+    const uint8_t* challengeBits = sig->challengeBits;
+    uint8_t* cursor = sigBytes;     /* next byte to be written */
+    size_t round;
+
+    /* Validate input buffer is large enough (the bound counts an input share for every round) */
+    size_t bytesRequired = numBytes(2 * params->numZKBRounds) +
+                           params->numZKBRounds * (2 * params->seedSizeBytes + params->stateSizeBytes + params->andSizeBytes + params->digestSizeBytes);
+    if (params->transform == TRANSFORM_UR) {
+        bytesRequired += params->UnruhGWithoutInputBytes * params->numZKBRounds;
+    }
+    if (sigBytesLen < bytesRequired) {
+        return -1;
+    }
+
+    /* Header: the packed challenge bits */
+    memcpy(cursor, challengeBits, numBytes(2 * params->numZKBRounds));
+    cursor += numBytes(2 * params->numZKBRounds);
+
+    for (round = 0; round < params->numZKBRounds; round++) {
+        const proof_t* p = &proofs[round];
+        const uint8_t challenge = getChallenge(challengeBits, round);
+
+        /* Commitment of the view that is not opened */
+        memcpy(cursor, p->view3Commitment, params->digestSizeBytes);
+        cursor += params->digestSizeBytes;
+
+        if (params->transform == TRANSFORM_UR) {
+            const size_t gLen = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
+            memcpy(cursor, p->view3UnruhG, gLen);
+            cursor += gLen;
+        }
+
+        memcpy(cursor, p->communicatedBits, params->andSizeBytes);
+        cursor += params->andSizeBytes;
+
+        memcpy(cursor, p->seed1, params->seedSizeBytes);
+        cursor += params->seedSizeBytes;
+
+        memcpy(cursor, p->seed2, params->seedSizeBytes);
+        cursor += params->seedSizeBytes;
+
+        /* The input share is only stored for challenges 1 and 2 */
+        if (challenge == 1 || challenge == 2) {
+            memcpy(cursor, p->inputShare, params->stateSizeBytes);
+            cursor += params->stateSizeBytes;
+        }
+    }
+
+    return (int)(cursor - sigBytes);
+}
+
+
+/* Total number of input-share bytes present in a serialized signature.
+ * A share (stateSizeBytes long) is stored only for rounds whose challenge is 1 or 2, so the
+ * expected size when deserializing is found by scanning the challenge bits. */
+static size_t computeInputShareSize(const uint8_t* challengeBits, size_t stateSizeBytes, paramset_t* params)
+{
+    size_t roundsWithShare = 0;
+    size_t round;
+
+    for (round = 0; round < params->numZKBRounds; round++) {
+        const uint8_t challenge = getChallenge(challengeBits, round);
+        if (challenge == 1 || challenge == 2) {
+            roundsWithShare++;
+        }
+    }
+
+    return roundsWithShare * stateSizeBytes;
+}
+
+/* Parse a serialized signature from sigBytes into the caller-allocated sig. The input is untrusted: every
+ * 2-bit challenge is validated (3 is not a challenge) and the exact number of bytes the copy loop reads
+ * is checked before anything is copied. Returns EXIT_SUCCESS, or EXIT_FAILURE for short or malformed input. */
+int deserializeSignature(signature_t* sig, const uint8_t* sigBytes,
+                         size_t sigBytesLen, paramset_t* params)
+{
+    proof_t* proofs = sig->proofs;
+    uint8_t* challengeBits = sig->challengeBits;
+    const size_t challengeLen = numBytes(2 * params->numZKBRounds);
+    size_t i, zeroChallenges = 0;       /* zeroChallenges: number of rounds whose challenge is 0 */
+    if (sigBytesLen < challengeLen) {     /* ensure the input has at least the challenge */
+        return EXIT_FAILURE;
+    }
+    for (i = 0; i < params->numZKBRounds; i++) {
+        const uint8_t challenge = getChallenge(sigBytes, i);
+        if (challenge > 2) {
+            return EXIT_FAILURE;
+        }
+        zeroChallenges += (challenge == 0);
+    }
+    /* Size check mirroring the copy loop: UR rounds with challenge 0 read UnruhGWithInputBytes, the others UnruhGWithoutInputBytes */
+    size_t inputShareSize = computeInputShareSize(sigBytes, params->stateSizeBytes, params);
+    size_t bytesExpected = challengeLen +
+                           params->numZKBRounds * (2 * params->seedSizeBytes + params->andSizeBytes + params->digestSizeBytes) + inputShareSize;
+    if (params->transform == TRANSFORM_UR) {
+        bytesExpected += zeroChallenges * params->UnruhGWithInputBytes + (params->numZKBRounds - zeroChallenges) * params->UnruhGWithoutInputBytes;
+    }
+    if (sigBytesLen < bytesExpected) {
+        return EXIT_FAILURE;
+    }
+
+    memcpy(challengeBits, sigBytes, challengeLen);
+    sigBytes += challengeLen;
+    for (i = 0; i < params->numZKBRounds; i++) {
+        const uint8_t challenge = getChallenge(challengeBits, i);
+        memcpy(proofs[i].view3Commitment, sigBytes, params->digestSizeBytes);
+        sigBytes += params->digestSizeBytes;
+        if (params->transform == TRANSFORM_UR) {
+            const size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes;
+            memcpy(proofs[i].view3UnruhG, sigBytes, view3UnruhLength);
+            sigBytes += view3UnruhLength;
+        }
+        memcpy(proofs[i].communicatedBits, sigBytes, params->andSizeBytes);
+        sigBytes += params->andSizeBytes;
+        memcpy(proofs[i].seed1, sigBytes, params->seedSizeBytes);
+        sigBytes += params->seedSizeBytes;
+        memcpy(proofs[i].seed2, sigBytes, params->seedSizeBytes);
+        sigBytes += params->seedSizeBytes;
+        if (challenge == 1 || challenge == 2) {
+            memcpy(proofs[i].inputShare, sigBytes, params->stateSizeBytes);
+            sigBytes += params->stateSizeBytes;
+        }
+    }
+    return EXIT_SUCCESS;
+}
+
+
+
+
-- 
GitLab