From dfc5302017c60553c25ad59ac9191b20de237c53 Mon Sep 17 00:00:00 2001 From: Dmitriy Gerasimov <naeper@demlabs.net> Date: Mon, 8 Feb 2021 21:54:41 +0700 Subject: [PATCH] [+] More BSD fixes --- dap-sdk/crypto/CMakeLists.txt | 31 +- dap-sdk/crypto/include/dap_crypto_common.h | 3 + dap-sdk/crypto/src/msrln/AMD64/consts.c | 80 +- dap-sdk/crypto/src/msrln/AMD64/error_asm.S | 872 ++++---- dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c | 130 +- dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S | 1956 ++++++++--------- dap-sdk/crypto/src/msrln/kex.c | 1287 ++++++----- dap-sdk/crypto/src/msrln/makefile | 188 +- dap-sdk/crypto/src/msrln/msrln.h | 272 +-- dap-sdk/crypto/src/msrln/msrln.pri | 12 +- dap-sdk/crypto/src/msrln/msrln_priv.h | 228 +- dap-sdk/crypto/src/msrln/random.c | 180 +- dap-sdk/crypto/src/sig_picnic/picnic_impl.c | 1996 +++++++++--------- 13 files changed, 3628 insertions(+), 3607 deletions(-) diff --git a/dap-sdk/crypto/CMakeLists.txt b/dap-sdk/crypto/CMakeLists.txt index 9e9660289a..1896bd9b0c 100755 --- a/dap-sdk/crypto/CMakeLists.txt +++ b/dap-sdk/crypto/CMakeLists.txt @@ -64,13 +64,34 @@ if(WIN32) endif() if(UNIX) - if(BUILD_64) - file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s ) - else() - file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c ) - endif() + if (LINUX) + if(BUILD_64) + file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s ) + else() + file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c ) + endif() + elseif(APPLE) + if(BUILD_64) + file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas_Apple.s ) + else() + file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c ) + endif() + elseif(BSD) + if(BUILD_64) + file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Compact64/KeccakP-1600-compact64.c ) + else() + file( GLOB 
XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c ) + endif() + else() + file( GLOB XKCP_SRCS2 src/XKCP/lib/low/KeccakP-1600/Inplace32BI/KeccakP-1600-inplace32BI.c ) + endif() + endif() + + + + add_library(${PROJECT_NAME} STATIC ${CRYPTO_SRCS} ${XKCP_SRCS} ${XKCP_SRCS2} ${CRYPTO_HEADERS} ) target_include_directories(dap_crypto PRIVATE src/seed src/rand src/iaes src/oaes src/sha3 src/msrln src/defeo_scheme src/sig_bliss src/sig_tesla src/sig_picnic src/sig_dilithium src include) diff --git a/dap-sdk/crypto/include/dap_crypto_common.h b/dap-sdk/crypto/include/dap_crypto_common.h index f213209536..67a3db6a94 100755 --- a/dap-sdk/crypto/include/dap_crypto_common.h +++ b/dap-sdk/crypto/include/dap_crypto_common.h @@ -34,11 +34,14 @@ extern "C" { #define OS_TARGET OS_LINUX #elif defined(__APPLE__) // MACOS #define OS_TARGET OS_MACOS +#elif defined (DAP_OS_BSD) + #define OS_TARGET_OS_BSD #else #error -- "Unsupported OS" #endif + // Definition of compiler #define COMPILER_VC 1 diff --git a/dap-sdk/crypto/src/msrln/AMD64/consts.c b/dap-sdk/crypto/src/msrln/AMD64/consts.c index 3ff24cbb00..9d45871ba4 100755 --- a/dap-sdk/crypto/src/msrln/AMD64/consts.c +++ b/dap-sdk/crypto/src/msrln/AMD64/consts.c @@ -1,40 +1,40 @@ -/**************************************************************************************** -* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library -* -* Copyright (c) Microsoft Corporation. All rights reserved. 
-* -* -* Abstract: constants for the x64 assembly implementation -* -*****************************************************************************************/ - -#include "../LatticeCrypto_priv.h" -#include <stdint.h> - - -uint32_t PRIME8x[8] = {PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q}; -uint8_t ONE32x[32] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; -uint32_t MASK12x8[8] = {0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff}; -uint32_t PERM0246[4] = {0,2,4,6}; -uint32_t PERM00224466[8] = {0,0,2,2,4,4,6,6}; -uint32_t PERM02134657[8] = {0,2,1,3,4,6,5,7}; -uint64_t PERM0145[4] = {0,1,4,5}; -uint64_t PERM2367[4] = {2,3,6,7}; -uint64_t MASK32[4] = {0xffffffff,0,0xffffffff,0}; -uint64_t MASK42[4] = {0x3fff0000000,0,0x3fff0000000,0}; - -uint64_t MASK14_1[4] = {0x3fff,0,0x3fff,0}; -uint64_t MASK14_2[4] = {0xFFFC000,0,0xFFFC000,0}; -uint64_t MASK14_3[4] = {0x3FFF0000000,0,0x3FFF0000000,0}; -uint64_t MASK14_4[4] = {0xFFFC0000000000,0,0xFFFC0000000000,0}; - -uint32_t ONE8x[8] = {1,1,1,1,1,1,1,1}; -uint32_t THREE8x[8] = {3,3,3,3,3,3,3,3}; -uint32_t FOUR8x[8] = {4,4,4,4,4,4,4,4}; -uint32_t PARAM_Q4x8[8] = {3073,3073,3073,3073,3073,3073,3073,3073}; -uint32_t PARAM_3Q4x8[8] = {9217,9217,9217,9217,9217,9217,9217,9217}; -uint32_t PARAM_5Q4x8[8] = {15362,15362,15362,15362,15362,15362,15362,15362}; -uint32_t PARAM_7Q4x8[8] = {21506,21506,21506,21506,21506,21506,21506,21506}; -uint32_t PARAM_Q2x8[8] = {6145,6145,6145,6145,6145,6145,6145,6145}; -uint32_t PARAM_3Q2x8[8] = {18434,18434,18434,18434,18434,18434,18434,18434}; - +/**************************************************************************************** +* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library +* +* Copyright (c) Microsoft Corporation. All rights reserved. 
+* +* +* Abstract: constants for the x64 assembly implementation +* +*****************************************************************************************/ + +#include "../LatticeCrypto_priv.h" +#include <stdint.h> + + +uint32_t PRIME8x[8] = {PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q}; +uint8_t ONE32x[32] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; +uint32_t MASK12x8[8] = {0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff}; +uint32_t PERM0246[4] = {0,2,4,6}; +uint32_t PERM00224466[8] = {0,0,2,2,4,4,6,6}; +uint32_t PERM02134657[8] = {0,2,1,3,4,6,5,7}; +uint64_t PERM0145[4] = {0,1,4,5}; +uint64_t PERM2367[4] = {2,3,6,7}; +uint64_t MASK32[4] = {0xffffffff,0,0xffffffff,0}; +uint64_t MASK42[4] = {0x3fff0000000,0,0x3fff0000000,0}; + +uint64_t MASK14_1[4] = {0x3fff,0,0x3fff,0}; +uint64_t MASK14_2[4] = {0xFFFC000,0,0xFFFC000,0}; +uint64_t MASK14_3[4] = {0x3FFF0000000,0,0x3FFF0000000,0}; +uint64_t MASK14_4[4] = {0xFFFC0000000000,0,0xFFFC0000000000,0}; + +uint32_t ONE8x[8] = {1,1,1,1,1,1,1,1}; +uint32_t THREE8x[8] = {3,3,3,3,3,3,3,3}; +uint32_t FOUR8x[8] = {4,4,4,4,4,4,4,4}; +uint32_t PARAM_Q4x8[8] = {3073,3073,3073,3073,3073,3073,3073,3073}; +uint32_t PARAM_3Q4x8[8] = {9217,9217,9217,9217,9217,9217,9217,9217}; +uint32_t PARAM_5Q4x8[8] = {15362,15362,15362,15362,15362,15362,15362,15362}; +uint32_t PARAM_7Q4x8[8] = {21506,21506,21506,21506,21506,21506,21506,21506}; +uint32_t PARAM_Q2x8[8] = {6145,6145,6145,6145,6145,6145,6145,6145}; +uint32_t PARAM_3Q2x8[8] = {18434,18434,18434,18434,18434,18434,18434,18434}; + diff --git a/dap-sdk/crypto/src/msrln/AMD64/error_asm.S b/dap-sdk/crypto/src/msrln/AMD64/error_asm.S index 828816af04..836e47d8d7 100755 --- a/dap-sdk/crypto/src/msrln/AMD64/error_asm.S +++ b/dap-sdk/crypto/src/msrln/AMD64/error_asm.S @@ -1,436 +1,436 @@ -//**************************************************************************************** -// LatticeCrypto: an efficient 
post-quantum Ring-Learning With Errors cryptography library -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -// Abstract: functions for error sampling and reconciliation in x64 assembly using AVX2 -// vector instructions for Linux -// -//**************************************************************************************** - -.intel_syntax noprefix - -// Registers that are used for parameter passing: -#define reg_p1 rdi -#define reg_p2 rsi -#define reg_p3 rdx -#define reg_p4 rcx -#define reg_p5 r8 - - -.text -//*********************************************************************** -// Error sampling from psi_12 -// Operation: c [reg_p2] <- sampling(a) [reg_p1] -//*********************************************************************** -.global error_sampling_asm -error_sampling_asm: - vmovdqu ymm7, ONE32x - movq r11, 384 - movq r10, 32 - movq r8, 24 - xor rax, rax - xor rcx, rcx -loop1: - vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // sample - vmovdqu ymm2, YMMWORD PTR [reg_p1+4*rax+32] // sample - vmovdqu ymm4, YMMWORD PTR [reg_p1+4*rax+64] // sample - movq r9, 2 - -loop1b: - vpand ymm1, ymm0, ymm7 // Collecting 8 bits for first sample - vpsrlw ymm0, ymm0, 1 - vpand ymm3, ymm0, ymm7 - vpaddb ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 - vpand ymm3, ymm0, ymm7 - vpaddb ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 - vpand ymm3, ymm0, ymm7 - vpaddb ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 - vpand ymm3, ymm0, ymm7 - vpaddb ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 - vpand ymm3, ymm0, ymm7 - vpaddb ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 - vpand ymm3, ymm0, ymm7 - vpaddb ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 - vpand ymm3, ymm0, ymm7 - vpaddb ymm1, ymm1, ymm3 - - vpand ymm3, ymm2, ymm7 // Adding next 4 bits - vpaddb ymm1, ymm1, ymm3 - vpsrlw ymm2, ymm2, 1 - vpand ymm3, ymm2, ymm7 - vpaddb ymm1, ymm1, ymm3 - vpsrlw ymm2, ymm2, 1 - vpand ymm3, ymm2, ymm7 - vpaddb ymm1, ymm1, ymm3 - vpsrlw ymm2, ymm2, 1 - vpand ymm3, ymm2, ymm7 - vpaddb ymm1, ymm1, ymm3 - - 
vpsrlw ymm2, ymm2, 1 // Collecting 4-bits for second sample - vpand ymm5, ymm2, ymm7 - vpsrlw ymm2, ymm2, 1 - vpand ymm3, ymm2, ymm7 - vpaddb ymm5, ymm5, ymm3 - vpsrlw ymm2, ymm2, 1 - vpand ymm3, ymm2, ymm7 - vpaddb ymm5, ymm5, ymm3 - vpsrlw ymm2, ymm2, 1 - vpand ymm3, ymm2, ymm7 - vpaddb ymm5, ymm5, ymm3 - - vpand ymm3, ymm4, ymm7 // Adding next 8 bits - vpaddb ymm5, ymm5, ymm3 - vpsrlw ymm4, ymm4, 1 - vpand ymm3, ymm4, ymm7 - vpaddb ymm5, ymm5, ymm3 - vpsrlw ymm4, ymm4, 1 - vpand ymm3, ymm4, ymm7 - vpaddb ymm5, ymm5, ymm3 - vpsrlw ymm4, ymm4, 1 - vpand ymm3, ymm4, ymm7 - vpaddb ymm5, ymm5, ymm3 - vpsrlw ymm4, ymm4, 1 - vpand ymm3, ymm4, ymm7 - vpaddb ymm5, ymm5, ymm3 - vpsrlw ymm4, ymm4, 1 - vpand ymm3, ymm4, ymm7 - vpaddb ymm5, ymm5, ymm3 - vpsrlw ymm4, ymm4, 1 - vpand ymm3, ymm4, ymm7 - vpaddb ymm5, ymm5, ymm3 - vpsrlw ymm4, ymm4, 1 - vpand ymm3, ymm4, ymm7 - vpaddb ymm5, ymm5, ymm3 - - vpsubb ymm5, ymm1, ymm5 - vpermq ymm3, ymm5, 0x0e - vpmovsxbd ymm6, xmm5 - vpsrldq ymm5, ymm5, 8 - vpmovsxbd ymm7, xmm5 - vpmovsxbd ymm8, xmm3 - vpsrldq ymm3, ymm3, 8 - vpmovsxbd ymm9, xmm3 - vmovdqu YMMWORD PTR [reg_p2+4*rcx], ymm6 - vmovdqu YMMWORD PTR [reg_p2+4*rcx+32], ymm7 - vmovdqu YMMWORD PTR [reg_p2+4*rcx+64], ymm8 - vmovdqu YMMWORD PTR [reg_p2+4*rcx+96], ymm9 - - add rcx, r10 // i+32 - vpsrlw ymm0, ymm0, 1 - vpsrlw ymm2, ymm2, 1 - vpsrlw ymm4, ymm4, 1 - dec r9 - jnz loop1b - - add rax, r8 // j+24 - cmp rax, r11 - jl loop1 - ret - - -//*********************************************************************** -// Reconciliation helper function -// Operation: c [reg_p2] <- function(a) [reg_p1] -// [reg_p3] points to random bits -//*********************************************************************** -.global helprec_asm -helprec_asm: - vmovdqu ymm8, ONE8x - movq r11, 256 - movq r10, 8 - xor rax, rax - vmovdqu ymm4, YMMWORD PTR [reg_p3] // rbits -loop2: - vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // x - vmovdqu ymm1, YMMWORD PTR [reg_p1+4*rax+4*256] // x+256 - vmovdqu ymm2, 
YMMWORD PTR [reg_p1+4*rax+4*512] // x+512 - vmovdqu ymm3, YMMWORD PTR [reg_p1+4*rax+4*768] // x+768 - - vpand ymm5, ymm4, ymm8 // Collecting 8 random bits - vpslld ymm0, ymm0, 1 // 2*x - rbits - vpslld ymm1, ymm1, 1 - vpslld ymm2, ymm2, 1 - vpslld ymm3, ymm3, 1 - vpsubd ymm0, ymm0, ymm5 - vpsubd ymm1, ymm1, ymm5 - vpsubd ymm2, ymm2, ymm5 - vpsubd ymm3, ymm3, ymm5 - - vmovdqu ymm15, PARAM_Q4x8 - vmovdqu ymm7, FOUR8x - vmovdqu ymm8, ymm7 - vmovdqu ymm9, ymm7 - vmovdqu ymm10, ymm7 - vpsubd ymm6, ymm0, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm7, ymm7, ymm6 - vpsubd ymm6, ymm1, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm8, ymm8, ymm6 - vpsubd ymm6, ymm2, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm9, ymm9, ymm6 - vpsubd ymm6, ymm3, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm10, ymm10, ymm6 - vmovdqu ymm15, PARAM_3Q4x8 - vpsubd ymm6, ymm0, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm7, ymm7, ymm6 - vpsubd ymm6, ymm1, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm8, ymm8, ymm6 - vpsubd ymm6, ymm2, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm9, ymm9, ymm6 - vpsubd ymm6, ymm3, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm10, ymm10, ymm6 - vmovdqu ymm15, PARAM_5Q4x8 - vpsubd ymm6, ymm0, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm7, ymm7, ymm6 - vpsubd ymm6, ymm1, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm8, ymm8, ymm6 - vpsubd ymm6, ymm2, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm9, ymm9, ymm6 - vpsubd ymm6, ymm3, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm10, ymm10, ymm6 - vmovdqu ymm15, PARAM_7Q4x8 - vpsubd ymm6, ymm0, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm7, ymm7, ymm6 // v0[0] - vpsubd ymm6, ymm1, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm8, ymm8, ymm6 // v0[1] - vpsubd ymm6, ymm2, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm9, ymm9, ymm6 // v0[2] - vpsubd ymm6, ymm3, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm10, ymm10, ymm6 // v0[3] - - vmovdqu ymm15, PARAM_Q2x8 - vmovdqu ymm11, THREE8x - vmovdqu ymm12, ymm11 - vmovdqu ymm13, ymm11 - vmovdqu ymm14, ymm11 - vpsubd ymm6, ymm0, ymm15 - 
vpsrld ymm6, ymm6, 31 - vpsubd ymm11, ymm11, ymm6 - vpsubd ymm6, ymm1, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm12, ymm12, ymm6 - vpsubd ymm6, ymm2, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm13, ymm13, ymm6 - vpsubd ymm6, ymm3, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm14, ymm14, ymm6 - vmovdqu ymm15, PARAM_3Q2x8 - vpsubd ymm6, ymm0, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm11, ymm11, ymm6 - vpsubd ymm6, ymm1, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm12, ymm12, ymm6 - vpsubd ymm6, ymm2, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm13, ymm13, ymm6 - vpsubd ymm6, ymm3, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm14, ymm14, ymm6 - vmovdqu ymm15, PRIME8x - vpsubd ymm6, ymm0, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm11, ymm11, ymm6 // v1[0] - vpsubd ymm6, ymm1, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm12, ymm12, ymm6 // v1[1] - vpsubd ymm6, ymm2, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm13, ymm13, ymm6 // v1[2] - vpsubd ymm6, ymm3, ymm15 - vpsrld ymm6, ymm6, 31 - vpsubd ymm14, ymm14, ymm6 // v1[3] - - vpmulld ymm6, ymm7, ymm15 - vpslld ymm0, ymm0, 1 - vpsubd ymm0, ymm0, ymm6 - vpabsd ymm0, ymm0 - vpmulld ymm6, ymm8, ymm15 - vpslld ymm1, ymm1, 1 - vpsubd ymm1, ymm1, ymm6 - vpabsd ymm1, ymm1 - vpaddd ymm0, ymm0, ymm1 - vpmulld ymm6, ymm9, ymm15 - vpslld ymm2, ymm2, 1 - vpsubd ymm2, ymm2, ymm6 - vpabsd ymm2, ymm2 - vpaddd ymm0, ymm0, ymm2 - vpmulld ymm6, ymm10, ymm15 - vpslld ymm3, ymm3, 1 - vpsubd ymm3, ymm3, ymm6 - vpabsd ymm3, ymm3 - vpaddd ymm0, ymm0, ymm3 // norm - vpsubd ymm0, ymm0, ymm15 - vpsrad ymm0, ymm0, 31 // If norm < q then norm = 0xff...ff, else norm = 0 - - vpxor ymm7, ymm7, ymm11 // v0[i] = (norm & (v0[i] ^ v1[i])) ^ v1[i] - vpand ymm7, ymm7, ymm0 - vpxor ymm7, ymm7, ymm11 - vpxor ymm8, ymm8, ymm12 - vpand ymm8, ymm8, ymm0 - vpxor ymm8, ymm8, ymm12 - vpxor ymm9, ymm9, ymm13 - vpand ymm9, ymm9, ymm0 - vpxor ymm9, ymm9, ymm13 - vpxor ymm10, ymm10, ymm14 - vpand ymm10, ymm10, ymm0 - vpxor ymm10, ymm10, ymm14 - - vmovdqu ymm15, THREE8x - vmovdqu ymm14, ONE8x 
- vpsubd ymm7, ymm7, ymm10 - vpand ymm7, ymm7, ymm15 - vpsubd ymm8, ymm8, ymm10 - vpand ymm8, ymm8, ymm15 - vpsubd ymm9, ymm9, ymm10 - vpand ymm9, ymm9, ymm15 - vpslld ymm10, ymm10, 1 - vpxor ymm0, ymm0, ymm14 - vpand ymm0, ymm0, ymm14 - vpaddd ymm10, ymm0, ymm10 - vpand ymm10, ymm10, ymm15 - - vpsrld ymm4, ymm4, 1 - vmovdqu YMMWORD PTR [reg_p2+4*rax], ymm7 - vmovdqu YMMWORD PTR [reg_p2+4*rax+4*256], ymm8 - vmovdqu YMMWORD PTR [reg_p2+4*rax+4*512], ymm9 - vmovdqu YMMWORD PTR [reg_p2+4*rax+4*768], ymm10 - - add rax, r10 // j+8 - add rcx, r9 - cmp rax, r11 - jl loop2 - ret - - -//*********************************************************************** -// Reconciliation function -// Operation: c [reg_p3] <- function(a [reg_p1], b [reg_p2]) -//*********************************************************************** -.global rec_asm -rec_asm: - vpxor ymm12, ymm12, ymm12 - vmovdqu ymm15, PRIME8x - vpslld ymm14, ymm15, 2 // 4*Q - vpslld ymm13, ymm15, 3 // 8*Q - vpsubd ymm12, ymm12, ymm13 // -8*Q - vpxor ymm11, ymm12, ymm13 // 8*Q ^ -8*Q - vmovdqu ymm10, ONE8x - movq r11, 256 - movq r10, 8 - xor rax, rax - xor rcx, rcx -loop3: - vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // x - vmovdqu ymm1, YMMWORD PTR [reg_p1+4*rax+4*256] // x+256 - vmovdqu ymm2, YMMWORD PTR [reg_p1+4*rax+4*512] // x+512 - vmovdqu ymm3, YMMWORD PTR [reg_p1+4*rax+4*768] // x+768 - vmovdqu ymm4, YMMWORD PTR [reg_p2+4*rax] // rvec - vmovdqu ymm5, YMMWORD PTR [reg_p2+4*rax+4*256] // rvec+256 - vmovdqu ymm6, YMMWORD PTR [reg_p2+4*rax+4*512] // rvec+512 - vmovdqu ymm7, YMMWORD PTR [reg_p2+4*rax+4*768] // rvec+768 - - vpslld ymm8, ymm4, 1 // 2*rvec + rvec - vpaddd ymm4, ymm7, ymm8 - vpslld ymm8, ymm5, 1 - vpaddd ymm5, ymm7, ymm8 - vpslld ymm8, ymm6, 1 - vpaddd ymm6, ymm7, ymm8 - vpmulld ymm4, ymm4, ymm15 - vpmulld ymm5, ymm5, ymm15 - vpmulld ymm6, ymm6, ymm15 - vpmulld ymm7, ymm7, ymm15 - vpslld ymm0, ymm0, 3 // 8*x - vpslld ymm1, ymm1, 3 - vpslld ymm2, ymm2, 3 - vpslld ymm3, ymm3, 3 - vpsubd ymm0, ymm0, ymm4 // 
t[i] - vpsubd ymm1, ymm1, ymm5 - vpsubd ymm2, ymm2, ymm6 - vpsubd ymm3, ymm3, ymm7 - - vpsrad ymm8, ymm0, 31 // mask1 - vpabsd ymm4, ymm0 - vpsubd ymm4, ymm14, ymm4 - vpsrad ymm4, ymm4, 31 // mask2 - vpand ymm8, ymm8, ymm11 // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q - vpxor ymm8, ymm8, ymm12 - vpand ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymm4 - vpabsd ymm0, ymm0 - vpsrad ymm8, ymm1, 31 // mask1 - vpabsd ymm4, ymm1 - vpsubd ymm4, ymm14, ymm4 - vpsrad ymm4, ymm4, 31 // mask2 - vpand ymm8, ymm8, ymm11 // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q - vpxor ymm8, ymm8, ymm12 - vpand ymm4, ymm4, ymm8 - vpaddd ymm1, ymm1, ymm4 - vpabsd ymm1, ymm1 - vpaddd ymm0, ymm0, ymm1 - vpsrad ymm8, ymm2, 31 // mask1 - vpabsd ymm4, ymm2 - vpsubd ymm4, ymm14, ymm4 - vpsrad ymm4, ymm4, 31 // mask2 - vpand ymm8, ymm8, ymm11 // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q - vpxor ymm8, ymm8, ymm12 - vpand ymm4, ymm4, ymm8 - vpaddd ymm2, ymm2, ymm4 - vpabsd ymm2, ymm2 - vpaddd ymm0, ymm0, ymm2 - vpsrad ymm8, ymm3, 31 // mask1 - vpabsd ymm4, ymm3 - vpsubd ymm4, ymm14, ymm4 - vpsrad ymm4, ymm4, 31 // mask2 - vpand ymm8, ymm8, ymm11 // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q - vpxor ymm8, ymm8, ymm12 - vpand ymm4, ymm4, ymm8 - vpaddd ymm3, ymm3, ymm4 - vpabsd ymm3, ymm3 - vpaddd ymm0, ymm0, ymm3 // norm - - vpsubd ymm0, ymm13, ymm0 // If norm < PARAMETER_Q then result = 1, else result = 0 - vpsrld ymm0, ymm0, 31 - vpxor ymm0, ymm0, ymm10 - - vpsrlq ymm1, ymm0, 31 - vpor ymm1, ymm0, ymm1 - vpsllq ymm2, ymm1, 2 - vpsrldq ymm2, ymm2, 8 - vpor ymm1, ymm2, ymm1 - vpsllq ymm2, ymm1, 4 - vpermq ymm2, ymm2, 0x56 - vpor ymm0, ymm1, ymm2 - vmovq r9, xmm0 - - mov BYTE PTR [reg_p3+rcx], r9b - - add rax, r10 // j+8 - inc rcx - cmp rax, r11 - jl loop3 - ret +//**************************************************************************************** +// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library 
+// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// Abstract: functions for error sampling and reconciliation in x64 assembly using AVX2 +// vector instructions for Linux +// +//**************************************************************************************** + +.intel_syntax noprefix + +// Registers that are used for parameter passing: +#define reg_p1 rdi +#define reg_p2 rsi +#define reg_p3 rdx +#define reg_p4 rcx +#define reg_p5 r8 + + +.text +//*********************************************************************** +// Error sampling from psi_12 +// Operation: c [reg_p2] <- sampling(a) [reg_p1] +//*********************************************************************** +.global error_sampling_asm +error_sampling_asm: + vmovdqu ymm7, ONE32x + movq r11, 384 + movq r10, 32 + movq r8, 24 + xor rax, rax + xor rcx, rcx +loop1: + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // sample + vmovdqu ymm2, YMMWORD PTR [reg_p1+4*rax+32] // sample + vmovdqu ymm4, YMMWORD PTR [reg_p1+4*rax+64] // sample + movq r9, 2 + +loop1b: + vpand ymm1, ymm0, ymm7 // Collecting 8 bits for first sample + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 + vpand ymm3, ymm0, ymm7 + vpaddb ymm1, ymm1, ymm3 + + vpand ymm3, ymm2, ymm7 // Adding next 4 bits + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm2, ymm2, 1 + vpand ymm3, ymm2, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm2, ymm2, 1 + vpand ymm3, ymm2, ymm7 + vpaddb ymm1, ymm1, ymm3 + vpsrlw ymm2, ymm2, 1 + vpand ymm3, ymm2, ymm7 + vpaddb ymm1, ymm1, ymm3 + + vpsrlw ymm2, ymm2, 1 // Collecting 4-bits for second sample 
+ vpand ymm5, ymm2, ymm7 + vpsrlw ymm2, ymm2, 1 + vpand ymm3, ymm2, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm2, ymm2, 1 + vpand ymm3, ymm2, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm2, ymm2, 1 + vpand ymm3, ymm2, ymm7 + vpaddb ymm5, ymm5, ymm3 + + vpand ymm3, ymm4, ymm7 // Adding next 8 bits + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + vpsrlw ymm4, ymm4, 1 + vpand ymm3, ymm4, ymm7 + vpaddb ymm5, ymm5, ymm3 + + vpsubb ymm5, ymm1, ymm5 + vpermq ymm3, ymm5, 0x0e + vpmovsxbd ymm6, xmm5 + vpsrldq ymm5, ymm5, 8 + vpmovsxbd ymm7, xmm5 + vpmovsxbd ymm8, xmm3 + vpsrldq ymm3, ymm3, 8 + vpmovsxbd ymm9, xmm3 + vmovdqu YMMWORD PTR [reg_p2+4*rcx], ymm6 + vmovdqu YMMWORD PTR [reg_p2+4*rcx+32], ymm7 + vmovdqu YMMWORD PTR [reg_p2+4*rcx+64], ymm8 + vmovdqu YMMWORD PTR [reg_p2+4*rcx+96], ymm9 + + add rcx, r10 // i+32 + vpsrlw ymm0, ymm0, 1 + vpsrlw ymm2, ymm2, 1 + vpsrlw ymm4, ymm4, 1 + dec r9 + jnz loop1b + + add rax, r8 // j+24 + cmp rax, r11 + jl loop1 + ret + + +//*********************************************************************** +// Reconciliation helper function +// Operation: c [reg_p2] <- function(a) [reg_p1] +// [reg_p3] points to random bits +//*********************************************************************** +.global helprec_asm +helprec_asm: + vmovdqu ymm8, ONE8x + movq r11, 256 + movq r10, 8 + xor rax, rax + vmovdqu ymm4, YMMWORD PTR [reg_p3] // rbits +loop2: + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // x + vmovdqu ymm1, YMMWORD PTR [reg_p1+4*rax+4*256] // x+256 + vmovdqu ymm2, YMMWORD PTR [reg_p1+4*rax+4*512] // x+512 + vmovdqu ymm3, 
YMMWORD PTR [reg_p1+4*rax+4*768] // x+768 + + vpand ymm5, ymm4, ymm8 // Collecting 8 random bits + vpslld ymm0, ymm0, 1 // 2*x - rbits + vpslld ymm1, ymm1, 1 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm0, ymm0, ymm5 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm5 + vpsubd ymm3, ymm3, ymm5 + + vmovdqu ymm15, PARAM_Q4x8 + vmovdqu ymm7, FOUR8x + vmovdqu ymm8, ymm7 + vmovdqu ymm9, ymm7 + vmovdqu ymm10, ymm7 + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm7, ymm7, ymm6 + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm8, ymm8, ymm6 + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm9, ymm9, ymm6 + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm10, ymm10, ymm6 + vmovdqu ymm15, PARAM_3Q4x8 + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm7, ymm7, ymm6 + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm8, ymm8, ymm6 + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm9, ymm9, ymm6 + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm10, ymm10, ymm6 + vmovdqu ymm15, PARAM_5Q4x8 + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm7, ymm7, ymm6 + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm8, ymm8, ymm6 + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm9, ymm9, ymm6 + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm10, ymm10, ymm6 + vmovdqu ymm15, PARAM_7Q4x8 + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm7, ymm7, ymm6 // v0[0] + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm8, ymm8, ymm6 // v0[1] + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm9, ymm9, ymm6 // v0[2] + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm10, ymm10, ymm6 // v0[3] + + vmovdqu ymm15, PARAM_Q2x8 + vmovdqu ymm11, THREE8x + vmovdqu ymm12, ymm11 + vmovdqu ymm13, ymm11 + vmovdqu ymm14, ymm11 + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm11, ymm11, ymm6 + vpsubd 
ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm12, ymm12, ymm6 + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm13, ymm13, ymm6 + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm14, ymm14, ymm6 + vmovdqu ymm15, PARAM_3Q2x8 + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm11, ymm11, ymm6 + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm12, ymm12, ymm6 + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm13, ymm13, ymm6 + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm14, ymm14, ymm6 + vmovdqu ymm15, PRIME8x + vpsubd ymm6, ymm0, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm11, ymm11, ymm6 // v1[0] + vpsubd ymm6, ymm1, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm12, ymm12, ymm6 // v1[1] + vpsubd ymm6, ymm2, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm13, ymm13, ymm6 // v1[2] + vpsubd ymm6, ymm3, ymm15 + vpsrld ymm6, ymm6, 31 + vpsubd ymm14, ymm14, ymm6 // v1[3] + + vpmulld ymm6, ymm7, ymm15 + vpslld ymm0, ymm0, 1 + vpsubd ymm0, ymm0, ymm6 + vpabsd ymm0, ymm0 + vpmulld ymm6, ymm8, ymm15 + vpslld ymm1, ymm1, 1 + vpsubd ymm1, ymm1, ymm6 + vpabsd ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm6, ymm9, ymm15 + vpslld ymm2, ymm2, 1 + vpsubd ymm2, ymm2, ymm6 + vpabsd ymm2, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm6, ymm10, ymm15 + vpslld ymm3, ymm3, 1 + vpsubd ymm3, ymm3, ymm6 + vpabsd ymm3, ymm3 + vpaddd ymm0, ymm0, ymm3 // norm + vpsubd ymm0, ymm0, ymm15 + vpsrad ymm0, ymm0, 31 // If norm < q then norm = 0xff...ff, else norm = 0 + + vpxor ymm7, ymm7, ymm11 // v0[i] = (norm & (v0[i] ^ v1[i])) ^ v1[i] + vpand ymm7, ymm7, ymm0 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpand ymm8, ymm8, ymm0 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + vpand ymm9, ymm9, ymm0 + vpxor ymm9, ymm9, ymm13 + vpxor ymm10, ymm10, ymm14 + vpand ymm10, ymm10, ymm0 + vpxor ymm10, ymm10, ymm14 + + vmovdqu ymm15, THREE8x + vmovdqu ymm14, ONE8x + vpsubd ymm7, ymm7, ymm10 + vpand ymm7, ymm7, ymm15 + 
vpsubd ymm8, ymm8, ymm10 + vpand ymm8, ymm8, ymm15 + vpsubd ymm9, ymm9, ymm10 + vpand ymm9, ymm9, ymm15 + vpslld ymm10, ymm10, 1 + vpxor ymm0, ymm0, ymm14 + vpand ymm0, ymm0, ymm14 + vpaddd ymm10, ymm0, ymm10 + vpand ymm10, ymm10, ymm15 + + vpsrld ymm4, ymm4, 1 + vmovdqu YMMWORD PTR [reg_p2+4*rax], ymm7 + vmovdqu YMMWORD PTR [reg_p2+4*rax+4*256], ymm8 + vmovdqu YMMWORD PTR [reg_p2+4*rax+4*512], ymm9 + vmovdqu YMMWORD PTR [reg_p2+4*rax+4*768], ymm10 + + add rax, r10 // j+8 + add rcx, r9 + cmp rax, r11 + jl loop2 + ret + + +//*********************************************************************** +// Reconciliation function +// Operation: c [reg_p3] <- function(a [reg_p1], b [reg_p2]) +//*********************************************************************** +.global rec_asm +rec_asm: + vpxor ymm12, ymm12, ymm12 + vmovdqu ymm15, PRIME8x + vpslld ymm14, ymm15, 2 // 4*Q + vpslld ymm13, ymm15, 3 // 8*Q + vpsubd ymm12, ymm12, ymm13 // -8*Q + vpxor ymm11, ymm12, ymm13 // 8*Q ^ -8*Q + vmovdqu ymm10, ONE8x + movq r11, 256 + movq r10, 8 + xor rax, rax + xor rcx, rcx +loop3: + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // x + vmovdqu ymm1, YMMWORD PTR [reg_p1+4*rax+4*256] // x+256 + vmovdqu ymm2, YMMWORD PTR [reg_p1+4*rax+4*512] // x+512 + vmovdqu ymm3, YMMWORD PTR [reg_p1+4*rax+4*768] // x+768 + vmovdqu ymm4, YMMWORD PTR [reg_p2+4*rax] // rvec + vmovdqu ymm5, YMMWORD PTR [reg_p2+4*rax+4*256] // rvec+256 + vmovdqu ymm6, YMMWORD PTR [reg_p2+4*rax+4*512] // rvec+512 + vmovdqu ymm7, YMMWORD PTR [reg_p2+4*rax+4*768] // rvec+768 + + vpslld ymm8, ymm4, 1 // 2*rvec + rvec + vpaddd ymm4, ymm7, ymm8 + vpslld ymm8, ymm5, 1 + vpaddd ymm5, ymm7, ymm8 + vpslld ymm8, ymm6, 1 + vpaddd ymm6, ymm7, ymm8 + vpmulld ymm4, ymm4, ymm15 + vpmulld ymm5, ymm5, ymm15 + vpmulld ymm6, ymm6, ymm15 + vpmulld ymm7, ymm7, ymm15 + vpslld ymm0, ymm0, 3 // 8*x + vpslld ymm1, ymm1, 3 + vpslld ymm2, ymm2, 3 + vpslld ymm3, ymm3, 3 + vpsubd ymm0, ymm0, ymm4 // t[i] + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 
+ vpsubd ymm3, ymm3, ymm7 + + vpsrad ymm8, ymm0, 31 // mask1 + vpabsd ymm4, ymm0 + vpsubd ymm4, ymm14, ymm4 + vpsrad ymm4, ymm4, 31 // mask2 + vpand ymm8, ymm8, ymm11 // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q + vpxor ymm8, ymm8, ymm12 + vpand ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymm4 + vpabsd ymm0, ymm0 + vpsrad ymm8, ymm1, 31 // mask1 + vpabsd ymm4, ymm1 + vpsubd ymm4, ymm14, ymm4 + vpsrad ymm4, ymm4, 31 // mask2 + vpand ymm8, ymm8, ymm11 // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q + vpxor ymm8, ymm8, ymm12 + vpand ymm4, ymm4, ymm8 + vpaddd ymm1, ymm1, ymm4 + vpabsd ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpsrad ymm8, ymm2, 31 // mask1 + vpabsd ymm4, ymm2 + vpsubd ymm4, ymm14, ymm4 + vpsrad ymm4, ymm4, 31 // mask2 + vpand ymm8, ymm8, ymm11 // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q + vpxor ymm8, ymm8, ymm12 + vpand ymm4, ymm4, ymm8 + vpaddd ymm2, ymm2, ymm4 + vpabsd ymm2, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpsrad ymm8, ymm3, 31 // mask1 + vpabsd ymm4, ymm3 + vpsubd ymm4, ymm14, ymm4 + vpsrad ymm4, ymm4, 31 // mask2 + vpand ymm8, ymm8, ymm11 // (mask1 & (8*PARAMETER_Q ^ -8*PARAMETER_Q)) ^ -8*PARAMETER_Q + vpxor ymm8, ymm8, ymm12 + vpand ymm4, ymm4, ymm8 + vpaddd ymm3, ymm3, ymm4 + vpabsd ymm3, ymm3 + vpaddd ymm0, ymm0, ymm3 // norm + + vpsubd ymm0, ymm13, ymm0 // If norm < PARAMETER_Q then result = 1, else result = 0 + vpsrld ymm0, ymm0, 31 + vpxor ymm0, ymm0, ymm10 + + vpsrlq ymm1, ymm0, 31 + vpor ymm1, ymm0, ymm1 + vpsllq ymm2, ymm1, 2 + vpsrldq ymm2, ymm2, 8 + vpor ymm1, ymm2, ymm1 + vpsllq ymm2, ymm1, 4 + vpermq ymm2, ymm2, 0x56 + vpor ymm0, ymm1, ymm2 + vmovq r9, xmm0 + + mov BYTE PTR [reg_p3+rcx], r9b + + add rax, r10 // j+8 + inc rcx + cmp rax, r11 + jl loop3 + ret diff --git a/dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c b/dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c index ef846a4841..d39e95e779 100755 --- a/dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c +++ b/dap-sdk/crypto/src/msrln/AMD64/ntt_x64.c @@ -1,65 
+1,65 @@ -/**************************************************************************************** -* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* -* Abstract: NTT functions and other low-level operations -* -*****************************************************************************************/ - -#include "../LatticeCrypto_priv.h" - - -void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N) -{ - NTT_CT_std2rev_12289_asm(a, psi_rev, N); -} - - -void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N) -{ - INTT_GS_rev2std_12289_asm(a, omegainv_rev, omegainv1N_rev, Ninv, N); -} - - -void two_reduce12289(int32_t* a, unsigned int N) -{ - two_reduce12289_asm(a, N); -} - - -void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N) -{ - pmul_asm(a, b, c, N); -} - - -void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N) -{ - pmuladd_asm(a, b, c, d, N); -} - - -void smul(int32_t* a, int32_t scalar, unsigned int N) -{ - unsigned int i; - - for (i = 0; i < N; i++) { - a[i] = a[i]*scalar; - } -} - - -void correction(int32_t* a, int32_t p, unsigned int N) -{ - unsigned int i; - int32_t mask; - - for (i = 0; i < N; i++) { - mask = a[i] >> (4*sizeof(int32_t) - 1); - a[i] += (p & mask) - p; - mask = a[i] >> (4*sizeof(int32_t) - 1); - a[i] += (p & mask); - } -} +/**************************************************************************************** +* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library +* +* Copyright (c) Microsoft Corporation. All rights reserved. 
+* +* +* Abstract: NTT functions and other low-level operations +* +*****************************************************************************************/ + +#include "../LatticeCrypto_priv.h" + + +void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N) +{ + NTT_CT_std2rev_12289_asm(a, psi_rev, N); +} + + +void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N) +{ + INTT_GS_rev2std_12289_asm(a, omegainv_rev, omegainv1N_rev, Ninv, N); +} + + +void two_reduce12289(int32_t* a, unsigned int N) +{ + two_reduce12289_asm(a, N); +} + + +void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N) +{ + pmul_asm(a, b, c, N); +} + + +void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N) +{ + pmuladd_asm(a, b, c, d, N); +} + + +void smul(int32_t* a, int32_t scalar, unsigned int N) +{ + unsigned int i; + + for (i = 0; i < N; i++) { + a[i] = a[i]*scalar; + } +} + + +void correction(int32_t* a, int32_t p, unsigned int N) +{ + unsigned int i; + int32_t mask; + + for (i = 0; i < N; i++) { + mask = a[i] >> (4*sizeof(int32_t) - 1); + a[i] += (p & mask) - p; + mask = a[i] >> (4*sizeof(int32_t) - 1); + a[i] += (p & mask); + } +} diff --git a/dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S b/dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S index e44c90dce0..9e8d89660a 100755 --- a/dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S +++ b/dap-sdk/crypto/src/msrln/AMD64/ntt_x64_asm.S @@ -1,979 +1,979 @@ -//**************************************************************************************** -// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library -// -// Copyright (c) Microsoft Corporation. All rights reserved. 
-// -// -// Abstract: NTT functions in x64 assembly using AVX2 vector instructions for Linux -// -//**************************************************************************************** - -.intel_syntax noprefix - -// Registers that are used for parameter passing: -#define reg_p1 rdi -#define reg_p2 rsi -#define reg_p3 rdx -#define reg_p4 rcx -#define reg_p5 r8 - - -.text -//*********************************************************************** -// Forward NTT -// Operation: a [reg_p1] <- NTT(a) [reg_p1], -// [reg_p2] points to table and -// reg_p3 contains parameter n -//*********************************************************************** -.global NTT_CT_std2rev_12289_asm -NTT_CT_std2rev_12289_asm: - push r12 - push r13 - push r14 - -// Stages m=1 -> m=32 - mov r9, 1 // m = 1 - mov rax, reg_p3 - mov r12, reg_p3 - shr r12, 4 // n/16 - vmovdqu ymm14, MASK12x8 - vmovdqu ymm12, PERM0246 - mov r14, 16 - mov rcx, 11 -loop1: - shr rax, 1 // k = k/2 - dec rcx - xor rdx, rdx // i = 0 -loop2: - mov r10, rdx - mov r11, rax - dec r11 - shl r10, cl // j1 - add r11, r10 // j2 - mov r13, r9 - add r13, rdx // m+i - vbroadcastss ymm11, DWORD PTR [reg_p2+4*r13] // S - -loop3: - mov r13, r10 - add r13, rax // j+k - vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r13] // a[j+k] - vpmovsxdq ymm3, XMMWORD PTR [reg_p1+4*r13+16] // a[j+k] - vpmovsxdq ymm5, XMMWORD PTR [reg_p1+4*r13+32] // a[j+k] - vpmovsxdq ymm7, XMMWORD PTR [reg_p1+4*r13+48] // a[j+k] - - vpmuldq ymm1, ymm1, ymm11 // a[j+k].S - vpmuldq ymm3, ymm3, ymm11 - vpmuldq ymm5, ymm5, ymm11 - vpmuldq ymm7, ymm7, ymm11 - vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] - - vmovdqu ymm13, ymm1 - vpand ymm1, ymm14, ymm1 // c0 - vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm1, 1 // 2*c0 - vpsubd ymm13, ymm1, ymm13 // c0-c1 - vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 - vpsubd ymm1, ymm0, ymm13 // a[j+k] = U - V - vpaddd ymm0, ymm0, ymm13 // a[j] = U + V - vpermd ymm1, ymm12, ymm1 - vpermd ymm0, ymm12, ymm0 - vpmovsxdq ymm2, 
XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] - - vmovdqu ymm13, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm3, 1 // 2*c0 - vpsubd ymm13, ymm3, ymm13 // c0-c1 - vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 - vpsubd ymm3, ymm2, ymm13 // a[j+k] = U - V - vpaddd ymm2, ymm2, ymm13 // a[j] = U + V - vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 - vmovdqu XMMWORD PTR [reg_p1+4*r13], xmm1 - vpermd ymm3, ymm12, ymm3 - vpermd ymm2, ymm12, ymm2 - vpmovsxdq ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j] - - vmovdqu ymm13, ymm5 - vpand ymm5, ymm14, ymm5 // c0 - vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm5, 1 // 2*c0 - vpsubd ymm13, ymm5, ymm13 // c0-c1 - vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 - vpsubd ymm5, ymm4, ymm13 // a[j+k] = U - V - vpaddd ymm4, ymm4, ymm13 // a[j] = U + V - vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 - vmovdqu XMMWORD PTR [reg_p1+4*r13+16], xmm3 - vpermd ymm5, ymm12, ymm5 - vpermd ymm4, ymm12, ymm4 - vpmovsxdq ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j] - - vmovdqu ymm13, ymm7 - vpand ymm7, ymm14, ymm7 // c0 - vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm7, 1 // 2*c0 - vpsubd ymm13, ymm7, ymm13 // c0-c1 - vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 - vpsubd ymm7, ymm6, ymm13 // a[j+k] = U - V - vpaddd ymm6, ymm6, ymm13 // a[j] = U + V - vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm4 - vmovdqu XMMWORD PTR [reg_p1+4*r13+32], xmm5 - vpermd ymm6, ymm12, ymm6 - vpermd ymm7, ymm12, ymm7 - vmovdqu XMMWORD PTR [reg_p1+4*r13+48], xmm7 - vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm6 - - add r10, r14 - cmp r10, r11 - jl loop3 - inc rdx - cmp rdx, r9 - jl loop2 - shl r9, 1 - cmp r9, r12 - jl loop1 - -// Stage m=64 - xor rdx, rdx // i = 0 - xor r10, r10 // j1 = 0 -loop4: - vbroadcastss ymm11, DWORD PTR [reg_p2+4*rdx+4*64] // S - vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+32] // a[j+k] - vpmovsxdq ymm3, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k] - vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] - vpmovsxdq ymm2, XMMWORD 
PTR [reg_p1+4*r10+16] // U = a[j] - vpmuldq ymm1, ymm1, ymm11 // a[j+k].S - vpmuldq ymm3, ymm3, ymm11 // a[j+k].S - - vmovdqu ymm13, ymm1 - vpand ymm1, ymm14, ymm1 // c0 - vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm1, 1 // 2*c0 - vpsubd ymm13, ymm1, ymm13 // c0-c1 - vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 - - vmovdqu ymm10, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm10, ymm10, 12 // c1 - vpslld ymm15, ymm3, 1 // 2*c0 - vpsubd ymm10, ymm3, ymm10 // c0-c1 - vpaddd ymm10, ymm10, ymm15 // V = 3*c0-c1 - - vpsubd ymm1, ymm0, ymm13 // a[j+k] = U - V - vpaddd ymm0, ymm0, ymm13 // a[j] = U + V - vpsubd ymm3, ymm2, ymm10 // a[j+k] = U - V - vpaddd ymm2, ymm2, ymm10 // a[j] = U + V - - vpermd ymm0, ymm12, ymm0 - vpermd ymm1, ymm12, ymm1 - vpermd ymm2, ymm12, ymm2 - vpermd ymm3, ymm12, ymm3 - vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 - vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm1 - vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 - vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm3 - - add r10, r14 // j+16 - inc rdx // i+1 - cmp rdx, r9 - jl loop4 - -// Stage m=128 - shl r9, 1 - xor rdx, rdx // i = 0 - xor r10, r10 // j1 = 0 - mov r13, 8 -loop6: - vbroadcastss ymm2, DWORD PTR [reg_p2+4*rdx+4*128] // S - vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k] - vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] - vpmuldq ymm1, ymm1, ymm2 // a[j+k].S - - vmovdqu ymm3, ymm0 - vpand ymm0, ymm14, ymm0 // c0 - vpsrad ymm3, ymm3, 12 // c1 - vpslld ymm4, ymm0, 1 // 2*c0 - vpsubd ymm3, ymm0, ymm3 // c0-c1 - vpaddd ymm0, ymm3, ymm4 // U = 3*c0-c1 - - vmovdqu ymm3, ymm1 - vpand ymm1, ymm14, ymm1 // c0 - vpsrlq ymm4, ymm3, 24 // c2 - vpsrad ymm3, ymm3, 12 // xc1 - vpand ymm3, ymm14, ymm3 // c1 - vpslld ymm5, ymm1, 3 // 8*c0 - vpaddd ymm4, ymm1, ymm4 // c0+c2 - vpaddd ymm4, ymm4, ymm5 // 9*c0+c2 - vpslld ymm5, ymm3, 1 // 2*c1 - vpaddd ymm1, ymm0, ymm3 // U+c1 - vpsubd ymm0, ymm0, ymm3 // U-c1 - vpsubd ymm4, ymm4, ymm5 // 9*c0-2*c1+c2 - vpaddd ymm0, ymm0, ymm4 // U+(9*c0-3*c1+c2) - 
vpsubd ymm1, ymm1, ymm4 // U-(9*c0-3*c1+c2) - vpermd ymm0, ymm12, ymm0 - vpermd ymm1, ymm12, ymm1 - vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 - vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm1 - - add r10, r13 // j+8 - inc rdx // i+1 - cmp rdx, r9 - jl loop6 - -// Stage m=256 - vmovdqu ymm9, PERM02134657 - shl r9, 1 - xor rdx, rdx // i = 0 - xor r10, r10 // j1 = 0 - mov r14, 32 -loop7: - vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256] // S = psi[m+i]->psi[m+i+3] - vpermq ymm8, ymm2, 0x50 - vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j]->a[j+3] - vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]->a[j+k+3] - vpermq ymm3, ymm0, 0x4e - vinserti128 ymm0, ymm0, xmm1, 1 // U - vpblendd ymm1, ymm1, ymm3, 15 - vpmuldq ymm3, ymm1, ymm8 // a[j+k].S - vmovdqu ymm4, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm4, ymm4, 12 // c1 - vpslld ymm5, ymm3, 1 // 2*c0 - vpsubd ymm4, ymm3, ymm4 // c0-c1 - vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 - vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V - vpaddd ymm0, ymm0, ymm4 // a[j] = U + V - vpslldq ymm1, ymm1, 4 - vpblendd ymm0, ymm0, ymm1, 0xaa - vpermd ymm0, ymm9, ymm0 - vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 - - vpermq ymm8, ymm2, 0xfa - vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+3] - vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]->a[j+k+3] - vpermq ymm3, ymm0, 0x4e - vinserti128 ymm0, ymm0, xmm1, 1 // U - vpblendd ymm1, ymm1, ymm3, 15 - vpmuldq ymm3, ymm1, ymm8 // a[j+k].S - vmovdqu ymm4, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm4, ymm4, 12 // c1 - vpslld ymm5, ymm3, 1 // 2*c0 - vpsubd ymm4, ymm3, ymm4 // c0-c1 - vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 - vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V - vpaddd ymm0, ymm0, ymm4 // a[j] = U + V - vpslldq ymm1, ymm1, 4 - vpblendd ymm0, ymm0, ymm1, 0xaa - vpermd ymm0, ymm9, ymm0 - vmovdqu YMMWORD PTR [reg_p1+4*r10+32], ymm0 - - vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256+16] // S = psi[m+i]->psi[m+i+3] - vpermq ymm8, ymm2, 0x50 - vpmovsxdq ymm0, 
XMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+3] - vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+80] // a[j+k]->a[j+k+3] - vpermq ymm3, ymm0, 0x4e - vinserti128 ymm0, ymm0, xmm1, 1 // U - vpblendd ymm1, ymm1, ymm3, 15 - vpmuldq ymm3, ymm1, ymm8 // a[j+k].S - vmovdqu ymm4, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm4, ymm4, 12 // c1 - vpslld ymm5, ymm3, 1 // 2*c0 - vpsubd ymm4, ymm3, ymm4 // c0-c1 - vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 - vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V - vpaddd ymm0, ymm0, ymm4 // a[j] = U + V - vpslldq ymm1, ymm1, 4 - vpblendd ymm0, ymm0, ymm1, 0xaa - vpermd ymm0, ymm9, ymm0 - vmovdqu YMMWORD PTR [reg_p1+4*r10+64], ymm0 - - vpermq ymm8, ymm2, 0xfa - vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+96] // U = a[j]->a[j+3] - vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+112] // a[j+k]->a[j+k+3] - vpermq ymm3, ymm0, 0x4e - vinserti128 ymm0, ymm0, xmm1, 1 // U - vpblendd ymm1, ymm1, ymm3, 15 - vpmuldq ymm3, ymm1, ymm8 // a[j+k].S - vmovdqu ymm4, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm4, ymm4, 12 // c1 - vpslld ymm5, ymm3, 1 // 2*c0 - vpsubd ymm4, ymm3, ymm4 // c0-c1 - vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 - vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V - vpaddd ymm0, ymm0, ymm4 // a[j] = U + V - vpslldq ymm1, ymm1, 4 - vpblendd ymm0, ymm0, ymm1, 0xaa - vpermd ymm0, ymm9, ymm0 - vmovdqu YMMWORD PTR [reg_p1+4*r10+96], ymm0 - - add r10, r14 // j+32 - add rdx, r13 // i+8 - cmp rdx, r9 - jl loop7 - -// Stage m=512 - vmovdqu ymm9, PERM00224466 - shl r9, 1 // m = n/2 - xor rdx, rdx // i = 0 - xor r10, r10 // j1 = 0 - mov r14, 4 -loop8: - vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*512] // S - vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j] - vmovdqu ymm1, YMMWORD PTR [reg_p1+4*r10+4] // a[j+k] - vpmuldq ymm3, ymm1, ymm2 // a[j+k].S - vmovdqu ymm4, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm4, ymm4, 12 // c1 - vpslld ymm5, ymm3, 1 // 2*c0 - vpsubd ymm4, ymm3, ymm4 // c0-c1 - vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 - vpsubd ymm1, ymm0, ymm4 
// a[j+k] = U - V - vpaddd ymm0, ymm0, ymm4 // a[j] = U + V - vpermd ymm1, ymm9, ymm1 - vpblendd ymm0, ymm0, ymm1, 0xaa - vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 - - add r10, r13 // j+8 - add rdx, r14 // i+4 - cmp rdx, r9 - jl loop8 - - pop r14 - pop r13 - pop r12 - ret - - -//*********************************************************************** -// Inverse NTT -// Operation: a [reg_p1] <- INTT(a) [reg_p1], -// [reg_p2] points to table -// reg_p3 and reg_p4 point to constants for scaling and -// reg_p5 contains parameter n -//*********************************************************************** -.global INTT_GS_rev2std_12289_asm -INTT_GS_rev2std_12289_asm: - push r12 - push r13 - push r14 - push r15 - push rbx - -// Stage m=1024 - vmovdqu ymm9, PERM00224466 - vmovdqu ymm14, MASK12x8 - mov r12, reg_p5 - shr r12, 1 // n/2 = 512 - xor r15, r15 // i = 0 - xor r10, r10 // j1 = 0 - mov r13, 8 - mov r14, 4 -loop1b: - vmovdqu ymm1, YMMWORD PTR [reg_p1+4*r10+4] // V = a[j+k] - vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j] - vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*512] // S - vpsubd ymm3, ymm0, ymm1 // U - V - vpaddd ymm0, ymm0, ymm1 // U + V - vpmuldq ymm3, ymm3, ymm2 // (U - V).S - vmovdqu ymm4, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm4, ymm4, 12 // c1 - vpslld ymm5, ymm3, 1 // 2*c0 - vpsubd ymm4, ymm3, ymm4 // c0-c1 - vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 - vpermd ymm1, ymm9, ymm1 - vpblendd ymm0, ymm0, ymm1, 0xaa - vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 - - add r10, r13 // j+8 - add r15, r14 // i+4 - cmp r15, r12 - jl loop1b - -// Stage m=512 - vmovdqu ymm9, PERM02134657 - vmovdqu ymm13, PERM0145 - vmovdqu ymm15, PERM2367 - shr r12, 1 // n/4 = 256 - xor r15, r15 // i = 0 - xor r10, r10 // j1 = 0 - mov r14, 32 -loop2b: - vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*256] // S = psi[m+i]->psi[m+i+3] - vpermq ymm8, ymm2, 0x50 - vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j]->a[j+7] - vpermd ymm1, ymm15, ymm0 - vpermd ymm0, ymm13, ymm0 - vpsubd 
ymm3, ymm0, ymm1 // U - V - vpaddd ymm0, ymm0, ymm1 // U + V - vpmuldq ymm3, ymm3, ymm8 // (U - V).S - vmovdqu ymm4, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm4, ymm4, 12 // c1 - vpslld ymm5, ymm3, 1 // 2*c0 - vpsubd ymm4, ymm3, ymm4 // c0-c1 - vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 - vpslldq ymm1, ymm1, 4 - vpblendd ymm0, ymm0, ymm1, 0xaa - vpermd ymm0, ymm9, ymm0 - vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 - - vpermq ymm8, ymm2, 0xfa - vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+7] - vpermd ymm1, ymm15, ymm0 - vpermd ymm0, ymm13, ymm0 - vpsubd ymm3, ymm0, ymm1 // U - V - vpaddd ymm0, ymm0, ymm1 // U + V - vpmuldq ymm3, ymm3, ymm8 // (U - V).S - vmovdqu ymm4, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm4, ymm4, 12 // c1 - vpslld ymm5, ymm3, 1 // 2*c0 - vpsubd ymm4, ymm3, ymm4 // c0-c1 - vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 - vpslldq ymm1, ymm1, 4 - vpblendd ymm0, ymm0, ymm1, 0xaa - vpermd ymm0, ymm9, ymm0 - vmovdqu YMMWORD PTR [reg_p1+4*r10+32], ymm0 - - vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*256+16]// S = psi[m+i]->psi[m+i+3] - vpermq ymm8, ymm2, 0x50 - vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+7] - vpermd ymm1, ymm15, ymm0 - vpermd ymm0, ymm13, ymm0 - vpsubd ymm3, ymm0, ymm1 // U - V - vpaddd ymm0, ymm0, ymm1 // U + V - vpmuldq ymm3, ymm3, ymm8 // (U - V).S - vmovdqu ymm4, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm4, ymm4, 12 // c1 - vpslld ymm5, ymm3, 1 // 2*c0 - vpsubd ymm4, ymm3, ymm4 // c0-c1 - vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 - vpslldq ymm1, ymm1, 4 - vpblendd ymm0, ymm0, ymm1, 0xaa - vpermd ymm0, ymm9, ymm0 - vmovdqu YMMWORD PTR [reg_p1+4*r10+64], ymm0 - - vpermq ymm8, ymm2, 0xfa - vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+96] // U = a[j]->a[j+7] - vpermd ymm1, ymm15, ymm0 - vpermd ymm0, ymm13, ymm0 - vpsubd ymm3, ymm0, ymm1 // U - V - vpaddd ymm0, ymm0, ymm1 // U + V - vpmuldq ymm3, ymm3, ymm8 // (U - V).S - vmovdqu ymm4, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm4, ymm4, 12 // c1 - vpslld ymm5, 
ymm3, 1 // 2*c0 - vpsubd ymm4, ymm3, ymm4 // c0-c1 - vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 - vpslldq ymm1, ymm1, 4 - vpblendd ymm0, ymm0, ymm1, 0xaa - vpermd ymm0, ymm9, ymm0 - vmovdqu YMMWORD PTR [reg_p1+4*r10+96], ymm0 - - add r10, r14 // j+32 - add r15, r13 // i+8 - cmp r15, r12 - jl loop2b - -// Stage m=256 - vmovdqu ymm12, PERM0246 - shr r12, 1 // n/8 = 128 - xor r15, r15 // i = 0 - xor r10, r10 // j1 = 0 -loop3b: - vbroadcastss ymm2, DWORD PTR [reg_p2+4*r15+4*128] // S - vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // V = a[j+k] - vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] - vpsubd ymm3, ymm0, ymm1 // U - V - vpaddd ymm0, ymm0, ymm1 // U + V - vpmuldq ymm3, ymm3, ymm2 // (U - V).S - vmovdqu ymm4, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm4, ymm4, 12 // c1 - vpslld ymm5, ymm3, 1 // 2*c0 - vpsubd ymm4, ymm3, ymm4 // c0-c1 - vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 - vpermd ymm0, ymm12, ymm0 - vpermd ymm1, ymm12, ymm1 - vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 - vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm1 - - add r10, r13 // j+8 - inc r15 // i+1 - cmp r15, r12 - jl loop3b - -// Stage m=128 - shr r12, 1 // n/16 = 64 - xor r15, r15 // i = 0 - xor r10, r10 // j1 = 0 - mov r14, 16 -loop4b: - vbroadcastss ymm11, DWORD PTR [reg_p2+4*r15+4*64] // S - vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r10+32] // V = a[j+k] - vpmovsxdq ymm15, XMMWORD PTR [reg_p1+4*r10+48] // V = a[j+k] - vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] - vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] - vpsubd ymm1, ymm0, ymm13 // U - V - vpaddd ymm0, ymm0, ymm13 // U + V - vpsubd ymm3, ymm2, ymm15 // U - V - vpaddd ymm2, ymm2, ymm15 // U + V - vpmuldq ymm1, ymm1, ymm11 // (U - V).S - vpmuldq ymm3, ymm3, ymm11 // (U - V).S - - vmovdqu ymm13, ymm1 - vpand ymm1, ymm14, ymm1 // c0 - vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm1, 1 // 2*c0 - vpsubd ymm13, ymm1, ymm13 // c0-c1 - vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 - - vmovdqu ymm13, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - 
vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm3, 1 // 2*c0 - vpsubd ymm13, ymm3, ymm13 // c0-c1 - vpaddd ymm3, ymm13, ymm15 // 3*c0-c1 - - vpermd ymm0, ymm12, ymm0 - vpermd ymm1, ymm12, ymm1 - vpermd ymm2, ymm12, ymm2 - vpermd ymm3, ymm12, ymm3 - vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 - vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm1 - vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 - vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm3 - - add r10, r14 // j+16 - inc r15 // i+1 - cmp r15, r12 - jl loop4b - -// Stages m=64 -> m=4 - mov r9, 5 // 5 iterations - mov rax, 8 -loop5b: - shl rax, 1 // k = 2*k - shr r12, 1 // m/2 - xor r15, r15 // i = 0 - xor r8, r8 -loop6b: - mov r10, r8 // Load j1 - mov r11, rax - dec r11 - add r11, r10 // j2 - mov r13, r12 - add r13, r15 // m/2+i - vbroadcastss ymm9, DWORD PTR [reg_p2+4*r13] // S - mov rbx, 4 - -loop7b: - mov r13, r10 - add r13, rax // j+k - vpmovsxdq ymm10, XMMWORD PTR [reg_p1+4*r13] // V = a[j+k] - vpmovsxdq ymm11, XMMWORD PTR [reg_p1+4*r13+16] // V = a[j+k] - vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r13+32] // V = a[j+k] - vpmovsxdq ymm15, XMMWORD PTR [reg_p1+4*r13+48] // V = a[j+k] - vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] - vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] - vpmovsxdq ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j] - vpmovsxdq ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j] - - vpsubd ymm1, ymm0, ymm10 // U - V - vpaddd ymm0, ymm0, ymm10 // U + V - vpsubd ymm3, ymm2, ymm11 // U - V - vpaddd ymm2, ymm2, ymm11 // U + V - vpsubd ymm5, ymm4, ymm13 // U - V - vpaddd ymm4, ymm4, ymm13 // U + V - vpsubd ymm7, ymm6, ymm15 // U - V - vpaddd ymm6, ymm6, ymm15 // U + V - - vpmuldq ymm1, ymm1, ymm9 // (U - V).S - vpmuldq ymm3, ymm3, ymm9 - vpmuldq ymm5, ymm5, ymm9 - vpmuldq ymm7, ymm7, ymm9 - - vmovdqu ymm13, ymm1 - vpand ymm1, ymm14, ymm1 // c0 - vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm1, 1 // 2*c0 - vpsubd ymm13, ymm1, ymm13 // c0-c1 - vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 - - cmp r9, rbx - 
jne skip1 - vmovdqu ymm13, ymm0 - vpand ymm0, ymm14, ymm0 // c0 - vpsrad ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm0, 1 // 2*c0 - vpsubd ymm13, ymm0, ymm13 // c0-c1 - vpaddd ymm0, ymm13, ymm15 // 3*c0-c1 - - vmovdqu ymm13, ymm1 - vpand ymm1, ymm14, ymm1 // c0 - vpsrad ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm1, 1 // 2*c0 - vpsubd ymm13, ymm1, ymm13 // c0-c1 - vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 -skip1: - vpermd ymm1, ymm12, ymm1 - vpermd ymm0, ymm12, ymm0 - - vmovdqu ymm13, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm3, 1 // 2*c0 - vpsubd ymm13, ymm3, ymm13 // c0-c1 - vpaddd ymm3, ymm13, ymm15 // 3*c0-c1 - vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 - vmovdqu XMMWORD PTR [reg_p1+4*r13], xmm1 - - cmp r9, rbx - jne skip2 - vmovdqu ymm13, ymm2 - vpand ymm2, ymm14, ymm2 // c0 - vpsrad ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm2, 1 // 2*c0 - vpsubd ymm13, ymm2, ymm13 // c0-c1 - vpaddd ymm2, ymm13, ymm15 // 3*c0-c1 - - vmovdqu ymm13, ymm3 - vpand ymm3, ymm14, ymm3 // c0 - vpsrad ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm3, 1 // 2*c0 - vpsubd ymm13, ymm3, ymm13 // c0-c1 - vpaddd ymm3, ymm13, ymm15 // 3*c0-c1 -skip2: - vpermd ymm3, ymm12, ymm3 - vpermd ymm2, ymm12, ymm2 - - vmovdqu ymm13, ymm5 - vpand ymm5, ymm14, ymm5 // c0 - vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm5, 1 // 2*c0 - vpsubd ymm13, ymm5, ymm13 // c0-c1 - vpaddd ymm5, ymm13, ymm15 // 3*c0-c1 - vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 - vmovdqu XMMWORD PTR [reg_p1+4*r13+16], xmm3 - - cmp r9, rbx - jne skip3 - vmovdqu ymm13, ymm4 - vpand ymm4, ymm14, ymm4 // c0 - vpsrad ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm4, 1 // 2*c0 - vpsubd ymm13, ymm4, ymm13 // c0-c1 - vpaddd ymm4, ymm13, ymm15 // 3*c0-c1 - - vmovdqu ymm13, ymm5 - vpand ymm5, ymm14, ymm5 // c0 - vpsrad ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm5, 1 // 2*c0 - vpsubd ymm13, ymm5, ymm13 // c0-c1 - vpaddd ymm5, ymm13, ymm15 // 3*c0-c1 -skip3: - vpermd ymm5, ymm12, ymm5 - vpermd ymm4, ymm12, ymm4 - - 
vmovdqu ymm13, ymm7 - vpand ymm7, ymm14, ymm7 // c0 - vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm7, 1 // 2*c0 - vpsubd ymm13, ymm7, ymm13 // c0-c1 - vpaddd ymm7, ymm13, ymm15 // 3*c0-c1 - vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm4 - vmovdqu XMMWORD PTR [reg_p1+4*r13+32], xmm5 - - cmp r9, rbx - jne skip4 - vmovdqu ymm13, ymm6 - vpand ymm6, ymm14, ymm6 // c0 - vpsrad ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm6, 1 // 2*c0 - vpsubd ymm13, ymm6, ymm13 // c0-c1 - vpaddd ymm6, ymm13, ymm15 // 3*c0-c1 - - vmovdqu ymm13, ymm7 - vpand ymm7, ymm14, ymm7 // c0 - vpsrad ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm7, 1 // 2*c0 - vpsubd ymm13, ymm7, ymm13 // c0-c1 - vpaddd ymm7, ymm13, ymm15 // 3*c0-c1 -skip4: - vpermd ymm7, ymm12, ymm7 - vpermd ymm6, ymm12, ymm6 - vmovdqu XMMWORD PTR [reg_p1+4*r13+48], xmm7 - vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm6 - - add r10, r14 - cmp r10, r11 - jl loop7b - mov rbx, rax - shl rbx, 1 // 2*k - add r8, rbx // j1+2*k - inc r15 - cmp r15, r12 - jl loop6b - dec r9 - jnz loop5b - -// Scaling step - shl rax, 1 // k = 2*k = 512 - xor r10, r10 // j = 0 - mov r14, 4 - movq xmm0, reg_p3 - vbroadcastsd ymm10, xmm0 // S = omegainv1N_rev - movq xmm0, reg_p4 - vbroadcastsd ymm11, xmm0 // T = Ninv -loop8b: - vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r10+4*512] // V = a[j+k] - vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] - vpsubd ymm1, ymm0, ymm13 // U - V - vpaddd ymm0, ymm0, ymm13 // U + V - vpmuldq ymm1, ymm1, ymm10 // (U - V).S - vpmuldq ymm0, ymm0, ymm11 // (U + V).T - - vmovdqu ymm13, ymm0 - vpand ymm0, ymm14, ymm0 // c0 - vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm0, 1 // 2*c0 - vpsubd ymm13, ymm0, ymm13 // c0-c1 - vpaddd ymm0, ymm13, ymm15 // 3*c0-c1 - - vmovdqu ymm13, ymm1 - vpand ymm1, ymm14, ymm1 // c0 - vpsrlq ymm13, ymm13, 12 // c1 - vpslld ymm15, ymm1, 1 // 2*c0 - vpsubd ymm13, ymm1, ymm13 // c0-c1 - vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 - - vpermd ymm0, ymm12, ymm0 - vpermd ymm1, ymm12, ymm1 - vmovdqu XMMWORD PTR 
[reg_p1+4*r10], xmm0 - vmovdqu XMMWORD PTR [reg_p1+4*r10+4*512], xmm1 - - add r10, r14 // j+4 - cmp r10, rax - jl loop8b -loop9b: - pop rbx - pop r15 - pop r14 - pop r13 - pop r12 - ret - - -//*********************************************************************** -// Component-wise multiplication and addition -// Operation: d [reg_p4] <- a [reg_p1] * b [reg_p2] + c [reg_p3] -// reg_p5 contains parameter n -//*********************************************************************** -.global pmuladd_asm -pmuladd_asm: - vmovdqu ymm5, PERM0246 - vmovdqu ymm6, MASK12x8 - xor rax, rax - movq r11, 4 -lazo2: - vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*rax] // a - vpmovsxdq ymm1, XMMWORD PTR [reg_p2+4*rax] // b - vpmovsxdq ymm2, XMMWORD PTR [reg_p3+4*rax] // c - vpmuldq ymm0, ymm1, ymm0 - vpaddq ymm0, ymm2, ymm0 - - vmovdqu ymm3, ymm0 - vpand ymm0, ymm6, ymm0 // c0 - vpsrlq ymm3, ymm3, 12 // c1 - vpslld ymm4, ymm0, 1 // 2*c0 - vpsubd ymm3, ymm0, ymm3 // c0-c1 - vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 - - vmovdqu ymm3, ymm0 - vpand ymm0, ymm6, ymm0 // c0 - vpsrad ymm3, ymm3, 12 // c1 - vpslld ymm4, ymm0, 1 // 2*c0 - vpsubd ymm3, ymm0, ymm3 // c0-c1 - vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 - - vpermd ymm0, ymm5, ymm0 - vmovdqu XMMWORD PTR [reg_p4+4*rax], xmm0 - - add rax, r11 // j+4 - cmp rax, reg_p5 - jl lazo2 - ret - - -//*********************************************************************** -// Component-wise multiplication -// Operation: c [reg_p3] <- a [reg_p1] * b [reg_p2] -// reg_p4 contains parameter n -//*********************************************************************** -.global pmul_asm -pmul_asm: - vmovdqu ymm5, PERM0246 - vmovdqu ymm6, MASK12x8 - xor rax, rax - movq r11, 4 -lazo3: - vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*rax] // a - vpmovsxdq ymm1, XMMWORD PTR [reg_p2+4*rax] // b - vpmuldq ymm0, ymm1, ymm0 - - vmovdqu ymm3, ymm0 - vpand ymm0, ymm6, ymm0 // c0 - vpsrlq ymm3, ymm3, 12 // c1 - vpslld ymm4, ymm0, 1 // 2*c0 - vpsubd ymm3, ymm0, ymm3 // c0-c1 - vpaddd ymm0, 
ymm3, ymm4 // 3*c0-c1 - - vmovdqu ymm3, ymm0 - vpand ymm0, ymm6, ymm0 // c0 - vpsrad ymm3, ymm3, 12 // c1 - vpslld ymm4, ymm0, 1 // 2*c0 - vpsubd ymm3, ymm0, ymm3 // c0-c1 - vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 - - vpermd ymm0, ymm5, ymm0 - vmovdqu XMMWORD PTR [reg_p3+4*rax], xmm0 - - add rax, r11 // j+4 - cmp rax, reg_p4 - jl lazo3 - ret - - -//*********************************************************************** -// Two consecutive reductions -// Operation: c [reg_p1] <- a [reg_p1] -// reg_p2 contains parameter n -//*********************************************************************** -.global two_reduce12289_asm -two_reduce12289_asm: - vmovdqu ymm6, MASK12x8 - vmovdqu ymm7, PRIME8x - xor rax, rax - movq r11, 8 -lazo4: - vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // a - - vmovdqu ymm3, ymm0 - vpand ymm0, ymm6, ymm0 // c0 - vpsrad ymm3, ymm3, 12 // c1 - vpslld ymm4, ymm0, 1 // 2*c0 - vpsubd ymm3, ymm0, ymm3 // c0-c1 - vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 - - vmovdqu ymm3, ymm0 - vpand ymm0, ymm6, ymm0 // c0 - vpsrad ymm3, ymm3, 12 // c1 - vpslld ymm4, ymm0, 1 // 2*c0 - vpsubd ymm3, ymm0, ymm3 // c0-c1 - vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 - - vpsrad ymm2, ymm0, 31 - vpand ymm2, ymm7, ymm2 - vpaddd ymm2, ymm0, ymm2 - vpsubd ymm0, ymm2, ymm7 - - vpsrad ymm2, ymm0, 31 - vpand ymm2, ymm7, ymm2 - vpaddd ymm0, ymm0, ymm2 - - vmovdqu YMMWORD PTR [reg_p1+4*rax], ymm0 - - add rax, r11 // j+8 - cmp rax, reg_p2 - jl lazo4 - ret - - -//*********************************************************************** -// Encoding -// Operation: c [reg_p2] <- a [reg_p1] -//*********************************************************************** -.global encode_asm -encode_asm: - vmovdqu ymm6, MASK32 - vmovdqu ymm7, MASK42 - mov r9, 1024 - xor rax, rax - xor r10, r10 - mov r11, 14 - mov rcx, 8 -lazo5: - vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // a - - vpsrlq ymm1, ymm0, 18 - vpsllq ymm2, ymm0, 4 - vpand ymm0, ymm0, ymm6 - vpsrldq ymm2, ymm2, 5 - vpsrlq ymm3, ymm1, 4 - vpand ymm1, ymm1, ymm6 
- vpand ymm2, ymm2, ymm7 - vpsrldq ymm3, ymm3, 4 - vpor ymm0, ymm0, ymm1 - vpor ymm0, ymm0, ymm2 - vpor ymm0, ymm0, ymm3 - vpermq ymm1, ymm0, 0x0e - - vmovdqu XMMWORD PTR [reg_p2+r10], xmm0 - vmovdqu XMMWORD PTR [reg_p2+r10+7], xmm1 - - add r10, r11 - add rax, rcx // j+8 - cmp rax, r9 - jl lazo5 - ret - - -//*********************************************************************** -// Decoding -// Operation: c [reg_p2] <- a [reg_p1] -//*********************************************************************** -.global decode_asm -decode_asm: - vmovdqu ymm6, MASK14_1 - vmovdqu ymm7, MASK14_2 - vmovdqu ymm8, MASK14_3 - vmovdqu ymm9, MASK14_4 - mov r9, 1024 - xor rax, rax - xor r10, r10 - mov r11, 14 - mov rcx, 8 -lazo6: - vmovdqu xmm0, XMMWORD PTR [reg_p1+r10] - vmovdqu xmm1, XMMWORD PTR [reg_p1+r10+7] - vinserti128 ymm0, ymm0, xmm1, 1 - - vpand ymm1, ymm0, ymm6 - vpand ymm2, ymm0, ymm7 - vpand ymm3, ymm0, ymm8 - vpand ymm4, ymm0, ymm9 - - vpsllq ymm2, ymm2, 18 - vpsllq ymm3, ymm3, 4 - vpslldq ymm3, ymm3, 4 - vpsrlq ymm4, ymm4, 2 - vpslldq ymm4, ymm4, 7 - - vpor ymm1, ymm1, ymm2 - vpor ymm1, ymm1, ymm3 - vpor ymm1, ymm1, ymm4 - - vmovdqu YMMWORD PTR [reg_p2+4*rax], ymm1 - - add r10, r11 - add rax, rcx // j+8 - cmp rax, r9 - jl lazo6 +//**************************************************************************************** +// LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// +// Abstract: NTT functions in x64 assembly using AVX2 vector instructions for Linux +// +//**************************************************************************************** + +.intel_syntax noprefix + +// Registers that are used for parameter passing: +#define reg_p1 rdi +#define reg_p2 rsi +#define reg_p3 rdx +#define reg_p4 rcx +#define reg_p5 r8 + + +.text +//*********************************************************************** +// Forward NTT +// Operation: a [reg_p1] <- NTT(a) [reg_p1], +// [reg_p2] points to table and +// reg_p3 contains parameter n +//*********************************************************************** +.global NTT_CT_std2rev_12289_asm +NTT_CT_std2rev_12289_asm: + push r12 + push r13 + push r14 + +// Stages m=1 -> m=32 + mov r9, 1 // m = 1 + mov rax, reg_p3 + mov r12, reg_p3 + shr r12, 4 // n/16 + vmovdqu ymm14, MASK12x8 + vmovdqu ymm12, PERM0246 + mov r14, 16 + mov rcx, 11 +loop1: + shr rax, 1 // k = k/2 + dec rcx + xor rdx, rdx // i = 0 +loop2: + mov r10, rdx + mov r11, rax + dec r11 + shl r10, cl // j1 + add r11, r10 // j2 + mov r13, r9 + add r13, rdx // m+i + vbroadcastss ymm11, DWORD PTR [reg_p2+4*r13] // S + +loop3: + mov r13, r10 + add r13, rax // j+k + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r13] // a[j+k] + vpmovsxdq ymm3, XMMWORD PTR [reg_p1+4*r13+16] // a[j+k] + vpmovsxdq ymm5, XMMWORD PTR [reg_p1+4*r13+32] // a[j+k] + vpmovsxdq ymm7, XMMWORD PTR [reg_p1+4*r13+48] // a[j+k] + + vpmuldq ymm1, ymm1, ymm11 // a[j+k].S + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm5, ymm5, ymm11 + vpmuldq ymm7, ymm7, ymm11 + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + + vmovdqu ymm13, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm1, 1 // 2*c0 + vpsubd ymm13, ymm1, ymm13 // c0-c1 + vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 + vpsubd ymm1, ymm0, ymm13 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm13 // a[j] = U + V + vpermd ymm1, ymm12, ymm1 + vpermd ymm0, ymm12, ymm0 + vpmovsxdq ymm2, 
XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] + + vmovdqu ymm13, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm3, 1 // 2*c0 + vpsubd ymm13, ymm3, ymm13 // c0-c1 + vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 + vpsubd ymm3, ymm2, ymm13 // a[j+k] = U - V + vpaddd ymm2, ymm2, ymm13 // a[j] = U + V + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r13], xmm1 + vpermd ymm3, ymm12, ymm3 + vpermd ymm2, ymm12, ymm2 + vpmovsxdq ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j] + + vmovdqu ymm13, ymm5 + vpand ymm5, ymm14, ymm5 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm5, 1 // 2*c0 + vpsubd ymm13, ymm5, ymm13 // c0-c1 + vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 + vpsubd ymm5, ymm4, ymm13 // a[j+k] = U - V + vpaddd ymm4, ymm4, ymm13 // a[j] = U + V + vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 + vmovdqu XMMWORD PTR [reg_p1+4*r13+16], xmm3 + vpermd ymm5, ymm12, ymm5 + vpermd ymm4, ymm12, ymm4 + vpmovsxdq ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j] + + vmovdqu ymm13, ymm7 + vpand ymm7, ymm14, ymm7 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm7, 1 // 2*c0 + vpsubd ymm13, ymm7, ymm13 // c0-c1 + vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 + vpsubd ymm7, ymm6, ymm13 // a[j+k] = U - V + vpaddd ymm6, ymm6, ymm13 // a[j] = U + V + vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm4 + vmovdqu XMMWORD PTR [reg_p1+4*r13+32], xmm5 + vpermd ymm6, ymm12, ymm6 + vpermd ymm7, ymm12, ymm7 + vmovdqu XMMWORD PTR [reg_p1+4*r13+48], xmm7 + vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm6 + + add r10, r14 + cmp r10, r11 + jl loop3 + inc rdx + cmp rdx, r9 + jl loop2 + shl r9, 1 + cmp r9, r12 + jl loop1 + +// Stage m=64 + xor rdx, rdx // i = 0 + xor r10, r10 // j1 = 0 +loop4: + vbroadcastss ymm11, DWORD PTR [reg_p2+4*rdx+4*64] // S + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+32] // a[j+k] + vpmovsxdq ymm3, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k] + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpmovsxdq ymm2, XMMWORD 
PTR [reg_p1+4*r10+16] // U = a[j] + vpmuldq ymm1, ymm1, ymm11 // a[j+k].S + vpmuldq ymm3, ymm3, ymm11 // a[j+k].S + + vmovdqu ymm13, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm1, 1 // 2*c0 + vpsubd ymm13, ymm1, ymm13 // c0-c1 + vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 + + vmovdqu ymm10, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm10, ymm10, 12 // c1 + vpslld ymm15, ymm3, 1 // 2*c0 + vpsubd ymm10, ymm3, ymm10 // c0-c1 + vpaddd ymm10, ymm10, ymm15 // V = 3*c0-c1 + + vpsubd ymm1, ymm0, ymm13 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm13 // a[j] = U + V + vpsubd ymm3, ymm2, ymm10 // a[j+k] = U - V + vpaddd ymm2, ymm2, ymm10 // a[j] = U + V + + vpermd ymm0, ymm12, ymm0 + vpermd ymm1, ymm12, ymm1 + vpermd ymm2, ymm12, ymm2 + vpermd ymm3, ymm12, ymm3 + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm1 + vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 + vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm3 + + add r10, r14 // j+16 + inc rdx // i+1 + cmp rdx, r9 + jl loop4 + +// Stage m=128 + shl r9, 1 + xor rdx, rdx // i = 0 + xor r10, r10 // j1 = 0 + mov r13, 8 +loop6: + vbroadcastss ymm2, DWORD PTR [reg_p2+4*rdx+4*128] // S + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k] + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpmuldq ymm1, ymm1, ymm2 // a[j+k].S + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm14, ymm0 // c0 + vpsrad ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, ymm3, ymm4 // U = 3*c0-c1 + + vmovdqu ymm3, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrlq ymm4, ymm3, 24 // c2 + vpsrad ymm3, ymm3, 12 // xc1 + vpand ymm3, ymm14, ymm3 // c1 + vpslld ymm5, ymm1, 3 // 8*c0 + vpaddd ymm4, ymm1, ymm4 // c0+c2 + vpaddd ymm4, ymm4, ymm5 // 9*c0+c2 + vpslld ymm5, ymm3, 1 // 2*c1 + vpaddd ymm1, ymm0, ymm3 // U+c1 + vpsubd ymm0, ymm0, ymm3 // U-c1 + vpsubd ymm4, ymm4, ymm5 // 9*c0-2*c1+c2 + vpaddd ymm0, ymm0, ymm4 // U+(9*c0-3*c1+c2) + 
vpsubd ymm1, ymm1, ymm4 // U-(9*c0-3*c1+c2) + vpermd ymm0, ymm12, ymm0 + vpermd ymm1, ymm12, ymm1 + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm1 + + add r10, r13 // j+8 + inc rdx // i+1 + cmp rdx, r9 + jl loop6 + +// Stage m=256 + vmovdqu ymm9, PERM02134657 + shl r9, 1 + xor rdx, rdx // i = 0 + xor r10, r10 // j1 = 0 + mov r14, 32 +loop7: + vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256] // S = psi[m+i]->psi[m+i+3] + vpermq ymm8, ymm2, 0x50 + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j]->a[j+3] + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]->a[j+k+3] + vpermq ymm3, ymm0, 0x4e + vinserti128 ymm0, ymm0, xmm1, 1 // U + vpblendd ymm1, ymm1, ymm3, 15 + vpmuldq ymm3, ymm1, ymm8 // a[j+k].S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 + vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm4 // a[j] = U + V + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 + + vpermq ymm8, ymm2, 0xfa + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+3] + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]->a[j+k+3] + vpermq ymm3, ymm0, 0x4e + vinserti128 ymm0, ymm0, xmm1, 1 // U + vpblendd ymm1, ymm1, ymm3, 15 + vpmuldq ymm3, ymm1, ymm8 // a[j+k].S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 + vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm4 // a[j] = U + V + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10+32], ymm0 + + vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256+16] // S = psi[m+i]->psi[m+i+3] + vpermq ymm8, ymm2, 0x50 + vpmovsxdq ymm0, 
XMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+3] + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+80] // a[j+k]->a[j+k+3] + vpermq ymm3, ymm0, 0x4e + vinserti128 ymm0, ymm0, xmm1, 1 // U + vpblendd ymm1, ymm1, ymm3, 15 + vpmuldq ymm3, ymm1, ymm8 // a[j+k].S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 + vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm4 // a[j] = U + V + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10+64], ymm0 + + vpermq ymm8, ymm2, 0xfa + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+96] // U = a[j]->a[j+3] + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+112] // a[j+k]->a[j+k+3] + vpermq ymm3, ymm0, 0x4e + vinserti128 ymm0, ymm0, xmm1, 1 // U + vpblendd ymm1, ymm1, ymm3, 15 + vpmuldq ymm3, ymm1, ymm8 // a[j+k].S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 + vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V + vpaddd ymm0, ymm0, ymm4 // a[j] = U + V + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10+96], ymm0 + + add r10, r14 // j+32 + add rdx, r13 // i+8 + cmp rdx, r9 + jl loop7 + +// Stage m=512 + vmovdqu ymm9, PERM00224466 + shl r9, 1 // m = n/2 + xor rdx, rdx // i = 0 + xor r10, r10 // j1 = 0 + mov r14, 4 +loop8: + vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*512] // S + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j] + vmovdqu ymm1, YMMWORD PTR [reg_p1+4*r10+4] // a[j+k] + vpmuldq ymm3, ymm1, ymm2 // a[j+k].S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 + vpsubd ymm1, ymm0, ymm4 
// a[j+k] = U - V + vpaddd ymm0, ymm0, ymm4 // a[j] = U + V + vpermd ymm1, ymm9, ymm1 + vpblendd ymm0, ymm0, ymm1, 0xaa + vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 + + add r10, r13 // j+8 + add rdx, r14 // i+4 + cmp rdx, r9 + jl loop8 + + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Inverse NTT +// Operation: a [reg_p1] <- INTT(a) [reg_p1], +// [reg_p2] points to table +// reg_p3 and reg_p4 point to constants for scaling and +// reg_p5 contains parameter n +//*********************************************************************** +.global INTT_GS_rev2std_12289_asm +INTT_GS_rev2std_12289_asm: + push r12 + push r13 + push r14 + push r15 + push rbx + +// Stage m=1024 + vmovdqu ymm9, PERM00224466 + vmovdqu ymm14, MASK12x8 + mov r12, reg_p5 + shr r12, 1 // n/2 = 512 + xor r15, r15 // i = 0 + xor r10, r10 // j1 = 0 + mov r13, 8 + mov r14, 4 +loop1b: + vmovdqu ymm1, YMMWORD PTR [reg_p1+4*r10+4] // V = a[j+k] + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*512] // S + vpsubd ymm3, ymm0, ymm1 // U - V + vpaddd ymm0, ymm0, ymm1 // U + V + vpmuldq ymm3, ymm3, ymm2 // (U - V).S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 + vpermd ymm1, ymm9, ymm1 + vpblendd ymm0, ymm0, ymm1, 0xaa + vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 + + add r10, r13 // j+8 + add r15, r14 // i+4 + cmp r15, r12 + jl loop1b + +// Stage m=512 + vmovdqu ymm9, PERM02134657 + vmovdqu ymm13, PERM0145 + vmovdqu ymm15, PERM2367 + shr r12, 1 // n/4 = 256 + xor r15, r15 // i = 0 + xor r10, r10 // j1 = 0 + mov r14, 32 +loop2b: + vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*256] // S = psi[m+i]->psi[m+i+3] + vpermq ymm8, ymm2, 0x50 + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j]->a[j+7] + vpermd ymm1, ymm15, ymm0 + vpermd ymm0, ymm13, ymm0 + vpsubd 
ymm3, ymm0, ymm1 // U - V + vpaddd ymm0, ymm0, ymm1 // U + V + vpmuldq ymm3, ymm3, ymm8 // (U - V).S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 + + vpermq ymm8, ymm2, 0xfa + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+7] + vpermd ymm1, ymm15, ymm0 + vpermd ymm0, ymm13, ymm0 + vpsubd ymm3, ymm0, ymm1 // U - V + vpaddd ymm0, ymm0, ymm1 // U + V + vpmuldq ymm3, ymm3, ymm8 // (U - V).S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10+32], ymm0 + + vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*256+16]// S = psi[m+i]->psi[m+i+3] + vpermq ymm8, ymm2, 0x50 + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+7] + vpermd ymm1, ymm15, ymm0 + vpermd ymm0, ymm13, ymm0 + vpsubd ymm3, ymm0, ymm1 // U - V + vpaddd ymm0, ymm0, ymm1 // U + V + vpmuldq ymm3, ymm3, ymm8 // (U - V).S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10+64], ymm0 + + vpermq ymm8, ymm2, 0xfa + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+96] // U = a[j]->a[j+7] + vpermd ymm1, ymm15, ymm0 + vpermd ymm0, ymm13, ymm0 + vpsubd ymm3, ymm0, ymm1 // U - V + vpaddd ymm0, ymm0, ymm1 // U + V + vpmuldq ymm3, ymm3, ymm8 // (U - V).S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, 
ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 + vpslldq ymm1, ymm1, 4 + vpblendd ymm0, ymm0, ymm1, 0xaa + vpermd ymm0, ymm9, ymm0 + vmovdqu YMMWORD PTR [reg_p1+4*r10+96], ymm0 + + add r10, r14 // j+32 + add r15, r13 // i+8 + cmp r15, r12 + jl loop2b + +// Stage m=256 + vmovdqu ymm12, PERM0246 + shr r12, 1 // n/8 = 128 + xor r15, r15 // i = 0 + xor r10, r10 // j1 = 0 +loop3b: + vbroadcastss ymm2, DWORD PTR [reg_p2+4*r15+4*128] // S + vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // V = a[j+k] + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpsubd ymm3, ymm0, ymm1 // U - V + vpaddd ymm0, ymm0, ymm1 // U + V + vpmuldq ymm3, ymm3, ymm2 // (U - V).S + vmovdqu ymm4, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm4, ymm4, 12 // c1 + vpslld ymm5, ymm3, 1 // 2*c0 + vpsubd ymm4, ymm3, ymm4 // c0-c1 + vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 + vpermd ymm0, ymm12, ymm0 + vpermd ymm1, ymm12, ymm1 + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm1 + + add r10, r13 // j+8 + inc r15 // i+1 + cmp r15, r12 + jl loop3b + +// Stage m=128 + shr r12, 1 // n/16 = 64 + xor r15, r15 // i = 0 + xor r10, r10 // j1 = 0 + mov r14, 16 +loop4b: + vbroadcastss ymm11, DWORD PTR [reg_p2+4*r15+4*64] // S + vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r10+32] // V = a[j+k] + vpmovsxdq ymm15, XMMWORD PTR [reg_p1+4*r10+48] // V = a[j+k] + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] + vpsubd ymm1, ymm0, ymm13 // U - V + vpaddd ymm0, ymm0, ymm13 // U + V + vpsubd ymm3, ymm2, ymm15 // U - V + vpaddd ymm2, ymm2, ymm15 // U + V + vpmuldq ymm1, ymm1, ymm11 // (U - V).S + vpmuldq ymm3, ymm3, ymm11 // (U - V).S + + vmovdqu ymm13, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm1, 1 // 2*c0 + vpsubd ymm13, ymm1, ymm13 // c0-c1 + vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 + + vmovdqu ymm13, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + 
vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm3, 1 // 2*c0 + vpsubd ymm13, ymm3, ymm13 // c0-c1 + vpaddd ymm3, ymm13, ymm15 // 3*c0-c1 + + vpermd ymm0, ymm12, ymm0 + vpermd ymm1, ymm12, ymm1 + vpermd ymm2, ymm12, ymm2 + vpermd ymm3, ymm12, ymm3 + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm1 + vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 + vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm3 + + add r10, r14 // j+16 + inc r15 // i+1 + cmp r15, r12 + jl loop4b + +// Stages m=64 -> m=4 + mov r9, 5 // 5 iterations + mov rax, 8 +loop5b: + shl rax, 1 // k = 2*k + shr r12, 1 // m/2 + xor r15, r15 // i = 0 + xor r8, r8 +loop6b: + mov r10, r8 // Load j1 + mov r11, rax + dec r11 + add r11, r10 // j2 + mov r13, r12 + add r13, r15 // m/2+i + vbroadcastss ymm9, DWORD PTR [reg_p2+4*r13] // S + mov rbx, 4 + +loop7b: + mov r13, r10 + add r13, rax // j+k + vpmovsxdq ymm10, XMMWORD PTR [reg_p1+4*r13] // V = a[j+k] + vpmovsxdq ymm11, XMMWORD PTR [reg_p1+4*r13+16] // V = a[j+k] + vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r13+32] // V = a[j+k] + vpmovsxdq ymm15, XMMWORD PTR [reg_p1+4*r13+48] // V = a[j+k] + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] + vpmovsxdq ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j] + vpmovsxdq ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j] + + vpsubd ymm1, ymm0, ymm10 // U - V + vpaddd ymm0, ymm0, ymm10 // U + V + vpsubd ymm3, ymm2, ymm11 // U - V + vpaddd ymm2, ymm2, ymm11 // U + V + vpsubd ymm5, ymm4, ymm13 // U - V + vpaddd ymm4, ymm4, ymm13 // U + V + vpsubd ymm7, ymm6, ymm15 // U - V + vpaddd ymm6, ymm6, ymm15 // U + V + + vpmuldq ymm1, ymm1, ymm9 // (U - V).S + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm5, ymm5, ymm9 + vpmuldq ymm7, ymm7, ymm9 + + vmovdqu ymm13, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm1, 1 // 2*c0 + vpsubd ymm13, ymm1, ymm13 // c0-c1 + vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 + + cmp r9, rbx + 
jne skip1 + vmovdqu ymm13, ymm0 + vpand ymm0, ymm14, ymm0 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm0, 1 // 2*c0 + vpsubd ymm13, ymm0, ymm13 // c0-c1 + vpaddd ymm0, ymm13, ymm15 // 3*c0-c1 + + vmovdqu ymm13, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm1, 1 // 2*c0 + vpsubd ymm13, ymm1, ymm13 // c0-c1 + vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 +skip1: + vpermd ymm1, ymm12, ymm1 + vpermd ymm0, ymm12, ymm0 + + vmovdqu ymm13, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm3, 1 // 2*c0 + vpsubd ymm13, ymm3, ymm13 // c0-c1 + vpaddd ymm3, ymm13, ymm15 // 3*c0-c1 + vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r13], xmm1 + + cmp r9, rbx + jne skip2 + vmovdqu ymm13, ymm2 + vpand ymm2, ymm14, ymm2 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm2, 1 // 2*c0 + vpsubd ymm13, ymm2, ymm13 // c0-c1 + vpaddd ymm2, ymm13, ymm15 // 3*c0-c1 + + vmovdqu ymm13, ymm3 + vpand ymm3, ymm14, ymm3 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm3, 1 // 2*c0 + vpsubd ymm13, ymm3, ymm13 // c0-c1 + vpaddd ymm3, ymm13, ymm15 // 3*c0-c1 +skip2: + vpermd ymm3, ymm12, ymm3 + vpermd ymm2, ymm12, ymm2 + + vmovdqu ymm13, ymm5 + vpand ymm5, ymm14, ymm5 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm5, 1 // 2*c0 + vpsubd ymm13, ymm5, ymm13 // c0-c1 + vpaddd ymm5, ymm13, ymm15 // 3*c0-c1 + vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 + vmovdqu XMMWORD PTR [reg_p1+4*r13+16], xmm3 + + cmp r9, rbx + jne skip3 + vmovdqu ymm13, ymm4 + vpand ymm4, ymm14, ymm4 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm4, 1 // 2*c0 + vpsubd ymm13, ymm4, ymm13 // c0-c1 + vpaddd ymm4, ymm13, ymm15 // 3*c0-c1 + + vmovdqu ymm13, ymm5 + vpand ymm5, ymm14, ymm5 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm5, 1 // 2*c0 + vpsubd ymm13, ymm5, ymm13 // c0-c1 + vpaddd ymm5, ymm13, ymm15 // 3*c0-c1 +skip3: + vpermd ymm5, ymm12, ymm5 + vpermd ymm4, ymm12, ymm4 + + 
vmovdqu ymm13, ymm7 + vpand ymm7, ymm14, ymm7 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm7, 1 // 2*c0 + vpsubd ymm13, ymm7, ymm13 // c0-c1 + vpaddd ymm7, ymm13, ymm15 // 3*c0-c1 + vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm4 + vmovdqu XMMWORD PTR [reg_p1+4*r13+32], xmm5 + + cmp r9, rbx + jne skip4 + vmovdqu ymm13, ymm6 + vpand ymm6, ymm14, ymm6 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm6, 1 // 2*c0 + vpsubd ymm13, ymm6, ymm13 // c0-c1 + vpaddd ymm6, ymm13, ymm15 // 3*c0-c1 + + vmovdqu ymm13, ymm7 + vpand ymm7, ymm14, ymm7 // c0 + vpsrad ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm7, 1 // 2*c0 + vpsubd ymm13, ymm7, ymm13 // c0-c1 + vpaddd ymm7, ymm13, ymm15 // 3*c0-c1 +skip4: + vpermd ymm7, ymm12, ymm7 + vpermd ymm6, ymm12, ymm6 + vmovdqu XMMWORD PTR [reg_p1+4*r13+48], xmm7 + vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm6 + + add r10, r14 + cmp r10, r11 + jl loop7b + mov rbx, rax + shl rbx, 1 // 2*k + add r8, rbx // j1+2*k + inc r15 + cmp r15, r12 + jl loop6b + dec r9 + jnz loop5b + +// Scaling step + shl rax, 1 // k = 2*k = 512 + xor r10, r10 // j = 0 + mov r14, 4 + movq xmm0, reg_p3 + vbroadcastsd ymm10, xmm0 // S = omegainv1N_rev + movq xmm0, reg_p4 + vbroadcastsd ymm11, xmm0 // T = Ninv +loop8b: + vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r10+4*512] // V = a[j+k] + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] + vpsubd ymm1, ymm0, ymm13 // U - V + vpaddd ymm0, ymm0, ymm13 // U + V + vpmuldq ymm1, ymm1, ymm10 // (U - V).S + vpmuldq ymm0, ymm0, ymm11 // (U + V).T + + vmovdqu ymm13, ymm0 + vpand ymm0, ymm14, ymm0 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm0, 1 // 2*c0 + vpsubd ymm13, ymm0, ymm13 // c0-c1 + vpaddd ymm0, ymm13, ymm15 // 3*c0-c1 + + vmovdqu ymm13, ymm1 + vpand ymm1, ymm14, ymm1 // c0 + vpsrlq ymm13, ymm13, 12 // c1 + vpslld ymm15, ymm1, 1 // 2*c0 + vpsubd ymm13, ymm1, ymm13 // c0-c1 + vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 + + vpermd ymm0, ymm12, ymm0 + vpermd ymm1, ymm12, ymm1 + vmovdqu XMMWORD PTR 
[reg_p1+4*r10], xmm0 + vmovdqu XMMWORD PTR [reg_p1+4*r10+4*512], xmm1 + + add r10, r14 // j+4 + cmp r10, rax + jl loop8b +loop9b: + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Component-wise multiplication and addition +// Operation: d [reg_p4] <- a [reg_p1] * b [reg_p2] + c [reg_p3] +// reg_p5 contains parameter n +//*********************************************************************** +.global pmuladd_asm +pmuladd_asm: + vmovdqu ymm5, PERM0246 + vmovdqu ymm6, MASK12x8 + xor rax, rax + movq r11, 4 +lazo2: + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*rax] // a + vpmovsxdq ymm1, XMMWORD PTR [reg_p2+4*rax] // b + vpmovsxdq ymm2, XMMWORD PTR [reg_p3+4*rax] // c + vpmuldq ymm0, ymm1, ymm0 + vpaddq ymm0, ymm2, ymm0 + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm6, ymm0 // c0 + vpsrlq ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm6, ymm0 // c0 + vpsrad ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 + + vpermd ymm0, ymm5, ymm0 + vmovdqu XMMWORD PTR [reg_p4+4*rax], xmm0 + + add rax, r11 // j+4 + cmp rax, reg_p5 + jl lazo2 + ret + + +//*********************************************************************** +// Component-wise multiplication +// Operation: c [reg_p3] <- a [reg_p1] * b [reg_p2] +// reg_p4 contains parameter n +//*********************************************************************** +.global pmul_asm +pmul_asm: + vmovdqu ymm5, PERM0246 + vmovdqu ymm6, MASK12x8 + xor rax, rax + movq r11, 4 +lazo3: + vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*rax] // a + vpmovsxdq ymm1, XMMWORD PTR [reg_p2+4*rax] // b + vpmuldq ymm0, ymm1, ymm0 + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm6, ymm0 // c0 + vpsrlq ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, 
ymm3, ymm4 // 3*c0-c1 + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm6, ymm0 // c0 + vpsrad ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 + + vpermd ymm0, ymm5, ymm0 + vmovdqu XMMWORD PTR [reg_p3+4*rax], xmm0 + + add rax, r11 // j+4 + cmp rax, reg_p4 + jl lazo3 + ret + + +//*********************************************************************** +// Two consecutive reductions +// Operation: c [reg_p1] <- a [reg_p1] +// reg_p2 contains parameter n +//*********************************************************************** +.global two_reduce12289_asm +two_reduce12289_asm: + vmovdqu ymm6, MASK12x8 + vmovdqu ymm7, PRIME8x + xor rax, rax + movq r11, 8 +lazo4: + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // a + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm6, ymm0 // c0 + vpsrad ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 + + vmovdqu ymm3, ymm0 + vpand ymm0, ymm6, ymm0 // c0 + vpsrad ymm3, ymm3, 12 // c1 + vpslld ymm4, ymm0, 1 // 2*c0 + vpsubd ymm3, ymm0, ymm3 // c0-c1 + vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 + + vpsrad ymm2, ymm0, 31 + vpand ymm2, ymm7, ymm2 + vpaddd ymm2, ymm0, ymm2 + vpsubd ymm0, ymm2, ymm7 + + vpsrad ymm2, ymm0, 31 + vpand ymm2, ymm7, ymm2 + vpaddd ymm0, ymm0, ymm2 + + vmovdqu YMMWORD PTR [reg_p1+4*rax], ymm0 + + add rax, r11 // j+8 + cmp rax, reg_p2 + jl lazo4 + ret + + +//*********************************************************************** +// Encoding +// Operation: c [reg_p2] <- a [reg_p1] +//*********************************************************************** +.global encode_asm +encode_asm: + vmovdqu ymm6, MASK32 + vmovdqu ymm7, MASK42 + mov r9, 1024 + xor rax, rax + xor r10, r10 + mov r11, 14 + mov rcx, 8 +lazo5: + vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // a + + vpsrlq ymm1, ymm0, 18 + vpsllq ymm2, ymm0, 4 + vpand ymm0, ymm0, ymm6 + vpsrldq ymm2, ymm2, 5 + vpsrlq ymm3, ymm1, 4 + vpand ymm1, ymm1, ymm6 
+ vpand ymm2, ymm2, ymm7 + vpsrldq ymm3, ymm3, 4 + vpor ymm0, ymm0, ymm1 + vpor ymm0, ymm0, ymm2 + vpor ymm0, ymm0, ymm3 + vpermq ymm1, ymm0, 0x0e + + vmovdqu XMMWORD PTR [reg_p2+r10], xmm0 + vmovdqu XMMWORD PTR [reg_p2+r10+7], xmm1 + + add r10, r11 + add rax, rcx // j+8 + cmp rax, r9 + jl lazo5 + ret + + +//*********************************************************************** +// Decoding +// Operation: c [reg_p2] <- a [reg_p1] +//*********************************************************************** +.global decode_asm +decode_asm: + vmovdqu ymm6, MASK14_1 + vmovdqu ymm7, MASK14_2 + vmovdqu ymm8, MASK14_3 + vmovdqu ymm9, MASK14_4 + mov r9, 1024 + xor rax, rax + xor r10, r10 + mov r11, 14 + mov rcx, 8 +lazo6: + vmovdqu xmm0, XMMWORD PTR [reg_p1+r10] + vmovdqu xmm1, XMMWORD PTR [reg_p1+r10+7] + vinserti128 ymm0, ymm0, xmm1, 1 + + vpand ymm1, ymm0, ymm6 + vpand ymm2, ymm0, ymm7 + vpand ymm3, ymm0, ymm8 + vpand ymm4, ymm0, ymm9 + + vpsllq ymm2, ymm2, 18 + vpsllq ymm3, ymm3, 4 + vpslldq ymm3, ymm3, 4 + vpsrlq ymm4, ymm4, 2 + vpslldq ymm4, ymm4, 7 + + vpor ymm1, ymm1, ymm2 + vpor ymm1, ymm1, ymm3 + vpor ymm1, ymm1, ymm4 + + vmovdqu YMMWORD PTR [reg_p2+4*rax], ymm1 + + add r10, r11 + add rax, rcx // j+8 + cmp rax, r9 + jl lazo6 ret \ No newline at end of file diff --git a/dap-sdk/crypto/src/msrln/kex.c b/dap-sdk/crypto/src/msrln/kex.c index e2c6b317ec..99a8db3962 100755 --- a/dap-sdk/crypto/src/msrln/kex.c +++ b/dap-sdk/crypto/src/msrln/kex.c @@ -1,645 +1,642 @@ -#include "msrln_priv.h" -#if (OS_TARGET == OS_MACOS) - #include <stdio.h> -#else - #include <malloc.h> -#endif - -#include "KeccakHash.h" -#include "SimpleFIPS202.h" - - -// N^-1 * prime_scale^-8 -const int32_t MSRLN_Ninv8_ntt1024_12289 = 8350; -// N^-1 * prime_scale^-7 * omegainv_rev_ntt1024_12289[1] -const int32_t MSRLN_omegainv7N_rev_ntt1024_12289 = 795; -// N^-1 * prime_scale^-11 -const int32_t MSRLN_Ninv11_ntt1024_12289 = 2585; -// N^-1 * prime_scale^-10 * omegainv_rev_ntt1024_12289[1] -const int32_t 
MSRLN_omegainv10N_rev_ntt1024_12289 = 10953; - - -// Index-reversed matrices containing powers of psi (psi_rev_nttxxx_yyy) and inverse powers of omega (omegainv_rev_nttxxx_yyy), -// where xxx is parameter N and yyy is the prime q. - -const int32_t MSRLN_psi_rev_ntt1024_12289[1024] = { -8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201, -875, 3780, 1607, 4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859, -7188, 1067, 2401, 11847, 390, 11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626, -3636, 7351, 9585, 6998, 160, 3149, 4437, 12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042, -3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178, 1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563, -7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790, 2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266, -5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810, 1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934, -8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863, 10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842, -11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893, 7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541, -11336, 3434, 3529, 2908, 12071, 2361, 1843, 3030, 8174, 6147, 9842, 8326, 576, 10335, 10238, 10484, 9407, 11836, 5908, 418, 3772, 7515, 5429, 7552, 10996, 
12133, 2767, 3969, -8298, 6413, 10008, 2031, 5333, 10800, 9789, 10706, 5942, 1263, 49, 5915, 10806, 11939, 10777, 1815, 5383, 3202, 4493, 6920, 10232, 1975, 8532, 2925, 347, 4754, 1858, 11863, -8974, 9551, 5868, 9634, 5735,11566, 12115, 10596, 3009, 6190, 11994, 6523, 652, 3762, 9370, 4016, 4077, 8561, 4049, 5990, 11130, 11143, 948, 325, 1404, 6992, 6119, 8333, -10929, 1200, 5184, 2555, 6122, 1594, 10327, 7183, 5961, 2692, 12121, 4298, 3329, 5919, 4433, 8455,7032, 1747, 3123, 3054, 6803, 5782, 10723, 9341, 2503, 683, 2459, 3656, -64, 4240, 3570, 835, 6065, 4046, 11580, 10970, 3150, 10331, 4322, 2078, 1112, 4079, 11231, 441, 922, 1050, 4536, 6844, 8429, 2683, 11099, 3818, 6171, 8500, 12142, 6833, 4449, -4749, 6752, 7500, 7822, 8214, 6974, 7965, 7373, 2169, 522, 5079, 3262, 10316, 6715, 1278, 9945, 3514, 11248, 11271, 5925, 468, 3988, 382, 11973, 5339, 6843, 6196, 8579, 2033, -8291, 1922, 3879, 11035, 973, 6854, 10930, 5206, 6760, 3199, 56, 3565, 654, 1702, 10302, 5862, 6153, 5415, 8646, 11889, 10561, 7341, 6152, 7232, 4698, 8844, 4780, 10240, 4912, -1321, 12097, 7048, 2920, 3127, 4169, 11502, 3482, 11279, 5468, 5874, 11612, 6055, 8953, 52, 3174, 10966, 9523, 151, 2127, 3957, 2839, 9784, 6383, 1579, 431, 7507, 5886, 3029, -6695, 4213, 504, 11684, 2302, 8689, 9026, 4624, 6212, 11868, 4080, 6221, 8687, 1003, 8757, 241, 58, 5009, 10333, 885, 6281, 3438, 9445, 11314, 8077, 6608, 3477, 142, 1105, -8841, 343, 4538, 1908, 1208, 4727, 7078, 10423, 10125, 6873, 11573, 10179, 416, 814, 1705, 2450, 8700, 717, 9307, 1373, 8186, 2429, 10568, 10753, 7228, 11071, 438, 8774, 5993, -3278, 4209, 6877, 3449, 1136, 3708, 3238, 2926, 1826, 4489, 3171, 8024, 8611, 1928, 464, 3205, 8930, 7080, 1092, 10900, 10221, 11943, 4404, 9126, 4032, 7449, 6127, 8067, 10763, -125, 540, 8921, 8062, 612, 8051, 12229, 9572, 9089, 10754, 10029, 68, 6453, 7723, 4781, 4924, 1014, 448, 3942, 5232, 1327, 8682, 3744, 7326, 3056, 9761, 5845, 5588, 412, 7187, -3975, 4883, 3087, 6454, 2257, 7784, 5676, 1417, 8400, 
11710, 5596, 5987, 9175, 2769, 5966, 212, 6555, 11113, 5508, 11014, 1125, 4860, 10844, 1131, 4267, 6636, 2275, 9828, 5063, -4176, 3765, 1518, 8794, 4564, 10224, 5826, 3534, 3961, 4145, 10533, 506, 11034, 6505, 10897, 2674, 10077, 3338, 9013, 3511, 6811, 11111, 2776, 1165, 2575, 8881, 10347, 377, -4578, 11914, 10669, 10104, 392, 10453, 425, 9489, 193, 2231, 6197, 1038, 11366, 6204, 8122, 2894, 3654, 10975, 10545, 6599, 2455, 11951, 3947, 20, 5002, 5163, 4608, 8946, 8170, -10138, 1522, 8665, 10397, 3344, 5598, 10964, 6565, 11260, 1945, 11041, 9847, 7174, 4939, 2148, 6330, 3959, 5797, 4913, 3528, 8054, 3825, 8914, 9998, 4335, 8896, 9342, 3982, -6680, 11653, 7790, 6617, 1737, 622, 10485, 10886, 6195, 7100, 1687, 406, 12143, 5268, 9389, 12050, 994, 7735, 5464, 7383, 4670, 512, 364, 9929, 3028, 5216, 5518, 1226, 7550, -8038, 7043, 7814, 11053, 3017, 3121, 7584, 2600, 11232, 6780, 12085, 5219, 1409, 9600, 4605, 8151, 12109, 463, 8882, 8308, 10821, 9247, 10945, 9806, 2054, 6203, 6643, 3120, -6105, 8348, 8536, 6919, 8753, 11007, 8717, 9457, 2021, 9060, 4730, 3929, 10583, 3723, 845, 1936, 7, 5054, 3154, 3285, 4360, 3805, 11522, 2213, 4153, 12239, 12073, 5526, 769, -4099, 3944, 5604, 5530, 11024, 9282, 2171, 3480, 7434, 8520, 3232, 11996, 9656, 1406, 2945, 5349, 7207, 4590, 11607, 11309, 5202, 844, 7082, 4050, 8016, 9068, 9694, 8452, 7000, -5662, 567, 2941, 8619, 3808, 4987, 2373, 5135, 63, 7605, 3360, 11839, 10345, 578, 6921, 7628, 510, 5386, 2622, 7806, 5703, 10783, 9224, 11379, 5900, 4719, 11538, 3502, 5789, -10631, 5618, 826, 5043, 3090, 10891, 9951, 7596, 2293, 11872, 6151, 3469, 4443, 8871, 1555, 1802, 5103, 1891, 1223, 2334, 7878, 1590, 881, 365, 1927, 11274, 4510, 9652, 2946, -6828, 1280, 614, 10918, 12265, 7250, 6742, 9804, 11385, 2276, 11307, 2593, 879, 7899, 8071, 3454, 8531, 3795, 9021, 5776, 1849, 7766, 7988, 457, 8, 530, 9663, 7785, 11511, 3578, -7592, 10588, 3466, 8972, 9757, 3332, 139, 2046, 2940, 10808, 9332, 874, 2301, 5650, 12119, 150, 648, 8000, 9982, 
9416, 2827, 2434, 11498, 6481, 12268, 9754, 11169, 11823, 11259, -3821, 10608, 2929, 6263, 4649, 6320, 9687, 10388, 502, 5118, 8496, 6226, 10716, 8443, 7624, 6883, 9269, 6616, 8620, 5287, 944, 7519, 6125, 1882, 11249, 10254, 5410, 1251, 1790, -5275, 8449, 10447, 4113, 72, 2828, 4352, 7455, 2712, 11048, 7911, 3451, 4094, 6508, 3045, 11194, 2643, 1783, 7211, 4974, 7724, 9811, 9449, 3019, 4194, 2730, 6878, 10421, 2253, -4518, 9195, 7469, 11129, 9173, 12100, 1763, 2209, 9617, 5170, 865, 1279, 1694, 10759, 8420, 4423, 10555, 3815, 5832, 10939 -}; - - -const int32_t MSRLN_omegainv_rev_ntt1024_12289[1024] = { -8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422, -6267, 9302, 8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270, -2678, 8585, 10752, 12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957, -8779, 1630, 10163, 5407, 3186, 11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372, -10115, 2847, 4414, 9644, 4053, 7247, 9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300, -5331, 8705, 4177, 9764, 10908, 11950, 9821, 11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534, -145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548, 4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567, -6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023, 2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 
2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170, -10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255, 11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 11184, 12147, 8812, 5681, 4212, 975, 2844, 8851, 6008, 11404, -1956, 7280, 12231, 12048, 3532, 11286, 3602, 6068, 8209, 421, 6077, 7665, 3263, 3600, 9987, 605, 11785, 8076, 5594, 9260, 6403, 4782, 11858, 10710, 5906, 2505, 9450, 8332, 10162, -12138, 2766, 1323, 9115, 12237, 3336, 6234, 677, 6415, 6821, 1010, 8807, 787, 8120, 9162, 9369, 5241, 192, 10968, 7377, 2049, 7509, 3445, 7591, 5057, 6137, 4948, 1728, 400, 3643, -6874, 6136, 6427, 1987, 10587, 11635, 8724, 12233, 9090, 5529, 7083, 1359, 5435, 11316, 1254, 8410, 10367, 3998, 10256, 3710, 6093, 5446, 6950, 316, 11907, 8301, 11821, 6364, 1018, -1041, 8775, 2344, 11011, 5574, 1973, 9027, 7210, 11767, 10120, 4916, 4324, 5315, 4075, 4467, 4789, 5537, 7540, 7840, 5456, 147, 3789, 6118, 8471, 1190, 9606, 3860, 5445, 7753, 11239, -11367, 11848, 1058, 8210, 11177, 10211, 7967, 1958, 9139, 1319, 709, 8243, 6224, 11454, 8719, 8049, 12225, 8633, 9830, 11606, 9786, 2948, 1566, 6507, 5486, 9235, 9166, 10542, 5257, -3834, 7856, 6370, 8960, 7991, 168, 9597, 6328, 5106, 1962, 10695, 6167, 9734, 7105, 11089, 1360, 3956, 6170, 5297, 10885, 11964, 11341, 1146, 1159, 6299, 8240, 3728, 8212, 8273, 2919, -8527, 11637, 5766, 295, 6099, 9280, 1693, 174, 723, 6554, 2655, 6421, 2738, 3315, 426, 10431, 7535, 11942, 9364, 3757, 10314, 2057, 5369, 7796, 9087, 6906, 10474, 1512, 350, 1483, -6374, 12240, 11026, 6347, 1583, 2500, 1489, 6956, 10258, 2281, 5876, 3991, 8320, 9522, 156, 1293, 4737, 6860, 4774, 8517, 11871, 6381, 453, 2882, 1805, 2051, 1954, 11713, 3963, 2447, -6142, 4115, 9259, 10446, 9928, 218, 9381, 8760, 8855, 1350, 6457, 8474, 1734, 7866, 3869, 1530, 10595, 11010, 11424, 7119, 2672, 10080, 10526, 189, 3116, 1160, 4820, 3094, 7771, 10036, -1868, 5411, 9559, 8095, 9270, 2840, 2478, 4565, 7315, 5078, 10506, 9646, 1095, 9244, 5781, 8195, 8838, 4378, 
1241, 9577, 4834, 7937, 9461, 12217, 8176, 1842, 3840, 7014, 10499, 11038, -6879, 2035, 1040, 10407, 6164, 4770, 11345, 7002, 3669, 5673, 3020, 5406, 4665, 3846, 1573, 6063, 3793, 7171, 11787, 1901, 2602, 5969, 7640, 6026, 9360, 1681, 8468, 1030, 466, 1120, -2535, 21, 5808, 791, 9855, 9462, 2873, 2307, 4289, 11641, 12139, 170, 6639, 9988, 11415, 2957, 1481, 9349, 10243, 12150, 8957, 2532, 3317, 8823, 1701, 4697, 8711, 778, 4504, 2626, -11759, 12281, 11832, 4301, 4523, 10440, 6513, 3268, 8494, 3758, 8835, 4218, 4390, 11410, 9696, 982, 10013, 904, 2485, 5547, 5039, 24, 1371, 11675, 11009, 5461, 9343, 2637, 7779, 1015, -10362, 11924, 11408, 10699, 4411, 9955, 11066, 10398, 7186, 10487, 10734, 3418, 7846, 8820, 6138, 417, 9996, 4693, 2338, 1398, 9199, 7246, 11463, 6671, 1658, 6500, 8787, 751, 7570, -6389, 910, 3065, 1506, 6586, 4483, 9667, 6903, 11779, 4661, 5368, 11711, 1944, 450, 8929, 4684, 12226, 7154, 9916, 7302, 8481, 3670, 9348, 11722, 6627, 5289, 3837, 2595, 3221, 4273, -8239, 5207, 11445, 7087, 980, 682, 7699, 5082, 6940, 9344, 10883, 2633, 293, 9057, 3769, 4855, 8809, 10118, 3007, 1265, 6759, 6685, 8345, 8190, 11520, 6763, 216, 50, 8136, 10076, 767, -8484, 7929, 9004, 9135, 7235, 12282, 10353, 11444, 8566, 1706, 8360, 7559, 3229, 10268, 2832, 3572, 1282, 3536, 5370, 3753, 3941, 6184, 9169, 5646, 6086, 10235, 2483, 1344, 3042, 1468, -3981, 3407, 11826, 180, 4138, 7684, 2689, 10880, 7070, 204, 5509, 1057, 9689, 4705, 9168, 9272, 1236, 4475, 5246, 4251, 4739, 11063, 6771, 7073, 9261, 2360, 11925, 11777, 7619, 4906, -6825, 4554, 11295, 239, 2900, 7021, 146, 11883, 10602, 5189, 6094, 1403, 1804, 11667, 10552, 5672, 4499, 636, 5609, 8307, 2947, 3393, 7954, 2291, 3375, 8464, 4235, 8761, 7376, 6492, -8330, 5959, 10141, 7350, 5115, 2442, 1248, 10344, 1029, 5724, 1325, 6691, 8945, 1892, 3624, 10767, 2151, 4119, 3343, 7681, 7126, 7287, 12269, 8342, 338, 9834, 5690, 1744, 1314, 8635, -9395, 4167, 6085, 923, 11251, 6092, 10058, 12096, 2800, 11864, 1836, 11897, 2185, 1620, 
375, 7711, 11912, 1942, 3408, 9714, 11124, 9513, 1178, 5478, 8778, 3276, 8951, 2212, 9615, 1392, -5784, 1255, 11783, 1756, 8144, 8328, 8755, 6463, 2065, 7725, 3495, 10771, 8524, 8113, 7226, 2461, 10014, 5653, 8022, 11158, 1445, 7429, 11164, 1275, 6781, 1176, 5734, 12077, 6323, 9520, -3114, 6302, 6693, 579, 3889, 10872, 6613, 4505, 10032, 5835, 9202, 7406, 8314, 5102, 11877, 6701, 6444, 2528, 9233, 4963, 8545, 3607, 10962, 7057, 8347, 11841, 11275, 7365, 7508, 4566, -5836, 12221, 2260, 1535, 3200, 2717, 60, 4238, 11677, 4227, 3368, 11749, 12164, 1526, 4222, 6162, 4840, 8257, 3163, 7885, 346, 2068, 1389, 11197, 5209, 3359, 9084, 11825, 10361, 3678, -4265, 9118, 7800, 10463, 9363, 9051, 8581, 11153, 8840, 5412, 8080, 9011, 6296, 3515, 11851, 1218, 5061, 1536, 1721, 9860, 4103, 10916, 2982, 11572, 3589, 9839, 10584, 11475, 11873, -2110, 716, 5416, 2164, 1866, 5211, 7562, 11081, 10381, 7751, 11946, 3448 -}; - - -const int32_t MSRLN_psi_rev_ntt512_12289[512] = { -8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201, 875, 3780, 1607, -4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859, 7188, 1067, 2401, 11847, 390, -11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626, 3636, 7351, 9585, 6998, 160, 3149, 4437, -12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042, 3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178, -1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563, 7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790, -2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 
7266, 5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810, -1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934, 8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863, -10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842, 11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893, -7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541, 11336, 8855, 8760, 9381, 218, 9928, 10446, 9259, 4115, 6142, 2447, 3963, 11713, 1954, 2051, 1805, 2882, 453, 6381, 11871, 8517, -4774, 6860, 4737, 1293, 156, 9522, 8320, 3991, 5876, 2281, 10258, 6956, 1489, 2500, 1583, 6347, 11026, 12240, 6374, 1483, 350, 1512, 10474, 6906, 9087, 7796, 5369, 2057, 10314, 3757, -9364, 11942, 7535, 10431, 426, 3315, 2738, 6421, 2655, 6554, 723, 174, 1693, 9280, 6099, 295, 5766, 11637, 8527, 2919, 8273, 8212, 3728, 8240, 6299, 1159, 1146, 11341, 11964, 10885, 5297, -6170, 3956, 1360, 11089, 7105, 9734, 6167, 10695, 1962, 5106, 6328, 9597, 168, 7991, 8960, 6370, 7856, 3834, 5257, 10542, 9166, 9235, 5486, 6507, 1566, 2948, 9786, 11606, 9830, 8633, -12225, 8049, 8719, 11454, 6224, 8243, 709, 1319, 9139, 1958, 7967, 10211, 11177, 8210, 1058, 11848, 11367, 11239, 7753, 5445, 3860, 9606, 1190, 8471, 6118, 3789, 147, 5456, 7840, 7540, -5537, 4789, 4467, 4075, 5315, 4324, 4916, 10120, 11767, 7210, 9027, 1973, 5574, 11011, 2344, 8775, 1041, 1018, 6364, 11821, 8301, 11907, 316, 6950, 5446, 6093, 3710, 10256, 3998, 10367, -8410, 1254, 11316, 5435, 1359, 7083, 5529, 9090, 12233, 8724, 11635, 10587, 1987, 6427, 6136, 6874, 3643, 400, 1728, 4948, 6137, 5057, 7591, 3445, 7509, 2049, 7377, 10968, 192, 5241, 9369, -9162, 8120, 787, 8807, 1010, 6821, 6415, 677, 6234, 3336, 12237, 9115, 1323, 2766, 12138, 10162, 8332, 9450, 2505, 5906, 10710, 11858, 4782, 6403, 9260, 5594, 8076, 11785, 605, 9987, 3600, -3263, 7665, 6077, 421, 8209, 6068, 
3602, 11286, 3532, 12048, 12231, 7280, 1956, 11404, 6008, 8851, 2844, 975, 4212, 5681, 8812, 12147, 11184 -}; - - -const int32_t MSRLN_omegainv_rev_ntt512_12289[512] = { -8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422, 6267, 9302, -8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270, 2678, 8585, 10752, -12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957, 8779, 1630, 10163, 5407, 3186, -11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372, 10115, 2847, 4414, 9644, 4053, 7247, -9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300, 5331, 8705, 4177, 9764, 10908, 11950, 9821, -11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534, 145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548, -4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567, 6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023, -2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170, 10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255, -11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 1105, 142, 3477, 6608, 8077, 11314, 9445, 3438, 6281, 885, 10333, 5009, 58, 241, 8757, 1003, 8687, 6221, 4080, 11868, 6212, 4624, -9026, 8689, 2302, 11684, 504, 4213, 6695, 3029, 5886, 7507, 431, 1579, 6383, 9784, 2839, 3957, 2127, 151, 9523, 10966, 3174, 
52, 8953, 6055, 11612, 5874, 5468, 11279, 3482, 11502, 4169, -3127, 2920, 7048, 12097, 1321, 4912, 10240, 4780, 8844, 4698, 7232, 6152, 7341, 10561, 11889, 8646, 5415, 6153, 5862, 10302, 1702, 654, 3565, 56, 3199, 6760, 5206, 10930, 6854, 973, 11035, -3879, 1922, 8291, 2033, 8579, 6196, 6843, 5339, 11973, 382, 3988, 468, 5925, 11271, 11248, 3514, 9945, 1278, 6715, 10316, 3262, 5079, 522, 2169, 7373, 7965, 6974, 8214, 7822, 7500, 6752, -4749, 4449, 6833, 12142, 8500, 6171, 3818, 11099, 2683, 8429, 6844, 4536, 1050, 922, 441, 11231, 4079, 1112, 2078, 4322, 10331, 3150, 10970, 11580, 4046, 6065, 835, 3570, 4240, 64, 3656, -2459, 683, 2503, 9341, 10723, 5782, 6803, 3054, 3123, 1747, 7032, 8455, 4433, 5919, 3329, 4298, 12121, 2692, 5961, 7183, 10327, 1594, 6122, 2555, 5184, 1200, 10929, 8333, 6119, 6992, 1404, -325, 948, 11143, 11130, 5990, 4049, 8561, 4077, 4016, 9370, 3762, 652, 6523, 11994, 6190, 3009, 10596, 12115, 11566, 5735, 9634, 5868, 9551, 8974, 11863, 1858, 4754, 347, 2925, 8532, 1975, -10232, 6920, 4493, 3202, 5383, 1815, 10777, 11939, 10806, 5915, 49, 1263, 5942, 10706, 9789, 10800, 5333, 2031, 10008, 6413, 8298, 3969, 2767, 12133, 10996, 7552, 5429, 7515, 3772, 418, 5908, -11836, 9407, 10484, 10238, 10335, 576, 8326, 9842, 6147, 8174, 3030, 1843, 2361, 12071, 2908, 3529, 3434 -}; - -// import external code -#ifdef RLWE_ASM_AVX2 - #include "AMD64/consts.c" - #include "AMD64/ntt_x64.c" -#else - #include "generic/ntt.c" -#endif - -__inline void clear_words(void* mem, digit_t nwords) -{ // Clear digits from memory. "nwords" indicates the number of digits to be zeroed. - // This function uses the volatile type qualifier to inform the compiler not to optimize out the memory clearing. 
- unsigned int i; - volatile digit_t *v = mem; - - for (i = 0; i < nwords; i++) { - v[i] = 0; - } -} - - -CRYPTO_MSRLN_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction) -{ // Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction. - - pLatticeCrypto->RandomBytesFunction = RandomBytesFunction; - pLatticeCrypto->ExtendableOutputFunction = ExtendableOutputFunction; - pLatticeCrypto->StreamOutputFunction = StreamOutputFunction; - - return CRYPTO_MSRLN_SUCCESS; -} - - -PLatticeCryptoStruct LatticeCrypto_allocate() -{ // Dynamic allocation of memory for LatticeCrypto structure. It should be called before initialization with LatticeCrypto_initialize(). - // Returns NULL on error. - PLatticeCryptoStruct LatticeCrypto = NULL; - - LatticeCrypto = (PLatticeCryptoStruct)calloc(1, sizeof(LatticeCryptoStruct)); - - if (LatticeCrypto == NULL) { - return NULL; - } - return LatticeCrypto; -} - - -const char* LatticeCrypto_get_error_message(CRYPTO_MSRLN_STATUS Status) -{ // Output error/success message for a given CRYPTO_STATUS - struct error_mapping { - unsigned int index; - char* string; - } mapping[CRYPTO_STATUS_TYPE_SIZE] = { - {CRYPTO_MSRLN_SUCCESS, CRYPTO_MSG_SUCCESS}, - {CRYPTO_MSRLN_ERROR, CRYPTO_MSG_ERROR}, - {CRYPTO_MSRLN_ERROR_DURING_TEST, CRYPTO_MSG_ERROR_DURING_TEST}, - {CRYPTO_MSRLN_ERROR_UNKNOWN, CRYPTO_MSG_ERROR_UNKNOWN}, - {CRYPTO_MSRLN_ERROR_NOT_IMPLEMENTED, CRYPTO_MSG_ERROR_NOT_IMPLEMENTED}, - {CRYPTO_MSRLN_ERROR_NO_MEMORY, CRYPTO_MSG_ERROR_NO_MEMORY}, - {CRYPTO_MSRLN_ERROR_INVALID_PARAMETER, CRYPTO_MSG_ERROR_INVALID_PARAMETER}, - {CRYPTO_MSRLN_ERROR_SHARED_KEY, CRYPTO_MSG_ERROR_SHARED_KEY}, - {CRYPTO_MSRLN_ERROR_TOO_MANY_ITERATIONS, CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS} - }; - - if (Status >= CRYPTO_STATUS_TYPE_SIZE || mapping[Status].string == NULL) { - 
return "Unrecognized CRYPTO_STATUS"; - } else { - return mapping[Status].string; - } -}; - - -void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m) -{ // Alice's message encoding - unsigned int i = 0, j; - -#if defined(GENERIC_IMPLEMENTATION) - for (j = 0; j < 1024; j += 4) { - m[i] = (unsigned char)(pk[j] & 0xFF); - m[i+1] = (unsigned char)((pk[j] >> 8) | ((pk[j+1] & 0x03) << 6)); - m[i+2] = (unsigned char)((pk[j+1] >> 2) & 0xFF); - m[i+3] = (unsigned char)((pk[j+1] >> 10) | ((pk[j+2] & 0x0F) << 4)); - m[i+4] = (unsigned char)((pk[j+2] >> 4) & 0xFF); - m[i+5] = (unsigned char)((pk[j+2] >> 12) | ((pk[j+3] & 0x3F) << 2)); - m[i+6] = (unsigned char)(pk[j+3] >> 6); - i += 7; - } - -#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) - encode_asm(pk, m); - i = 1792; -#endif - - for (j = 0; j < 32; j++) { - m[i+j] = seed[j]; - } -} - - -void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed) -{ // Alice's message decoding - unsigned int i = 0, j; - -#if defined(GENERIC_IMPLEMENTATION) - for (j = 0; j < 1024; j += 4) { - pk[j] = ((uint32_t)m[i] | (((uint32_t)m[i+1] & 0x3F) << 8)); - pk[j+1] = (((uint32_t)m[i+1] >> 6) | ((uint32_t)m[i+2] << 2) | (((uint32_t)m[i+3] & 0x0F) << 10)); - pk[j+2] = (((uint32_t)m[i+3] >> 4) | ((uint32_t)m[i+4] << 4) | (((uint32_t)m[i+5] & 0x03) << 12)); - pk[j+3] = (((uint32_t)m[i+5] >> 2) | ((uint32_t)m[i+6] << 6)); - i += 7; - } - -#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) - decode_asm(m, pk); - i = 1792; -#endif - - for (j = 0; j < 32; j++) { - seed[j] = m[i+j]; - } -} - - -void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m) -{ // Bob's message encoding - unsigned int i = 0, j; - -#if defined(GENERIC_IMPLEMENTATION) - for (j = 0; j < 1024; j += 4) { - m[i] = (unsigned char)(pk[j] & 0xFF); - m[i+1] = (unsigned char)((pk[j] >> 8) | ((pk[j+1] & 0x03) << 6)); - m[i+2] = (unsigned char)((pk[j+1] >> 2) & 0xFF); - m[i+3] = (unsigned char)((pk[j+1] >> 10) | 
((pk[j+2] & 0x0F) << 4)); - m[i+4] = (unsigned char)((pk[j+2] >> 4) & 0xFF); - m[i+5] = (unsigned char)((pk[j+2] >> 12) | ((pk[j+3] & 0x3F) << 2)); - m[i+6] = (unsigned char)(pk[j+3] >> 6); - i += 7; - } - -#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) - encode_asm(pk, m); -#endif - - i = 0; - for (j = 0; j < 1024/4; j++) { - m[1792+j] = (unsigned char)(rvec[i] | (rvec[i+1] << 2) | (rvec[i+2] << 4) | (rvec[i+3] << 6)); - i += 4; - } -} - - -void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec) -{ // Bob's message decoding - unsigned int i = 0, j; - -#if defined(GENERIC_IMPLEMENTATION) - for (j = 0; j < 1024; j += 4) { - pk[j] = ((uint32_t)m[i] | (((uint32_t)m[i+1] & 0x3F) << 8)); - pk[j+1] = (((uint32_t)m[i+1] >> 6) | ((uint32_t)m[i+2] << 2) | (((uint32_t)m[i+3] & 0x0F) << 10)); - pk[j+2] = (((uint32_t)m[i+3] >> 4) | ((uint32_t)m[i+4] << 4) | (((uint32_t)m[i+5] & 0x03) << 12)); - pk[j+3] = (((uint32_t)m[i+5] >> 2) | ((uint32_t)m[i+6] << 6)); - i += 7; - } - -#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) - decode_asm(m, pk); - i = 1792; -#endif - - i = 0; - for (j = 0; j < 1024/4; j++) { - rvec[i] = (uint32_t)(m[1792+j] & 0x03); - rvec[i+1] = (uint32_t)((m[1792+j] >> 2) & 0x03); - rvec[i+2] = (uint32_t)((m[1792+j] >> 4) & 0x03); - rvec[i+3] = (uint32_t)(m[1792+j] >> 6); - i += 4; - } -} - - -static __inline uint32_t Abs(int32_t value) -{ // Compute absolute value - uint32_t mask; - - mask = (uint32_t)(value >> 31); - return ((mask ^ value) - mask); -} - - -CRYPTO_MSRLN_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction) -{ // Reconciliation helper - (void)seed; (void)nonce; (void)StreamOutputFunction; - unsigned int i, j, norm; - unsigned char bit, random_bits[32]; - uint32_t v0[4], v1[4]; - - randombytes( random_bits, 32); - CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_SUCCESS; - -#if defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) - 
helprec_asm(x, rvec, random_bits); -#else - - for (i = 0; i < 256; i++) { - bit = 1 & (random_bits[i >> 3] >> (i & 0x07)); - rvec[i] = (x[i] << 1) - bit; - rvec[i+256] = (x[i+256] << 1) - bit; - rvec[i+512] = (x[i+512] << 1) - bit; - rvec[i+768] = (x[i+768] << 1) - bit; - - norm = 0; - v0[0] = 4; v0[1] = 4; v0[2] = 4; v0[3] = 4; - v1[0] = 3; v1[1] = 3; v1[2] = 3; v1[3] = 3; - for (j = 0; j < 4; j++) { - v0[j] -= (rvec[i+256*j] - PARAMETER_Q4 ) >> 31; - v0[j] -= (rvec[i+256*j] - PARAMETER_3Q4) >> 31; - v0[j] -= (rvec[i+256*j] - PARAMETER_5Q4) >> 31; - v0[j] -= (rvec[i+256*j] - PARAMETER_7Q4) >> 31; - v1[j] -= (rvec[i+256*j] - PARAMETER_Q2 ) >> 31; - v1[j] -= (rvec[i+256*j] - PARAMETER_Q ) >> 31; - v1[j] -= (rvec[i+256*j] - PARAMETER_3Q2) >> 31; - norm += Abs(2*rvec[i+256*j] - PARAMETER_Q*v0[j]); - } - - norm = (uint32_t)((int32_t)(norm - PARAMETER_Q) >> 31); // If norm < q then norm = 0xff...ff, else norm = 0 - v0[0] = (norm & (v0[0] ^ v1[0])) ^ v1[0]; - v0[1] = (norm & (v0[1] ^ v1[1])) ^ v1[1]; - v0[2] = (norm & (v0[2] ^ v1[2])) ^ v1[2]; - v0[3] = (norm & (v0[3] ^ v1[3])) ^ v1[3]; - rvec[i] = (v0[0] - v0[3]) & 0x03; - rvec[i+256] = (v0[1] - v0[3]) & 0x03; - rvec[i+512] = (v0[2] - v0[3]) & 0x03; - rvec[i+768] = ((v0[3] << 1) + (1 & ~norm)) & 0x03; - } -#endif - - return Status; -} - - -static __inline uint32_t LDDecode(int32_t* t) -{ // Low-density decoding - unsigned int i, norm = 0; - uint32_t mask1, mask2, value; - int32_t cneg = -8*PARAMETER_Q; - - for (i = 0; i < 4; i++) { - mask1 = t[i] >> 31; // If t[i] < 0 then mask2 = 0xff...ff, else mask2 = 0 - mask2 = (4*PARAMETER_Q - (int32_t)Abs(t[i])) >> 31; // If 4*PARAMETER_Q > Abs(t[i]) then mask2 = 0, else mask2 = 0xff...ff - - value = ((mask1 & (8*PARAMETER_Q ^ cneg)) ^ cneg); - norm += Abs(t[i] + (mask2 & value)); - } - - return ((8*PARAMETER_Q - norm) >> 31) ^ 1; // If norm < PARAMETER_Q then return 1, else return 0 -} - - -void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key) -{ // 
Reconciliation - -#if defined(GENERIC_IMPLEMENTATION) - unsigned int i; - uint32_t t[4]; - - for (i = 0; i < 32; i++) { - key[i] = 0; - } - for (i = 0; i < 256; i++) { - t[0] = 8*x[i] - (2*rvec[i] + rvec[i+768]) * PARAMETER_Q; - t[1] = 8*x[i+256] - (2*rvec[i+256] + rvec[i+768]) * PARAMETER_Q; - t[2] = 8*x[i+512] - (2*rvec[i+512] + rvec[i+768]) * PARAMETER_Q; - t[3] = 8*x[i+768] - (rvec[i+768]) * PARAMETER_Q; - - key[i >> 3] |= (unsigned char)LDDecode((int32_t*)t) << (i & 0x07); - } - -#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) - rec_asm(x, rvec, key); -#endif -} - - -CRYPTO_MSRLN_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction) -{ // Error sampling - (void) seed; (void) nonce; (void) StreamOutputFunction; - unsigned char stream[3 * PARAMETER_N]; - uint32_t *pstream = (uint32_t *) &stream; - uint32_t acc1, acc2, temp; - uint8_t *pacc1 = (uint8_t *) &acc1, *pacc2 = (uint8_t *) &acc2; - unsigned int i, j; - - randombytes( stream, 3 * PARAMETER_N); - -#if defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) - error_sampling_asm(stream, e); -#else - for (i = 0; i < PARAMETER_N / 4; i++) { - acc1 = 0; - acc2 = 0; - for (j = 0; j < 8; j++) { - acc1 += (pstream[i] >> j) & 0x01010101; - acc2 += (pstream[i + PARAMETER_N / 4] >> j) & 0x01010101; - } - for (j = 0; j < 4; j++) { - temp = pstream[i + 2 * PARAMETER_N / 4] >> j; - acc1 += temp & 0x01010101; - acc2 += (temp >> 4) & 0x01010101; - } - e[2 * i] = pacc1[0] - pacc1[1]; - e[2 * i + 1] = pacc1[2] - pacc1[3]; - e[2 * i + PARAMETER_N / 2] = pacc2[0] - pacc2[1]; - e[2 * i + PARAMETER_N / 2 + 1] = pacc2[2] - pacc2[3]; - } -#endif - - return CRYPTO_MSRLN_SUCCESS; -} - - -CRYPTO_MSRLN_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction) -{ // Generation of parameter a - (void)ExtendableOutputFunction; - unsigned int pos = 0, ctr = 0; - uint16_t val; - unsigned int nblocks = 16; - uint8_t buf[SHAKE128_RATE 
* 16]; // was * nblocks, but VS doesn't like this buf init - //Keccak_HashInstance ks; - - uint64_t state[SHA3_STATESIZE] = {0}; - shake128_absorb(state, seed, SEED_BYTES); - shake128_squeezeblocks((unsigned char *) buf, nblocks, state); - - /*#ifdef _WIN32 - SHAKE128_InitAbsorb( &ks, seed, SEED_BYTES ); - KECCAK_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 ); - #else - Keccak_HashInitialize_SHAKE128(&ks); - Keccak_HashUpdate( &ks, seed, SEED_BYTES * 8 ); - Keccak_HashFinal( &ks, seed ); - Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 ); - //#endif - */ - while (ctr < PARAMETER_N) { - val = (buf[pos] | ((uint16_t) buf[pos + 1] << 8)) & 0x3fff; - if (val < PARAMETER_Q) { - a[ctr++] = val; - } - pos += 2; - if (pos > SHAKE128_RATE * nblocks - 2) { - nblocks = 1; - shake128_squeezeblocks((unsigned char *) buf, nblocks, state); -// Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 ); - pos = 0; - } - } - - return CRYPTO_MSRLN_SUCCESS; -} - - -CRYPTO_MSRLN_STATUS MSRLN_KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto) -{ // Alice's key generation - // It produces a private key SecretKeyA and computes the public key PublicKeyA. - // Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) - // the public key PublicKeyA that occupies 1824 bytes - // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). 
- uint32_t a[PARAMETER_N]; - int32_t e[PARAMETER_N]; - unsigned char seed[SEED_BYTES]; - unsigned char error_seed[ERROR_SEED_BYTES]; - CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_ERROR_UNKNOWN; - - Status = randombytes( seed, SEED_BYTES); - - if (Status != CRYPTO_MSRLN_SUCCESS) { - return Status; - } - - Status = generate_a(a, seed, pLatticeCrypto->ExtendableOutputFunction); - if (Status != CRYPTO_MSRLN_SUCCESS) { - goto cleanup; - } - - Status = get_error(SecretKeyA, error_seed, 0, pLatticeCrypto->StreamOutputFunction); - if (Status != CRYPTO_MSRLN_SUCCESS) { - goto cleanup; - } - Status = get_error(e, error_seed, 1, pLatticeCrypto->StreamOutputFunction); - if (Status != CRYPTO_MSRLN_SUCCESS) { - goto cleanup; - } - NTT_CT_std2rev_12289(SecretKeyA, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N); - NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N); - smul(e, 3, PARAMETER_N); - - pmuladd((int32_t*)a, SecretKeyA, e, (int32_t*)a, PARAMETER_N); - correction((int32_t*)a, PARAMETER_Q, PARAMETER_N); - encode_A(a, seed, PublicKeyA); - -cleanup: - clear_words((void*)e, NBYTES_TO_NWORDS(4*PARAMETER_N)); - clear_words((void*)error_seed, NBYTES_TO_NWORDS(ERROR_SEED_BYTES)); - - return Status; -} - - -CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto) -{ // Bob's key generation and shared secret computation - // It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes - // the shared secret SharedSecretB. - // Input: Alice's public key PublicKeyA that consists of 1824 bytes - // Outputs: the public key PublicKeyB that occupies 2048 bytes. - // the 256-bit shared secret SharedSecretB. - // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). 
- uint32_t pk_A[PARAMETER_N], a[PARAMETER_N], v[PARAMETER_N], r[PARAMETER_N]; - int32_t sk_B[PARAMETER_N], e[PARAMETER_N]; - unsigned char seed[SEED_BYTES], error_seed[ERROR_SEED_BYTES]; - CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_ERROR_UNKNOWN; - - decode_A(PublicKeyA, pk_A, seed); - - Status = generate_a(a, seed, pLatticeCrypto->ExtendableOutputFunction); - if (Status != CRYPTO_MSRLN_SUCCESS) { - goto cleanup; - } - - Status = get_error(sk_B, error_seed, 0, pLatticeCrypto->StreamOutputFunction); - if (Status != CRYPTO_MSRLN_SUCCESS) { - goto cleanup; - } - Status = get_error(e, error_seed, 1, pLatticeCrypto->StreamOutputFunction); - if (Status != CRYPTO_MSRLN_SUCCESS) { - goto cleanup; - } - NTT_CT_std2rev_12289(sk_B, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N); - NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N); - smul(e, 3, PARAMETER_N); - - pmuladd((int32_t*)a, sk_B, e, (int32_t*)a, PARAMETER_N); - correction((int32_t*)a, PARAMETER_Q, PARAMETER_N); - - Status = get_error(e, error_seed, 2, pLatticeCrypto->StreamOutputFunction); - if (Status != CRYPTO_MSRLN_SUCCESS) { - goto cleanup; - } - NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N); - smul(e, 81, PARAMETER_N); - - pmuladd((int32_t*)pk_A, sk_B, e, (int32_t*)v, PARAMETER_N); - INTT_GS_rev2std_12289((int32_t*)v, MSRLN_omegainv_rev_ntt1024_12289, MSRLN_omegainv10N_rev_ntt1024_12289, MSRLN_Ninv11_ntt1024_12289, PARAMETER_N); - two_reduce12289((int32_t*)v, PARAMETER_N); -#if defined(GENERIC_IMPLEMENTATION) - correction((int32_t*)v, PARAMETER_Q, PARAMETER_N); -#endif - - Status = HelpRec(v, r, error_seed, 3, pLatticeCrypto->StreamOutputFunction); - if (Status != CRYPTO_MSRLN_SUCCESS) { - goto cleanup; - } - Rec(v, r, SharedSecretB); - encode_B(a, r, PublicKeyB); - -cleanup: - clear_words((void*)sk_B, NBYTES_TO_NWORDS(4*PARAMETER_N)); - clear_words((void*)e, NBYTES_TO_NWORDS(4*PARAMETER_N)); - clear_words((void*)error_seed, NBYTES_TO_NWORDS(ERROR_SEED_BYTES)); - 
clear_words((void*)a, NBYTES_TO_NWORDS(4*PARAMETER_N)); - clear_words((void*)v, NBYTES_TO_NWORDS(4*PARAMETER_N)); - clear_words((void*)r, NBYTES_TO_NWORDS(4*PARAMETER_N)); - - return Status; -} - - -CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA) -{ // Alice's shared secret computation - // It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA. - // Inputs: Bob's public key PublicKeyB that consists of 2048 bytes - // the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) - // Output: the 256-bit shared secret SharedSecretA. - // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). - uint32_t u[PARAMETER_N], r[PARAMETER_N]; - CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_SUCCESS; - - decode_B(PublicKeyB, u, r); - - pmul(SecretKeyA, (int32_t*)u, (int32_t*)u, PARAMETER_N); - INTT_GS_rev2std_12289((int32_t*)u, MSRLN_omegainv_rev_ntt1024_12289, MSRLN_omegainv10N_rev_ntt1024_12289, MSRLN_Ninv11_ntt1024_12289, PARAMETER_N); - two_reduce12289((int32_t*)u, PARAMETER_N); -#if defined(GENERIC_IMPLEMENTATION) - correction((int32_t*)u, PARAMETER_Q, PARAMETER_N); -#endif - - Rec(u, r, SharedSecretA); - -// Cleanup - clear_words((void*)u, NBYTES_TO_NWORDS(4*PARAMETER_N)); - clear_words((void*)r, NBYTES_TO_NWORDS(4*PARAMETER_N)); - - return Status; -} +#include <stdio.h> +#include <stdlib.h> +#include "msrln_priv.h" + +#include "KeccakHash.h" +#include "SimpleFIPS202.h" + + +// N^-1 * prime_scale^-8 +const int32_t MSRLN_Ninv8_ntt1024_12289 = 8350; +// N^-1 * prime_scale^-7 * omegainv_rev_ntt1024_12289[1] +const int32_t MSRLN_omegainv7N_rev_ntt1024_12289 = 795; +// N^-1 * prime_scale^-11 +const int32_t MSRLN_Ninv11_ntt1024_12289 = 2585; +// N^-1 * prime_scale^-10 * omegainv_rev_ntt1024_12289[1] +const int32_t MSRLN_omegainv10N_rev_ntt1024_12289 = 10953; + + +// Index-reversed matrices 
containing powers of psi (psi_rev_nttxxx_yyy) and inverse powers of omega (omegainv_rev_nttxxx_yyy), +// where xxx is parameter N and yyy is the prime q. + +const int32_t MSRLN_psi_rev_ntt1024_12289[1024] = { +8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201, +875, 3780, 1607, 4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859, +7188, 1067, 2401, 11847, 390, 11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626, +3636, 7351, 9585, 6998, 160, 3149, 4437, 12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042, +3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178, 1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563, +7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790, 2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266, +5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 4043, 7143, 10810, 1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934, +8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863, 10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842, +11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893, 7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541, +11336, 3434, 3529, 2908, 12071, 2361, 1843, 3030, 8174, 6147, 9842, 8326, 576, 10335, 10238, 10484, 9407, 11836, 5908, 418, 3772, 7515, 5429, 7552, 10996, 12133, 2767, 3969, +8298, 6413, 10008, 2031, 5333, 10800, 9789, 10706, 5942, 1263, 
49, 5915, 10806, 11939, 10777, 1815, 5383, 3202, 4493, 6920, 10232, 1975, 8532, 2925, 347, 4754, 1858, 11863, +8974, 9551, 5868, 9634, 5735,11566, 12115, 10596, 3009, 6190, 11994, 6523, 652, 3762, 9370, 4016, 4077, 8561, 4049, 5990, 11130, 11143, 948, 325, 1404, 6992, 6119, 8333, +10929, 1200, 5184, 2555, 6122, 1594, 10327, 7183, 5961, 2692, 12121, 4298, 3329, 5919, 4433, 8455,7032, 1747, 3123, 3054, 6803, 5782, 10723, 9341, 2503, 683, 2459, 3656, +64, 4240, 3570, 835, 6065, 4046, 11580, 10970, 3150, 10331, 4322, 2078, 1112, 4079, 11231, 441, 922, 1050, 4536, 6844, 8429, 2683, 11099, 3818, 6171, 8500, 12142, 6833, 4449, +4749, 6752, 7500, 7822, 8214, 6974, 7965, 7373, 2169, 522, 5079, 3262, 10316, 6715, 1278, 9945, 3514, 11248, 11271, 5925, 468, 3988, 382, 11973, 5339, 6843, 6196, 8579, 2033, +8291, 1922, 3879, 11035, 973, 6854, 10930, 5206, 6760, 3199, 56, 3565, 654, 1702, 10302, 5862, 6153, 5415, 8646, 11889, 10561, 7341, 6152, 7232, 4698, 8844, 4780, 10240, 4912, +1321, 12097, 7048, 2920, 3127, 4169, 11502, 3482, 11279, 5468, 5874, 11612, 6055, 8953, 52, 3174, 10966, 9523, 151, 2127, 3957, 2839, 9784, 6383, 1579, 431, 7507, 5886, 3029, +6695, 4213, 504, 11684, 2302, 8689, 9026, 4624, 6212, 11868, 4080, 6221, 8687, 1003, 8757, 241, 58, 5009, 10333, 885, 6281, 3438, 9445, 11314, 8077, 6608, 3477, 142, 1105, +8841, 343, 4538, 1908, 1208, 4727, 7078, 10423, 10125, 6873, 11573, 10179, 416, 814, 1705, 2450, 8700, 717, 9307, 1373, 8186, 2429, 10568, 10753, 7228, 11071, 438, 8774, 5993, +3278, 4209, 6877, 3449, 1136, 3708, 3238, 2926, 1826, 4489, 3171, 8024, 8611, 1928, 464, 3205, 8930, 7080, 1092, 10900, 10221, 11943, 4404, 9126, 4032, 7449, 6127, 8067, 10763, +125, 540, 8921, 8062, 612, 8051, 12229, 9572, 9089, 10754, 10029, 68, 6453, 7723, 4781, 4924, 1014, 448, 3942, 5232, 1327, 8682, 3744, 7326, 3056, 9761, 5845, 5588, 412, 7187, +3975, 4883, 3087, 6454, 2257, 7784, 5676, 1417, 8400, 11710, 5596, 5987, 9175, 2769, 5966, 212, 6555, 11113, 5508, 11014, 1125, 4860, 
10844, 1131, 4267, 6636, 2275, 9828, 5063, +4176, 3765, 1518, 8794, 4564, 10224, 5826, 3534, 3961, 4145, 10533, 506, 11034, 6505, 10897, 2674, 10077, 3338, 9013, 3511, 6811, 11111, 2776, 1165, 2575, 8881, 10347, 377, +4578, 11914, 10669, 10104, 392, 10453, 425, 9489, 193, 2231, 6197, 1038, 11366, 6204, 8122, 2894, 3654, 10975, 10545, 6599, 2455, 11951, 3947, 20, 5002, 5163, 4608, 8946, 8170, +10138, 1522, 8665, 10397, 3344, 5598, 10964, 6565, 11260, 1945, 11041, 9847, 7174, 4939, 2148, 6330, 3959, 5797, 4913, 3528, 8054, 3825, 8914, 9998, 4335, 8896, 9342, 3982, +6680, 11653, 7790, 6617, 1737, 622, 10485, 10886, 6195, 7100, 1687, 406, 12143, 5268, 9389, 12050, 994, 7735, 5464, 7383, 4670, 512, 364, 9929, 3028, 5216, 5518, 1226, 7550, +8038, 7043, 7814, 11053, 3017, 3121, 7584, 2600, 11232, 6780, 12085, 5219, 1409, 9600, 4605, 8151, 12109, 463, 8882, 8308, 10821, 9247, 10945, 9806, 2054, 6203, 6643, 3120, +6105, 8348, 8536, 6919, 8753, 11007, 8717, 9457, 2021, 9060, 4730, 3929, 10583, 3723, 845, 1936, 7, 5054, 3154, 3285, 4360, 3805, 11522, 2213, 4153, 12239, 12073, 5526, 769, +4099, 3944, 5604, 5530, 11024, 9282, 2171, 3480, 7434, 8520, 3232, 11996, 9656, 1406, 2945, 5349, 7207, 4590, 11607, 11309, 5202, 844, 7082, 4050, 8016, 9068, 9694, 8452, 7000, +5662, 567, 2941, 8619, 3808, 4987, 2373, 5135, 63, 7605, 3360, 11839, 10345, 578, 6921, 7628, 510, 5386, 2622, 7806, 5703, 10783, 9224, 11379, 5900, 4719, 11538, 3502, 5789, +10631, 5618, 826, 5043, 3090, 10891, 9951, 7596, 2293, 11872, 6151, 3469, 4443, 8871, 1555, 1802, 5103, 1891, 1223, 2334, 7878, 1590, 881, 365, 1927, 11274, 4510, 9652, 2946, +6828, 1280, 614, 10918, 12265, 7250, 6742, 9804, 11385, 2276, 11307, 2593, 879, 7899, 8071, 3454, 8531, 3795, 9021, 5776, 1849, 7766, 7988, 457, 8, 530, 9663, 7785, 11511, 3578, +7592, 10588, 3466, 8972, 9757, 3332, 139, 2046, 2940, 10808, 9332, 874, 2301, 5650, 12119, 150, 648, 8000, 9982, 9416, 2827, 2434, 11498, 6481, 12268, 9754, 11169, 11823, 11259, +3821, 10608, 2929, 
6263, 4649, 6320, 9687, 10388, 502, 5118, 8496, 6226, 10716, 8443, 7624, 6883, 9269, 6616, 8620, 5287, 944, 7519, 6125, 1882, 11249, 10254, 5410, 1251, 1790, +5275, 8449, 10447, 4113, 72, 2828, 4352, 7455, 2712, 11048, 7911, 3451, 4094, 6508, 3045, 11194, 2643, 1783, 7211, 4974, 7724, 9811, 9449, 3019, 4194, 2730, 6878, 10421, 2253, +4518, 9195, 7469, 11129, 9173, 12100, 1763, 2209, 9617, 5170, 865, 1279, 1694, 10759, 8420, 4423, 10555, 3815, 5832, 10939 +}; + + +const int32_t MSRLN_omegainv_rev_ntt1024_12289[1024] = { +8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422, +6267, 9302, 8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270, +2678, 8585, 10752, 12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957, +8779, 1630, 10163, 5407, 3186, 11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372, +10115, 2847, 4414, 9644, 4053, 7247, 9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300, +5331, 8705, 4177, 9764, 10908, 11950, 9821, 11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534, +145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548, 4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567, +6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023, 2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170, +10963, 7203, 3201, 
9275, 140, 1853, 4611, 726, 1635, 2768, 4255, 11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 11184, 12147, 8812, 5681, 4212, 975, 2844, 8851, 6008, 11404, +1956, 7280, 12231, 12048, 3532, 11286, 3602, 6068, 8209, 421, 6077, 7665, 3263, 3600, 9987, 605, 11785, 8076, 5594, 9260, 6403, 4782, 11858, 10710, 5906, 2505, 9450, 8332, 10162, +12138, 2766, 1323, 9115, 12237, 3336, 6234, 677, 6415, 6821, 1010, 8807, 787, 8120, 9162, 9369, 5241, 192, 10968, 7377, 2049, 7509, 3445, 7591, 5057, 6137, 4948, 1728, 400, 3643, +6874, 6136, 6427, 1987, 10587, 11635, 8724, 12233, 9090, 5529, 7083, 1359, 5435, 11316, 1254, 8410, 10367, 3998, 10256, 3710, 6093, 5446, 6950, 316, 11907, 8301, 11821, 6364, 1018, +1041, 8775, 2344, 11011, 5574, 1973, 9027, 7210, 11767, 10120, 4916, 4324, 5315, 4075, 4467, 4789, 5537, 7540, 7840, 5456, 147, 3789, 6118, 8471, 1190, 9606, 3860, 5445, 7753, 11239, +11367, 11848, 1058, 8210, 11177, 10211, 7967, 1958, 9139, 1319, 709, 8243, 6224, 11454, 8719, 8049, 12225, 8633, 9830, 11606, 9786, 2948, 1566, 6507, 5486, 9235, 9166, 10542, 5257, +3834, 7856, 6370, 8960, 7991, 168, 9597, 6328, 5106, 1962, 10695, 6167, 9734, 7105, 11089, 1360, 3956, 6170, 5297, 10885, 11964, 11341, 1146, 1159, 6299, 8240, 3728, 8212, 8273, 2919, +8527, 11637, 5766, 295, 6099, 9280, 1693, 174, 723, 6554, 2655, 6421, 2738, 3315, 426, 10431, 7535, 11942, 9364, 3757, 10314, 2057, 5369, 7796, 9087, 6906, 10474, 1512, 350, 1483, +6374, 12240, 11026, 6347, 1583, 2500, 1489, 6956, 10258, 2281, 5876, 3991, 8320, 9522, 156, 1293, 4737, 6860, 4774, 8517, 11871, 6381, 453, 2882, 1805, 2051, 1954, 11713, 3963, 2447, +6142, 4115, 9259, 10446, 9928, 218, 9381, 8760, 8855, 1350, 6457, 8474, 1734, 7866, 3869, 1530, 10595, 11010, 11424, 7119, 2672, 10080, 10526, 189, 3116, 1160, 4820, 3094, 7771, 10036, +1868, 5411, 9559, 8095, 9270, 2840, 2478, 4565, 7315, 5078, 10506, 9646, 1095, 9244, 5781, 8195, 8838, 4378, 1241, 9577, 4834, 7937, 9461, 12217, 8176, 1842, 3840, 7014, 10499, 11038, 
+6879, 2035, 1040, 10407, 6164, 4770, 11345, 7002, 3669, 5673, 3020, 5406, 4665, 3846, 1573, 6063, 3793, 7171, 11787, 1901, 2602, 5969, 7640, 6026, 9360, 1681, 8468, 1030, 466, 1120, +2535, 21, 5808, 791, 9855, 9462, 2873, 2307, 4289, 11641, 12139, 170, 6639, 9988, 11415, 2957, 1481, 9349, 10243, 12150, 8957, 2532, 3317, 8823, 1701, 4697, 8711, 778, 4504, 2626, +11759, 12281, 11832, 4301, 4523, 10440, 6513, 3268, 8494, 3758, 8835, 4218, 4390, 11410, 9696, 982, 10013, 904, 2485, 5547, 5039, 24, 1371, 11675, 11009, 5461, 9343, 2637, 7779, 1015, +10362, 11924, 11408, 10699, 4411, 9955, 11066, 10398, 7186, 10487, 10734, 3418, 7846, 8820, 6138, 417, 9996, 4693, 2338, 1398, 9199, 7246, 11463, 6671, 1658, 6500, 8787, 751, 7570, +6389, 910, 3065, 1506, 6586, 4483, 9667, 6903, 11779, 4661, 5368, 11711, 1944, 450, 8929, 4684, 12226, 7154, 9916, 7302, 8481, 3670, 9348, 11722, 6627, 5289, 3837, 2595, 3221, 4273, +8239, 5207, 11445, 7087, 980, 682, 7699, 5082, 6940, 9344, 10883, 2633, 293, 9057, 3769, 4855, 8809, 10118, 3007, 1265, 6759, 6685, 8345, 8190, 11520, 6763, 216, 50, 8136, 10076, 767, +8484, 7929, 9004, 9135, 7235, 12282, 10353, 11444, 8566, 1706, 8360, 7559, 3229, 10268, 2832, 3572, 1282, 3536, 5370, 3753, 3941, 6184, 9169, 5646, 6086, 10235, 2483, 1344, 3042, 1468, +3981, 3407, 11826, 180, 4138, 7684, 2689, 10880, 7070, 204, 5509, 1057, 9689, 4705, 9168, 9272, 1236, 4475, 5246, 4251, 4739, 11063, 6771, 7073, 9261, 2360, 11925, 11777, 7619, 4906, +6825, 4554, 11295, 239, 2900, 7021, 146, 11883, 10602, 5189, 6094, 1403, 1804, 11667, 10552, 5672, 4499, 636, 5609, 8307, 2947, 3393, 7954, 2291, 3375, 8464, 4235, 8761, 7376, 6492, +8330, 5959, 10141, 7350, 5115, 2442, 1248, 10344, 1029, 5724, 1325, 6691, 8945, 1892, 3624, 10767, 2151, 4119, 3343, 7681, 7126, 7287, 12269, 8342, 338, 9834, 5690, 1744, 1314, 8635, +9395, 4167, 6085, 923, 11251, 6092, 10058, 12096, 2800, 11864, 1836, 11897, 2185, 1620, 375, 7711, 11912, 1942, 3408, 9714, 11124, 9513, 1178, 5478, 8778, 3276, 
8951, 2212, 9615, 1392, +5784, 1255, 11783, 1756, 8144, 8328, 8755, 6463, 2065, 7725, 3495, 10771, 8524, 8113, 7226, 2461, 10014, 5653, 8022, 11158, 1445, 7429, 11164, 1275, 6781, 1176, 5734, 12077, 6323, 9520, +3114, 6302, 6693, 579, 3889, 10872, 6613, 4505, 10032, 5835, 9202, 7406, 8314, 5102, 11877, 6701, 6444, 2528, 9233, 4963, 8545, 3607, 10962, 7057, 8347, 11841, 11275, 7365, 7508, 4566, +5836, 12221, 2260, 1535, 3200, 2717, 60, 4238, 11677, 4227, 3368, 11749, 12164, 1526, 4222, 6162, 4840, 8257, 3163, 7885, 346, 2068, 1389, 11197, 5209, 3359, 9084, 11825, 10361, 3678, +4265, 9118, 7800, 10463, 9363, 9051, 8581, 11153, 8840, 5412, 8080, 9011, 6296, 3515, 11851, 1218, 5061, 1536, 1721, 9860, 4103, 10916, 2982, 11572, 3589, 9839, 10584, 11475, 11873, +2110, 716, 5416, 2164, 1866, 5211, 7562, 11081, 10381, 7751, 11946, 3448 +}; + + +const int32_t MSRLN_psi_rev_ntt512_12289[512] = { +8193, 493, 6845, 9908, 1378, 10377, 7952, 435, 10146, 1065, 404, 7644, 1207, 3248, 11121, 5277, 2437, 3646, 2987, 6022, 9867, 6250, 10102, 9723, 1002, 7278, 4284, 7201, 875, 3780, 1607, +4976, 8146, 4714, 242, 1537, 3704, 9611, 5019, 545, 5084, 10657, 4885, 11272, 3066, 12262, 3763, 10849, 2912, 5698, 11935, 4861, 7277, 9808, 11244, 2859, 7188, 1067, 2401, 11847, 390, +11516, 8511, 3833, 2780, 7094, 4895, 1484, 2305, 5042, 8236, 2645, 7875, 9442, 2174, 7917, 1689, 3364, 4057, 3271, 10863, 4654, 1777, 10626, 3636, 7351, 9585, 6998, 160, 3149, 4437, +12286, 10123, 3915, 7370, 12176, 4048, 2249, 2884, 1153, 9103, 6882, 2126, 10659, 3510, 5332, 2865, 9919, 9320, 8311, 9603, 9042, 3016, 12046, 9289, 11618, 7098, 3136, 9890, 3400, 2178, +1544, 5559, 420, 8304, 4905, 476, 3531, 9326, 4896, 9923, 3051, 3091, 81, 1000, 4320, 1177, 8034, 9521, 10654, 11563, 7678, 10436, 12149, 3014, 9088, 5086, 1326, 11119, 2319, 11334, 790, +2747, 7443, 3135, 3712, 1062, 9995, 7484, 8736, 9283, 2744, 11726, 2975, 9664, 949, 7468, 9650, 7266, 5828, 6561, 7698, 3328, 6512, 1351, 7311, 8155, 5736, 722, 10984, 
4043, 7143, 10810, +1, 8668, 2545, 3504, 8747, 11077, 1646, 9094, 5860, 1759, 8582, 3694, 7110, 8907, 11934, 8058, 9741, 9558, 3932, 5911, 4890, 3637, 8830, 5542, 12144, 5755, 7657, 7901, 11029, 11955, 9863, +10861, 1696, 3284, 2881, 7197, 2089, 9000, 2013, 729, 9048, 11809, 2842, 11267, 9, 6498, 544, 2468, 339, 1381, 2525, 8112, 3584, 6958, 4989, 10616, 8011, 5374, 9452, 12159, 4354, 9893, +7837, 3296, 8340, 7222, 2197, 118, 2476, 5767, 827, 8541, 11336, 8855, 8760, 9381, 218, 9928, 10446, 9259, 4115, 6142, 2447, 3963, 11713, 1954, 2051, 1805, 2882, 453, 6381, 11871, 8517, +4774, 6860, 4737, 1293, 156, 9522, 8320, 3991, 5876, 2281, 10258, 6956, 1489, 2500, 1583, 6347, 11026, 12240, 6374, 1483, 350, 1512, 10474, 6906, 9087, 7796, 5369, 2057, 10314, 3757, +9364, 11942, 7535, 10431, 426, 3315, 2738, 6421, 2655, 6554, 723, 174, 1693, 9280, 6099, 295, 5766, 11637, 8527, 2919, 8273, 8212, 3728, 8240, 6299, 1159, 1146, 11341, 11964, 10885, 5297, +6170, 3956, 1360, 11089, 7105, 9734, 6167, 10695, 1962, 5106, 6328, 9597, 168, 7991, 8960, 6370, 7856, 3834, 5257, 10542, 9166, 9235, 5486, 6507, 1566, 2948, 9786, 11606, 9830, 8633, +12225, 8049, 8719, 11454, 6224, 8243, 709, 1319, 9139, 1958, 7967, 10211, 11177, 8210, 1058, 11848, 11367, 11239, 7753, 5445, 3860, 9606, 1190, 8471, 6118, 3789, 147, 5456, 7840, 7540, +5537, 4789, 4467, 4075, 5315, 4324, 4916, 10120, 11767, 7210, 9027, 1973, 5574, 11011, 2344, 8775, 1041, 1018, 6364, 11821, 8301, 11907, 316, 6950, 5446, 6093, 3710, 10256, 3998, 10367, +8410, 1254, 11316, 5435, 1359, 7083, 5529, 9090, 12233, 8724, 11635, 10587, 1987, 6427, 6136, 6874, 3643, 400, 1728, 4948, 6137, 5057, 7591, 3445, 7509, 2049, 7377, 10968, 192, 5241, 9369, +9162, 8120, 787, 8807, 1010, 6821, 6415, 677, 6234, 3336, 12237, 9115, 1323, 2766, 12138, 10162, 8332, 9450, 2505, 5906, 10710, 11858, 4782, 6403, 9260, 5594, 8076, 11785, 605, 9987, 3600, +3263, 7665, 6077, 421, 8209, 6068, 3602, 11286, 3532, 12048, 12231, 7280, 1956, 11404, 6008, 8851, 2844, 
975, 4212, 5681, 8812, 12147, 11184 +}; + + +const int32_t MSRLN_omegainv_rev_ntt512_12289[512] = { +8193, 11796, 2381, 5444, 11854, 4337, 1912, 10911, 7012, 1168, 9041, 11082, 4645, 11885, 11224, 2143, 7313, 10682, 8509, 11414, 5088, 8005, 5011, 11287, 2566, 2187, 6039, 2422, 6267, 9302, +8643, 9852, 8456, 3778, 773, 11899, 442, 9888, 11222, 5101, 9430, 1045, 2481, 5012, 7428, 354, 6591, 9377, 1440, 8526, 27, 9223, 1017, 7404, 1632, 7205, 11744, 7270, 2678, 8585, 10752, +12047, 7575, 4143, 8758, 11813, 7384, 3985, 11869, 6730, 10745, 10111, 8889, 2399, 9153, 5191, 671, 3000, 243, 9273, 3247, 2686, 3978, 2969, 2370, 9424, 6957, 8779, 1630, 10163, 5407, 3186, +11136, 9405, 10040, 8241, 113, 4919, 8374, 2166, 3, 7852, 9140, 12129, 5291, 2704, 4938, 8653, 1663, 10512, 7635, 1426, 9018, 8232, 8925, 10600, 4372, 10115, 2847, 4414, 9644, 4053, 7247, +9984, 10805, 7394, 5195, 9509, 953, 3748, 11462, 6522, 9813, 12171, 10092, 5067, 3949, 8993, 4452, 2396, 7935, 130, 2837, 6915, 4278, 1673, 7300, 5331, 8705, 4177, 9764, 10908, 11950, 9821, +11745, 5791, 12280, 1022, 9447, 480, 3241, 11560, 10276, 3289, 10200, 5092, 9408, 9005, 10593, 1428, 2426, 334, 1260, 4388, 4632, 6534, 145, 6747, 3459, 8652, 7399, 6378, 8357, 2731, 2548, +4231, 355, 3382, 5179, 8595, 3707, 10530, 6429, 3195, 10643, 1212, 3542, 8785, 9744, 3621, 12288, 1479, 5146, 8246, 1305, 11567, 6553, 4134, 4978, 10938, 5777, 8961, 4591, 5728, 6461, 5023, +2639, 4821, 11340, 2625, 9314, 563, 9545, 3006, 3553, 4805, 2294, 11227, 8577, 9154, 4846, 9542, 11499, 955, 9970, 1170, 10963, 7203, 3201, 9275, 140, 1853, 4611, 726, 1635, 2768, 4255, +11112, 7969, 11289, 12208, 9198, 9238, 2366, 7393, 2963, 1105, 142, 3477, 6608, 8077, 11314, 9445, 3438, 6281, 885, 10333, 5009, 58, 241, 8757, 1003, 8687, 6221, 4080, 11868, 6212, 4624, +9026, 8689, 2302, 11684, 504, 4213, 6695, 3029, 5886, 7507, 431, 1579, 6383, 9784, 2839, 3957, 2127, 151, 9523, 10966, 3174, 52, 8953, 6055, 11612, 5874, 5468, 11279, 3482, 11502, 4169, +3127, 
2920, 7048, 12097, 1321, 4912, 10240, 4780, 8844, 4698, 7232, 6152, 7341, 10561, 11889, 8646, 5415, 6153, 5862, 10302, 1702, 654, 3565, 56, 3199, 6760, 5206, 10930, 6854, 973, 11035, +3879, 1922, 8291, 2033, 8579, 6196, 6843, 5339, 11973, 382, 3988, 468, 5925, 11271, 11248, 3514, 9945, 1278, 6715, 10316, 3262, 5079, 522, 2169, 7373, 7965, 6974, 8214, 7822, 7500, 6752, +4749, 4449, 6833, 12142, 8500, 6171, 3818, 11099, 2683, 8429, 6844, 4536, 1050, 922, 441, 11231, 4079, 1112, 2078, 4322, 10331, 3150, 10970, 11580, 4046, 6065, 835, 3570, 4240, 64, 3656, +2459, 683, 2503, 9341, 10723, 5782, 6803, 3054, 3123, 1747, 7032, 8455, 4433, 5919, 3329, 4298, 12121, 2692, 5961, 7183, 10327, 1594, 6122, 2555, 5184, 1200, 10929, 8333, 6119, 6992, 1404, +325, 948, 11143, 11130, 5990, 4049, 8561, 4077, 4016, 9370, 3762, 652, 6523, 11994, 6190, 3009, 10596, 12115, 11566, 5735, 9634, 5868, 9551, 8974, 11863, 1858, 4754, 347, 2925, 8532, 1975, +10232, 6920, 4493, 3202, 5383, 1815, 10777, 11939, 10806, 5915, 49, 1263, 5942, 10706, 9789, 10800, 5333, 2031, 10008, 6413, 8298, 3969, 2767, 12133, 10996, 7552, 5429, 7515, 3772, 418, 5908, +11836, 9407, 10484, 10238, 10335, 576, 8326, 9842, 6147, 8174, 3030, 1843, 2361, 12071, 2908, 3529, 3434 +}; + +// import external code +#ifdef RLWE_ASM_AVX2 + #include "AMD64/consts.c" + #include "AMD64/ntt_x64.c" +#else + #include "generic/ntt.c" +#endif + +__inline void clear_words(void* mem, digit_t nwords) +{ // Clear digits from memory. "nwords" indicates the number of digits to be zeroed. + // This function uses the volatile type qualifier to inform the compiler not to optimize out the memory clearing. 
+ unsigned int i; + volatile digit_t *v = mem; + + for (i = 0; i < nwords; i++) { + v[i] = 0; + } +} + + +CRYPTO_MSRLN_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction) +{ // Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction. Always returns CRYPTO_MSRLN_SUCCESS; pLatticeCrypto is dereferenced without a NULL check. + + pLatticeCrypto->RandomBytesFunction = RandomBytesFunction; + pLatticeCrypto->ExtendableOutputFunction = ExtendableOutputFunction; + pLatticeCrypto->StreamOutputFunction = StreamOutputFunction; + + return CRYPTO_MSRLN_SUCCESS; +} + + +PLatticeCryptoStruct LatticeCrypto_allocate() +{ // Dynamic allocation of memory for LatticeCrypto structure. It should be called before initialization with LatticeCrypto_initialize(). + // Returns NULL on error. The structure is zero-filled by calloc(). + PLatticeCryptoStruct LatticeCrypto = NULL; + + LatticeCrypto = (PLatticeCryptoStruct)calloc(1, sizeof(LatticeCryptoStruct)); + + if (LatticeCrypto == NULL) { + return NULL; + } + return LatticeCrypto; +} + + +const char* LatticeCrypto_get_error_message(CRYPTO_MSRLN_STATUS Status) +{ // Return the message string for a given CRYPTO_MSRLN_STATUS; values outside the mapping table yield a fixed "unrecognized" message + struct error_mapping { + unsigned int index; + char* string; + } mapping[CRYPTO_STATUS_TYPE_SIZE] = { + {CRYPTO_MSRLN_SUCCESS, CRYPTO_MSG_SUCCESS}, + {CRYPTO_MSRLN_ERROR, CRYPTO_MSG_ERROR}, + {CRYPTO_MSRLN_ERROR_DURING_TEST, CRYPTO_MSG_ERROR_DURING_TEST}, + {CRYPTO_MSRLN_ERROR_UNKNOWN, CRYPTO_MSG_ERROR_UNKNOWN}, + {CRYPTO_MSRLN_ERROR_NOT_IMPLEMENTED, CRYPTO_MSG_ERROR_NOT_IMPLEMENTED}, + {CRYPTO_MSRLN_ERROR_NO_MEMORY, CRYPTO_MSG_ERROR_NO_MEMORY}, + {CRYPTO_MSRLN_ERROR_INVALID_PARAMETER, CRYPTO_MSG_ERROR_INVALID_PARAMETER}, + {CRYPTO_MSRLN_ERROR_SHARED_KEY, CRYPTO_MSG_ERROR_SHARED_KEY}, + {CRYPTO_MSRLN_ERROR_TOO_MANY_ITERATIONS, CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS} + }; + + if (Status >= CRYPTO_STATUS_TYPE_SIZE || mapping[Status].string == NULL) { +
return "Unrecognized CRYPTO_STATUS"; + } else { + return mapping[Status].string; + } +}; + + +void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m) +{ // Alice's message encoding: packs the 1024 coefficients of pk into m[0..1791] (14 bits per coefficient in the generic path), then copies the 32-byte seed to m[1792..1823]; if neither implementation macro is defined only the seed is written, at offset 0 + unsigned int i = 0, j; + +#if defined(GENERIC_IMPLEMENTATION) + for (j = 0; j < 1024; j += 4) { + m[i] = (unsigned char)(pk[j] & 0xFF); + m[i+1] = (unsigned char)((pk[j] >> 8) | ((pk[j+1] & 0x03) << 6)); + m[i+2] = (unsigned char)((pk[j+1] >> 2) & 0xFF); + m[i+3] = (unsigned char)((pk[j+1] >> 10) | ((pk[j+2] & 0x0F) << 4)); + m[i+4] = (unsigned char)((pk[j+2] >> 4) & 0xFF); + m[i+5] = (unsigned char)((pk[j+2] >> 12) | ((pk[j+3] & 0x3F) << 2)); + m[i+6] = (unsigned char)(pk[j+3] >> 6); + i += 7; + } + +#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) + encode_asm(pk, m); + i = 1792; +#endif + + for (j = 0; j < 32; j++) { + m[i+j] = seed[j]; + } +} + + +void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed) +{ // Alice's message decoding: inverse of encode_A, unpacking 1024 coefficients from m[0..1791] into pk and copying the following 32 bytes into seed + unsigned int i = 0, j; + +#if defined(GENERIC_IMPLEMENTATION) + for (j = 0; j < 1024; j += 4) { + pk[j] = ((uint32_t)m[i] | (((uint32_t)m[i+1] & 0x3F) << 8)); + pk[j+1] = (((uint32_t)m[i+1] >> 6) | ((uint32_t)m[i+2] << 2) | (((uint32_t)m[i+3] & 0x0F) << 10)); + pk[j+2] = (((uint32_t)m[i+3] >> 4) | ((uint32_t)m[i+4] << 4) | (((uint32_t)m[i+5] & 0x03) << 12)); + pk[j+3] = (((uint32_t)m[i+5] >> 2) | ((uint32_t)m[i+6] << 6)); + i += 7; + } + +#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) + decode_asm(m, pk); + i = 1792; +#endif + + for (j = 0; j < 32; j++) { + seed[j] = m[i+j]; + } +} + + +void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m) +{ // Bob's message encoding: pk is packed into m[0..1791] as in encode_A, then the 1024 entries of rvec are packed four per byte (2 bits each) into m[1792..2047] + unsigned int i = 0, j; + +#if defined(GENERIC_IMPLEMENTATION) + for (j = 0; j < 1024; j += 4) { + m[i] = (unsigned char)(pk[j] & 0xFF); + m[i+1] = (unsigned char)((pk[j] >> 8) | ((pk[j+1] & 0x03) << 6)); + m[i+2] = (unsigned char)((pk[j+1] >> 2) & 0xFF); + m[i+3] = (unsigned char)((pk[j+1] >> 10) |
((pk[j+2] & 0x0F) << 4)); + m[i+4] = (unsigned char)((pk[j+2] >> 4) & 0xFF); + m[i+5] = (unsigned char)((pk[j+2] >> 12) | ((pk[j+3] & 0x3F) << 2)); + m[i+6] = (unsigned char)(pk[j+3] >> 6); + i += 7; + } + +#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) + encode_asm(pk, m); +#endif + + i = 0; + for (j = 0; j < 1024/4; j++) { + m[1792+j] = (unsigned char)(rvec[i] | (rvec[i+1] << 2) | (rvec[i+2] << 4) | (rvec[i+3] << 6)); + i += 4; + } +} + + +void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec) +{ // Bob's message decoding: inverse of encode_B, unpacking the coefficients from m[0..1791] into pk and the 256 bytes at m[1792..2047] into 1024 two-bit rvec entries + unsigned int i = 0, j; + +#if defined(GENERIC_IMPLEMENTATION) + for (j = 0; j < 1024; j += 4) { + pk[j] = ((uint32_t)m[i] | (((uint32_t)m[i+1] & 0x3F) << 8)); + pk[j+1] = (((uint32_t)m[i+1] >> 6) | ((uint32_t)m[i+2] << 2) | (((uint32_t)m[i+3] & 0x0F) << 10)); + pk[j+2] = (((uint32_t)m[i+3] >> 4) | ((uint32_t)m[i+4] << 4) | (((uint32_t)m[i+5] & 0x03) << 12)); + pk[j+3] = (((uint32_t)m[i+5] >> 2) | ((uint32_t)m[i+6] << 6)); + i += 7; + } + +#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) + decode_asm(m, pk); + i = 1792; +#endif + + i = 0; + for (j = 0; j < 1024/4; j++) { + rvec[i] = (uint32_t)(m[1792+j] & 0x03); + rvec[i+1] = (uint32_t)((m[1792+j] >> 2) & 0x03); + rvec[i+2] = (uint32_t)((m[1792+j] >> 4) & 0x03); + rvec[i+3] = (uint32_t)(m[1792+j] >> 6); + i += 4; + } +} + + +static __inline uint32_t Abs(int32_t value) +{ // Compute absolute value without branching; relies on value >> 31 producing an all-ones mask for negative inputs + uint32_t mask; + + mask = (uint32_t)(value >> 31); + return ((mask ^ value) - mask); +} + + +CRYPTO_MSRLN_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction) +{ // Reconciliation helper. seed, nonce and StreamOutputFunction are unused; the 32 random bytes come from randombytes(), whose return value is not checked here + (void)seed; (void)nonce; (void)StreamOutputFunction; + unsigned int i, j, norm; + unsigned char bit, random_bits[32]; + uint32_t v0[4], v1[4]; + + randombytes( random_bits, 32); + CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_SUCCESS; + +#if defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) +
helprec_asm(x, rvec, random_bits); +#else + + for (i = 0; i < 256; i++) { + bit = 1 & (random_bits[i >> 3] >> (i & 0x07)); + rvec[i] = (x[i] << 1) - bit; + rvec[i+256] = (x[i+256] << 1) - bit; + rvec[i+512] = (x[i+512] << 1) - bit; + rvec[i+768] = (x[i+768] << 1) - bit; + + norm = 0; + v0[0] = 4; v0[1] = 4; v0[2] = 4; v0[3] = 4; + v1[0] = 3; v1[1] = 3; v1[2] = 3; v1[3] = 3; + for (j = 0; j < 4; j++) { + v0[j] -= (rvec[i+256*j] - PARAMETER_Q4 ) >> 31; + v0[j] -= (rvec[i+256*j] - PARAMETER_3Q4) >> 31; + v0[j] -= (rvec[i+256*j] - PARAMETER_5Q4) >> 31; + v0[j] -= (rvec[i+256*j] - PARAMETER_7Q4) >> 31; + v1[j] -= (rvec[i+256*j] - PARAMETER_Q2 ) >> 31; + v1[j] -= (rvec[i+256*j] - PARAMETER_Q ) >> 31; + v1[j] -= (rvec[i+256*j] - PARAMETER_3Q2) >> 31; + norm += Abs(2*rvec[i+256*j] - PARAMETER_Q*v0[j]); + } + + norm = (uint32_t)((int32_t)(norm - PARAMETER_Q) >> 31); // If norm < q then norm = 0xff...ff, else norm = 0 + v0[0] = (norm & (v0[0] ^ v1[0])) ^ v1[0]; + v0[1] = (norm & (v0[1] ^ v1[1])) ^ v1[1]; + v0[2] = (norm & (v0[2] ^ v1[2])) ^ v1[2]; + v0[3] = (norm & (v0[3] ^ v1[3])) ^ v1[3]; + rvec[i] = (v0[0] - v0[3]) & 0x03; + rvec[i+256] = (v0[1] - v0[3]) & 0x03; + rvec[i+512] = (v0[2] - v0[3]) & 0x03; + rvec[i+768] = ((v0[3] << 1) + (1 & ~norm)) & 0x03; + } +#endif + + return Status; +} + + +static __inline uint32_t LDDecode(int32_t* t) +{ // Low-density decoding + unsigned int i, norm = 0; + uint32_t mask1, mask2, value; + int32_t cneg = -8*PARAMETER_Q; + + for (i = 0; i < 4; i++) { + mask1 = t[i] >> 31; // If t[i] < 0 then mask1 = 0xff...ff, else mask1 = 0 + mask2 = (4*PARAMETER_Q - (int32_t)Abs(t[i])) >> 31; // If 4*PARAMETER_Q > Abs(t[i]) then mask2 = 0, else mask2 = 0xff...ff + + value = ((mask1 & (8*PARAMETER_Q ^ cneg)) ^ cneg); // value = 8*PARAMETER_Q if t[i] < 0, else -8*PARAMETER_Q + norm += Abs(t[i] + (mask2 & value)); + } + + return ((8*PARAMETER_Q - norm) >> 31) ^ 1; // If norm <= 8*PARAMETER_Q then return 1, else return 0 +} + + +void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key) +{ //
Reconciliation + +#if defined(GENERIC_IMPLEMENTATION) + unsigned int i; + uint32_t t[4]; + + for (i = 0; i < 32; i++) { + key[i] = 0; + } + for (i = 0; i < 256; i++) { + t[0] = 8*x[i] - (2*rvec[i] + rvec[i+768]) * PARAMETER_Q; + t[1] = 8*x[i+256] - (2*rvec[i+256] + rvec[i+768]) * PARAMETER_Q; + t[2] = 8*x[i+512] - (2*rvec[i+512] + rvec[i+768]) * PARAMETER_Q; + t[3] = 8*x[i+768] - (rvec[i+768]) * PARAMETER_Q; + + key[i >> 3] |= (unsigned char)LDDecode((int32_t*)t) << (i & 0x07); + } + +#elif defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) + rec_asm(x, rvec, key); +#endif +} + + +CRYPTO_MSRLN_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction) +{ // Error sampling. seed, nonce and StreamOutputFunction are unused; 3*PARAMETER_N random bytes are drawn with randombytes() (return value not checked) and CRYPTO_MSRLN_SUCCESS is always returned + (void) seed; (void) nonce; (void) StreamOutputFunction; + unsigned char stream[3 * PARAMETER_N]; + uint32_t *pstream = (uint32_t *) &stream; + uint32_t acc1, acc2, temp; + uint8_t *pacc1 = (uint8_t *) &acc1, *pacc2 = (uint8_t *) &acc2; + unsigned int i, j; + + randombytes( stream, 3 * PARAMETER_N); + +#if defined(ASM_SUPPORT) && (SIMD_SUPPORT == AVX2_SUPPORT) + error_sampling_asm(stream, e); +#else + for (i = 0; i < PARAMETER_N / 4; i++) { + acc1 = 0; + acc2 = 0; + for (j = 0; j < 8; j++) { + acc1 += (pstream[i] >> j) & 0x01010101; + acc2 += (pstream[i + PARAMETER_N / 4] >> j) & 0x01010101; + } + for (j = 0; j < 4; j++) { + temp = pstream[i + 2 * PARAMETER_N / 4] >> j; + acc1 += temp & 0x01010101; + acc2 += (temp >> 4) & 0x01010101; + } + e[2 * i] = pacc1[0] - pacc1[1]; + e[2 * i + 1] = pacc1[2] - pacc1[3]; + e[2 * i + PARAMETER_N / 2] = pacc2[0] - pacc2[1]; + e[2 * i + PARAMETER_N / 2 + 1] = pacc2[2] - pacc2[3]; + } +#endif + + return CRYPTO_MSRLN_SUCCESS; +} + + +CRYPTO_MSRLN_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction) +{ // Generation of parameter a: PARAMETER_N values below PARAMETER_Q are taken by rejection sampling from SHAKE128 output absorbed from seed; ExtendableOutputFunction is unused + (void)ExtendableOutputFunction; + unsigned int pos = 0, ctr = 0; + uint16_t val; + unsigned int nblocks = 16; + uint8_t buf[SHAKE128_RATE
* 16]; // sized for the initial nblocks (16); was * nblocks, but VS doesn't like this buf init + //Keccak_HashInstance ks; + + uint64_t state[SHA3_STATESIZE] = {0}; + shake128_absorb(state, seed, SEED_BYTES); + shake128_squeezeblocks((unsigned char *) buf, nblocks, state); + + /*#ifdef _WIN32 + SHAKE128_InitAbsorb( &ks, seed, SEED_BYTES ); + KECCAK_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 ); + #else + Keccak_HashInitialize_SHAKE128(&ks); + Keccak_HashUpdate( &ks, seed, SEED_BYTES * 8 ); + Keccak_HashFinal( &ks, seed ); + Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 ); + //#endif + */ + while (ctr < PARAMETER_N) { + val = (buf[pos] | ((uint16_t) buf[pos + 1] << 8)) & 0x3fff; // low 14 bits of a little-endian 16-bit word + if (val < PARAMETER_Q) { // rejection sampling: values not below PARAMETER_Q are discarded + a[ctr++] = val; + } + pos += 2; + if (pos > SHAKE128_RATE * nblocks - 2) { // buffer consumed: squeeze one more block and restart at its beginning + nblocks = 1; + shake128_squeezeblocks((unsigned char *) buf, nblocks, state); +// Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 ); + pos = 0; + } + } + + return CRYPTO_MSRLN_SUCCESS; +} + + +CRYPTO_MSRLN_STATUS MSRLN_KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto) +{ // Alice's key generation + // It produces a private key SecretKeyA and computes the public key PublicKeyA. + // Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) + // the public key PublicKeyA that occupies 1824 bytes + // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
+ uint32_t a[PARAMETER_N]; + int32_t e[PARAMETER_N]; + unsigned char seed[SEED_BYTES]; + unsigned char error_seed[ERROR_SEED_BYTES]; /* not initialized here; get_error() ignores its seed argument */ + CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_ERROR_UNKNOWN; + + Status = randombytes( seed, SEED_BYTES); + + if (Status != CRYPTO_MSRLN_SUCCESS) { + return Status; + } + + Status = generate_a(a, seed, pLatticeCrypto->ExtendableOutputFunction); + if (Status != CRYPTO_MSRLN_SUCCESS) { + goto cleanup; + } + + Status = get_error(SecretKeyA, error_seed, 0, pLatticeCrypto->StreamOutputFunction); + if (Status != CRYPTO_MSRLN_SUCCESS) { + goto cleanup; + } + Status = get_error(e, error_seed, 1, pLatticeCrypto->StreamOutputFunction); + if (Status != CRYPTO_MSRLN_SUCCESS) { + goto cleanup; + } + NTT_CT_std2rev_12289(SecretKeyA, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N); + NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N); + smul(e, 3, PARAMETER_N); + + pmuladd((int32_t*)a, SecretKeyA, e, (int32_t*)a, PARAMETER_N); + correction((int32_t*)a, PARAMETER_Q, PARAMETER_N); + encode_A(a, seed, PublicKeyA); + +cleanup: /* only e and error_seed are wiped; SecretKeyA and a are left as they are, also on failure */ + clear_words((void*)e, NBYTES_TO_NWORDS(4*PARAMETER_N)); + clear_words((void*)error_seed, NBYTES_TO_NWORDS(ERROR_SEED_BYTES)); + + return Status; +} + + +CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto) +{ // Bob's key generation and shared secret computation + // It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes + // the shared secret SharedSecretB. + // Input: Alice's public key PublicKeyA that consists of 1824 bytes + // Outputs: the public key PublicKeyB that occupies 2048 bytes. + // the 256-bit shared secret SharedSecretB. + // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
+ uint32_t pk_A[PARAMETER_N], a[PARAMETER_N], v[PARAMETER_N], r[PARAMETER_N]; + int32_t sk_B[PARAMETER_N], e[PARAMETER_N]; + unsigned char seed[SEED_BYTES], error_seed[ERROR_SEED_BYTES]; + CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_ERROR_UNKNOWN; + + decode_A(PublicKeyA, pk_A, seed); /* unpack Alice's message into pk_A and seed */ + + Status = generate_a(a, seed, pLatticeCrypto->ExtendableOutputFunction); + if (Status != CRYPTO_MSRLN_SUCCESS) { + goto cleanup; + } + + Status = get_error(sk_B, error_seed, 0, pLatticeCrypto->StreamOutputFunction); + if (Status != CRYPTO_MSRLN_SUCCESS) { + goto cleanup; + } + Status = get_error(e, error_seed, 1, pLatticeCrypto->StreamOutputFunction); + if (Status != CRYPTO_MSRLN_SUCCESS) { + goto cleanup; + } + NTT_CT_std2rev_12289(sk_B, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N); + NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N); + smul(e, 3, PARAMETER_N); + + pmuladd((int32_t*)a, sk_B, e, (int32_t*)a, PARAMETER_N); + correction((int32_t*)a, PARAMETER_Q, PARAMETER_N); + + Status = get_error(e, error_seed, 2, pLatticeCrypto->StreamOutputFunction); + if (Status != CRYPTO_MSRLN_SUCCESS) { + goto cleanup; + } + NTT_CT_std2rev_12289(e, MSRLN_psi_rev_ntt1024_12289, PARAMETER_N); + smul(e, 81, PARAMETER_N); + + pmuladd((int32_t*)pk_A, sk_B, e, (int32_t*)v, PARAMETER_N); + INTT_GS_rev2std_12289((int32_t*)v, MSRLN_omegainv_rev_ntt1024_12289, MSRLN_omegainv10N_rev_ntt1024_12289, MSRLN_Ninv11_ntt1024_12289, PARAMETER_N); + two_reduce12289((int32_t*)v, PARAMETER_N); +#if defined(GENERIC_IMPLEMENTATION) + correction((int32_t*)v, PARAMETER_Q, PARAMETER_N); +#endif + + Status = HelpRec(v, r, error_seed, 3, pLatticeCrypto->StreamOutputFunction); + if (Status != CRYPTO_MSRLN_SUCCESS) { + goto cleanup; + } + Rec(v, r, SharedSecretB); /* shared secret derived from v and the reconciliation data r */ + encode_B(a, r, PublicKeyB); + +cleanup: /* reached on success and on failure: the local temporaries are wiped before returning */ + clear_words((void*)sk_B, NBYTES_TO_NWORDS(4*PARAMETER_N)); + clear_words((void*)e, NBYTES_TO_NWORDS(4*PARAMETER_N)); + clear_words((void*)error_seed, NBYTES_TO_NWORDS(ERROR_SEED_BYTES)); +
clear_words((void*)a, NBYTES_TO_NWORDS(4*PARAMETER_N)); + clear_words((void*)v, NBYTES_TO_NWORDS(4*PARAMETER_N)); + clear_words((void*)r, NBYTES_TO_NWORDS(4*PARAMETER_N)); + + return Status; +} + + +CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA) +{ // Alice's shared secret computation + // It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA. + // Inputs: Bob's public key PublicKeyB that consists of 2048 bytes + // the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) + // Output: the 256-bit shared secret SharedSecretA. + // No LatticeCrypto structure is needed: this function takes no pLatticeCrypto argument and always returns CRYPTO_MSRLN_SUCCESS. + uint32_t u[PARAMETER_N], r[PARAMETER_N]; + CRYPTO_MSRLN_STATUS Status = CRYPTO_MSRLN_SUCCESS; + + decode_B(PublicKeyB, u, r); + + pmul(SecretKeyA, (int32_t*)u, (int32_t*)u, PARAMETER_N); + INTT_GS_rev2std_12289((int32_t*)u, MSRLN_omegainv_rev_ntt1024_12289, MSRLN_omegainv10N_rev_ntt1024_12289, MSRLN_Ninv11_ntt1024_12289, PARAMETER_N); + two_reduce12289((int32_t*)u, PARAMETER_N); +#if defined(GENERIC_IMPLEMENTATION) + correction((int32_t*)u, PARAMETER_Q, PARAMETER_N); +#endif + + Rec(u, r, SharedSecretA); + +// Cleanup + clear_words((void*)u, NBYTES_TO_NWORDS(4*PARAMETER_N)); + clear_words((void*)r, NBYTES_TO_NWORDS(4*PARAMETER_N)); + + return Status; +} diff --git a/dap-sdk/crypto/src/msrln/makefile b/dap-sdk/crypto/src/msrln/makefile index ab4cb800cc..d017a0e2bb 100755 --- a/dap-sdk/crypto/src/msrln/makefile +++ b/dap-sdk/crypto/src/msrln/makefile @@ -1,94 +1,94 @@ -#### Makefile for compilation on Linux #### - -OPT=-O3 # Optimization option by default - -ifeq "$(CC)" "gcc" - COMPILER=gcc -else ifeq "$(CC)" "clang" - COMPILER=clang -endif - -ifeq "$(ARCH)" "x64" - ARCHITECTURE=_AMD64_ -else ifeq "$(ARCH)" "x86" - ARCHITECTURE=_X86_ -else ifeq "$(ARCH)" "ARM" - ARCHITECTURE=_ARM_ -endif -
-ADDITIONAL_SETTINGS= -ifeq "$(SET)" "EXTENDED" - ADDITIONAL_SETTINGS=-fwrapv -fomit-frame-pointer -march=native -endif - -ifeq "$(ASM)" "TRUE" - USE_ASM=-D _ASM_ -endif - -ifeq "$(GENERIC)" "TRUE" - USE_GENERIC=-D _GENERIC_ -endif - -ifeq "$(AVX2)" "TRUE" - USE_AVX2=-D _AVX2_ - SIMD=-mavx2 -endif - -ifeq "$(ARCH)" "ARM" - ARM_SETTING=-lrt -endif - -cc=$(COMPILER) -CFLAGS=-c $(OPT) $(ADDITIONAL_SETTINGS) $(SIMD) -D $(ARCHITECTURE) -D __LINUX__ $(USE_AVX2) $(USE_ASM) $(USE_GENERIC) -LDFLAGS= -ifeq "$(GENERIC)" "TRUE" - OTHER_OBJECTS=ntt.o -else -ifeq "$(ASM)" "TRUE" - OTHER_OBJECTS=ntt_x64.o consts.o - ASM_OBJECTS=ntt_x64_asm.o error_asm.o -endif -endif -OBJECTS=kex.o random.o ntt_constants.o $(ASM_OBJECTS) $(OTHER_OBJECTS) -OBJECTS_TEST=tests.o test_extras.o $(OBJECTS) -OBJECTS_ALL=$(OBJECTS) $(OBJECTS_TEST) - -test: $(OBJECTS_TEST) - $(CC) -o test $(OBJECTS_TEST) $(ARM_SETTING) - -kex.o: kex.c LatticeCrypto_priv.h - $(CC) $(CFLAGS) kex.c - -random.o: random.c LatticeCrypto_priv.h - $(CC) $(CFLAGS) random.c - -ntt_constants.o: ntt_constants.c LatticeCrypto_priv.h - $(CC) $(CFLAGS) ntt_constants.c - -ifeq "$(GENERIC)" "TRUE" - ntt.o: generic/ntt.c LatticeCrypto_priv.h - $(CC) $(CFLAGS) generic/ntt.c -else -ifeq "$(ASM)" "TRUE" - ntt_x64.o: AMD64/ntt_x64.c - $(CC) $(CFLAGS) AMD64/ntt_x64.c - ntt_x64_asm.o: AMD64/ntt_x64_asm.S - $(CC) $(CFLAGS) AMD64/ntt_x64_asm.S - error_asm.o: AMD64/error_asm.S - $(CC) $(CFLAGS) AMD64/error_asm.S - consts.o: AMD64/consts.c - $(CC) $(CFLAGS) AMD64/consts.c -endif -endif - -test_extras.o: tests/test_extras.c tests/test_extras.h LatticeCrypto_priv.h - $(CC) $(CFLAGS) tests/test_extras.c - -tests.o: tests/tests.c LatticeCrypto_priv.h - $(CC) $(CFLAGS) tests/tests.c - -.PHONY: clean - -clean: - rm -f test ntt.o ntt_x64.o ntt_x64_asm.o error_asm.o consts.o $(OBJECTS_ALL) - +#### Makefile for compilation on Linux #### + +OPT=-O3 # Optimization option by default + +ifeq "$(CC)" "gcc" + COMPILER=gcc +else ifeq "$(CC)" "clang" + 
COMPILER=clang +endif + +ifeq "$(ARCH)" "x64" + ARCHITECTURE=_AMD64_ +else ifeq "$(ARCH)" "x86" + ARCHITECTURE=_X86_ +else ifeq "$(ARCH)" "ARM" + ARCHITECTURE=_ARM_ +endif + +ADDITIONAL_SETTINGS= +ifeq "$(SET)" "EXTENDED" + ADDITIONAL_SETTINGS=-fwrapv -fomit-frame-pointer -march=native +endif + +ifeq "$(ASM)" "TRUE" + USE_ASM=-D _ASM_ +endif + +ifeq "$(GENERIC)" "TRUE" + USE_GENERIC=-D _GENERIC_ +endif + +ifeq "$(AVX2)" "TRUE" + USE_AVX2=-D _AVX2_ + SIMD=-mavx2 +endif + +ifeq "$(ARCH)" "ARM" + ARM_SETTING=-lrt +endif + +cc=$(COMPILER) +CFLAGS=-c $(OPT) $(ADDITIONAL_SETTINGS) $(SIMD) -D $(ARCHITECTURE) -D __LINUX__ $(USE_AVX2) $(USE_ASM) $(USE_GENERIC) +LDFLAGS= +ifeq "$(GENERIC)" "TRUE" + OTHER_OBJECTS=ntt.o +else +ifeq "$(ASM)" "TRUE" + OTHER_OBJECTS=ntt_x64.o consts.o + ASM_OBJECTS=ntt_x64_asm.o error_asm.o +endif +endif +OBJECTS=kex.o random.o ntt_constants.o $(ASM_OBJECTS) $(OTHER_OBJECTS) +OBJECTS_TEST=tests.o test_extras.o $(OBJECTS) +OBJECTS_ALL=$(OBJECTS) $(OBJECTS_TEST) + +test: $(OBJECTS_TEST) + $(CC) -o test $(OBJECTS_TEST) $(ARM_SETTING) + +kex.o: kex.c LatticeCrypto_priv.h + $(CC) $(CFLAGS) kex.c + +random.o: random.c LatticeCrypto_priv.h + $(CC) $(CFLAGS) random.c + +ntt_constants.o: ntt_constants.c LatticeCrypto_priv.h + $(CC) $(CFLAGS) ntt_constants.c + +ifeq "$(GENERIC)" "TRUE" + ntt.o: generic/ntt.c LatticeCrypto_priv.h + $(CC) $(CFLAGS) generic/ntt.c +else +ifeq "$(ASM)" "TRUE" + ntt_x64.o: AMD64/ntt_x64.c + $(CC) $(CFLAGS) AMD64/ntt_x64.c + ntt_x64_asm.o: AMD64/ntt_x64_asm.S + $(CC) $(CFLAGS) AMD64/ntt_x64_asm.S + error_asm.o: AMD64/error_asm.S + $(CC) $(CFLAGS) AMD64/error_asm.S + consts.o: AMD64/consts.c + $(CC) $(CFLAGS) AMD64/consts.c +endif +endif + +test_extras.o: tests/test_extras.c tests/test_extras.h LatticeCrypto_priv.h + $(CC) $(CFLAGS) tests/test_extras.c + +tests.o: tests/tests.c LatticeCrypto_priv.h + $(CC) $(CFLAGS) tests/tests.c + +.PHONY: clean + +clean: + rm -f test ntt.o ntt_x64.o ntt_x64_asm.o error_asm.o consts.o 
$(OBJECTS_ALL) + diff --git a/dap-sdk/crypto/src/msrln/msrln.h b/dap-sdk/crypto/src/msrln/msrln.h index 5b54822603..b789d0209a 100755 --- a/dap-sdk/crypto/src/msrln/msrln.h +++ b/dap-sdk/crypto/src/msrln/msrln.h @@ -1,136 +1,136 @@ -#ifndef __MSRLN_H__ -#define __MSRLN_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - -#include <stdint.h> -#include <stdbool.h> -#include <stddef.h> -#include "dap_crypto_common.h" - -// Definitions of the error-handling type and error codes - -typedef enum { - CRYPTO_MSRLN_SUCCESS, // 0x00 - CRYPTO_MSRLN_ERROR, // 0x01 - CRYPTO_MSRLN_ERROR_DURING_TEST, // 0x02 - CRYPTO_MSRLN_ERROR_UNKNOWN, // 0x03 - CRYPTO_MSRLN_ERROR_NOT_IMPLEMENTED, // 0x04 - CRYPTO_MSRLN_ERROR_NO_MEMORY, // 0x05 - CRYPTO_MSRLN_ERROR_INVALID_PARAMETER, // 0x06 - CRYPTO_MSRLN_ERROR_SHARED_KEY, // 0x07 - CRYPTO_MSRLN_ERROR_TOO_MANY_ITERATIONS, // 0x08 - CRYPTO_MSRLN_ERROR_END_OF_LIST -} CRYPTO_MSRLN_STATUS; - -#define CRYPTO_STATUS_TYPE_SIZE (CRYPTO_MSRLN_ERROR_END_OF_LIST) - - -// Definitions of the error messages -// NOTE: they must match the error codes above - -#define CRYPTO_MSG_SUCCESS "CRYPTO_SUCCESS" -#define CRYPTO_MSG_ERROR "CRYPTO_ERROR" -#define CRYPTO_MSG_ERROR_DURING_TEST "CRYPTO_ERROR_DURING_TEST" -#define CRYPTO_MSG_ERROR_UNKNOWN "CRYPTO_ERROR_UNKNOWN" -#define CRYPTO_MSG_ERROR_NOT_IMPLEMENTED "CRYPTO_ERROR_NOT_IMPLEMENTED" -#define CRYPTO_MSG_ERROR_NO_MEMORY "CRYPTO_ERROR_NO_MEMORY" -#define CRYPTO_MSG_ERROR_INVALID_PARAMETER "CRYPTO_ERROR_INVALID_PARAMETER" -#define CRYPTO_MSG_ERROR_SHARED_KEY "CRYPTO_ERROR_SHARED_KEY" -#define CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS "CRYPTO_ERROR_TOO_MANY_ITERATIONS" - - -// Definition of type "RandomBytes" to implement callback function outputting "nbytes" of random values to "random_array" -typedef CRYPTO_MSRLN_STATUS (*RandomBytes)(unsigned char* random_array, unsigned int nbytes); - -// Definition of type "ExtendableOutput" to implement callback function outputting 32-bit "array_ndigits" of values to 
"extended_array" -typedef CRYPTO_MSRLN_STATUS (*ExtendableOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array); - -// Definition of type "StreamOutput" to implement callback function outputting 32-bit "array_ndigits" of values to "stream_array" -typedef CRYPTO_MSRLN_STATUS (*StreamOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array); - - -// Basic key-exchange constants -#define MSRLN_PKA_BYTES 1824 // Alice's public key size -#define MSRLN_PKB_BYTES 2048 // Bob's public key size -#define MSRLN_SHAREDKEY_BYTES 32 // Shared key size - - -// This data struct is initialized during setup with user-provided functions -typedef struct -{ - RandomBytes RandomBytesFunction; // Function providing random bytes - ExtendableOutput ExtendableOutputFunction; // Extendable output function - StreamOutput StreamOutputFunction; // Stream cipher function -} LatticeCryptoStruct, *PLatticeCryptoStruct; - - -/******************** Function prototypes *******************/ -/*********************** Auxiliary API **********************/ - -// Clear digits from memory. "nwords" indicates the number of digits to be zeroed. -extern void clear_words(void* mem, digit_t nwords); -CRYPTO_MSRLN_STATUS MSRLN_get_error(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array); -CRYPTO_MSRLN_STATUS MSRLN_generate_a(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* a); - -// Output "nbytes" of random values. -// It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array". -// The caller is responsible for providing the "RandomBytesFunction" function passing random value as octets. 
-CRYPTO_MSRLN_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction); - -// Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes". -// It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array". -// The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits. -CRYPTO_MSRLN_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction); - -// Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes". -// It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array". -// The caller is responsible for providing the "StreamOutputFunction" function passing values as octets. -CRYPTO_MSRLN_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction); - -// Dynamic allocation of memory for LatticeCrypto structure. It should be called before initialization with LatticeCrypto_initialize(). Returns NULL on error. -PLatticeCryptoStruct LatticeCrypto_allocate(void); - -// Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction. 
-CRYPTO_MSRLN_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction); - -// Output error/success message for a given CRYPTO_STATUS -const char* LatticeCrypto_get_error_message(CRYPTO_MSRLN_STATUS Status); - -/*********************** Key exchange API ***********************/ - -// Alice's key generation -// It produces a private key SecretKeyA and computes the public key PublicKeyA. -// Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) -// the public key PublicKeyA that occupies 1824 bytes -// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). -CRYPTO_MSRLN_STATUS MSRLN_KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto); - -// Bob's key generation and shared secret computation -// It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes -// the shared secret SharedSecretB. -// Input: Alice's public key PublicKeyA that consists of 1824 bytes -// Outputs: the public key PublicKeyB that occupies 2048 bytes. -// the 256-bit shared secret SharedSecretB. -// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). -CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto); - -// Alice's shared secret computation -// It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA. -// Inputs: Bob's public key PublicKeyB that consists of 2048 bytes -// the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) -// Output: the 256-bit shared secret SharedSecretA. 
-// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). -CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA); - - -#ifdef __cplusplus -} -#endif - - -#endif +#ifndef __MSRLN_H__ +#define __MSRLN_H__ + + +// For C++ +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include <stdbool.h> +#include <stddef.h> +#include "dap_crypto_common.h" + +// Definitions of the error-handling type and error codes + +typedef enum { + CRYPTO_MSRLN_SUCCESS, // 0x00 + CRYPTO_MSRLN_ERROR, // 0x01 + CRYPTO_MSRLN_ERROR_DURING_TEST, // 0x02 + CRYPTO_MSRLN_ERROR_UNKNOWN, // 0x03 + CRYPTO_MSRLN_ERROR_NOT_IMPLEMENTED, // 0x04 + CRYPTO_MSRLN_ERROR_NO_MEMORY, // 0x05 + CRYPTO_MSRLN_ERROR_INVALID_PARAMETER, // 0x06 + CRYPTO_MSRLN_ERROR_SHARED_KEY, // 0x07 + CRYPTO_MSRLN_ERROR_TOO_MANY_ITERATIONS, // 0x08 + CRYPTO_MSRLN_ERROR_END_OF_LIST +} CRYPTO_MSRLN_STATUS; + +#define CRYPTO_STATUS_TYPE_SIZE (CRYPTO_MSRLN_ERROR_END_OF_LIST) + + +// Definitions of the error messages +// NOTE: they must match the error codes above + +#define CRYPTO_MSG_SUCCESS "CRYPTO_SUCCESS" +#define CRYPTO_MSG_ERROR "CRYPTO_ERROR" +#define CRYPTO_MSG_ERROR_DURING_TEST "CRYPTO_ERROR_DURING_TEST" +#define CRYPTO_MSG_ERROR_UNKNOWN "CRYPTO_ERROR_UNKNOWN" +#define CRYPTO_MSG_ERROR_NOT_IMPLEMENTED "CRYPTO_ERROR_NOT_IMPLEMENTED" +#define CRYPTO_MSG_ERROR_NO_MEMORY "CRYPTO_ERROR_NO_MEMORY" +#define CRYPTO_MSG_ERROR_INVALID_PARAMETER "CRYPTO_ERROR_INVALID_PARAMETER" +#define CRYPTO_MSG_ERROR_SHARED_KEY "CRYPTO_ERROR_SHARED_KEY" +#define CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS "CRYPTO_ERROR_TOO_MANY_ITERATIONS" + + +// Definition of type "RandomBytes" to implement callback function outputting "nbytes" of random values to "random_array" +typedef CRYPTO_MSRLN_STATUS (*RandomBytes)(unsigned char* random_array, unsigned int nbytes); + +// Definition of type "ExtendableOutput" to implement callback function outputting 32-bit 
"array_ndigits" of values to "extended_array" +typedef CRYPTO_MSRLN_STATUS (*ExtendableOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array); + +// Definition of type "StreamOutput" to implement callback function outputting 32-bit "array_ndigits" of values to "stream_array" +typedef CRYPTO_MSRLN_STATUS (*StreamOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array); + + +// Basic key-exchange constants +#define MSRLN_PKA_BYTES 1824 // Alice's public key size +#define MSRLN_PKB_BYTES 2048 // Bob's public key size +#define MSRLN_SHAREDKEY_BYTES 32 // Shared key size + + +// This data struct is initialized during setup with user-provided functions +typedef struct +{ + RandomBytes RandomBytesFunction; // Function providing random bytes + ExtendableOutput ExtendableOutputFunction; // Extendable output function + StreamOutput StreamOutputFunction; // Stream cipher function +} LatticeCryptoStruct, *PLatticeCryptoStruct; + + +/******************** Function prototypes *******************/ +/*********************** Auxiliary API **********************/ + +// Clear digits from memory. "nwords" indicates the number of digits to be zeroed. +extern void clear_words(void* mem, digit_t nwords); +CRYPTO_MSRLN_STATUS MSRLN_get_error(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array); +CRYPTO_MSRLN_STATUS MSRLN_generate_a(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* a); + +// Output "nbytes" of random values. +// It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array". +// The caller is responsible for providing the "RandomBytesFunction" function passing random value as octets. 
+CRYPTO_MSRLN_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction); + +// Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes". +// It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array". +// The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits. +CRYPTO_MSRLN_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction); + +// Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes". +// It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array". +// The caller is responsible for providing the "StreamOutputFunction" function passing values as octets. +CRYPTO_MSRLN_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction); + +// Dynamic allocation of memory for LatticeCrypto structure. It should be called before initialization with LatticeCrypto_initialize(). Returns NULL on error. +PLatticeCryptoStruct LatticeCrypto_allocate(void); + +// Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction. 
+CRYPTO_MSRLN_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction); + +// Output error/success message for a given CRYPTO_STATUS +const char* LatticeCrypto_get_error_message(CRYPTO_MSRLN_STATUS Status); + +/*********************** Key exchange API ***********************/ + +// Alice's key generation +// It produces a private key SecretKeyA and computes the public key PublicKeyA. +// Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) +// the public key PublicKeyA that occupies 1824 bytes +// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). +CRYPTO_MSRLN_STATUS MSRLN_KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto); + +// Bob's key generation and shared secret computation +// It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes +// the shared secret SharedSecretB. +// Input: Alice's public key PublicKeyA that consists of 1824 bytes +// Outputs: the public key PublicKeyB that occupies 2048 bytes. +// the 256-bit shared secret SharedSecretB. +// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). +CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto); + +// Alice's shared secret computation +// It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA. +// Inputs: Bob's public key PublicKeyB that consists of 2048 bytes +// the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) +// Output: the 256-bit shared secret SharedSecretA. 
+// pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). +CRYPTO_MSRLN_STATUS MSRLN_SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA); + + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/dap-sdk/crypto/src/msrln/msrln.pri b/dap-sdk/crypto/src/msrln/msrln.pri index cd4600ef3d..f42be38c96 100755 --- a/dap-sdk/crypto/src/msrln/msrln.pri +++ b/dap-sdk/crypto/src/msrln/msrln.pri @@ -1,6 +1,6 @@ -INCLUDEPATH += $$PWD - -HEADERS += $$PWD/msrln.h \ - -SOURCES += $$PWD/kex.c \ - $$PWD/random.c \ +INCLUDEPATH += $$PWD + +HEADERS += $$PWD/msrln.h \ + +SOURCES += $$PWD/kex.c \ + $$PWD/random.c \ diff --git a/dap-sdk/crypto/src/msrln/msrln_priv.h b/dap-sdk/crypto/src/msrln/msrln_priv.h index fdaae50ad3..cc1f198010 100755 --- a/dap-sdk/crypto/src/msrln/msrln_priv.h +++ b/dap-sdk/crypto/src/msrln/msrln_priv.h @@ -1,114 +1,114 @@ -#ifndef __MSRLN_priv_H__ -#define __MSRLN_priv_H__ - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - -#include "msrln.h" - -// Basic constants -#define PARAMETER_N 1024 -#define PARAMETER_Q 12289 -#define SEED_BYTES 256/8 -#define ERROR_SEED_BYTES 256/8 -#define NONCE_SEED_BYTES 256/8 -#define PARAMETER_Q4 3073 -#define PARAMETER_3Q4 9217 -#define PARAMETER_5Q4 15362 -#define PARAMETER_7Q4 21506 -#define PARAMETER_Q2 6145 -#define PARAMETER_3Q2 18434 - - -// Macro definitions - -#define NBITS_TO_NWORDS(nbits) (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8)) // Conversion macro from number of bits to number of computer words -#define NBYTES_TO_NWORDS(nbytes) (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t)) // Conversion macro from number of bytes to number of computer words - -// Macro to avoid compiler warnings when detecting unreferenced parameters -#ifndef UNREFERENCED_PARAMETER -#define UNREFERENCED_PARAMETER(PAR) ((void)PAR) -#endif - - -/******************** Function prototypes *******************/ -/******************* Polynomial functions *******************/ - 
-// Forward NTT -void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N); -void NTT_CT_std2rev_12289_asm(int32_t* a, const int32_t* psi_rev, unsigned int N); - -// Inverse NTT -void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N); -void INTT_GS_rev2std_12289_asm(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N); - -// Reduction modulo q -int32_t reduce12289(int64_t a); - -// Two merged reductions modulo q -int32_t reduce12289_2x(int64_t a); - -// Two consecutive reductions modulo q -void two_reduce12289(int32_t* a, unsigned int N); -void two_reduce12289_asm(int32_t* a, unsigned int N); - -// Correction modulo q -void correction(int32_t* a, int32_t p, unsigned int N); - -// Component-wise multiplication -void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N); -void pmul_asm(int32_t* a, int32_t* b, int32_t* c, unsigned int N); - -// Component-wise multiplication and addition -void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N); -void pmuladd_asm(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N); - -// Component-wise multiplication with scalar -void smul(int32_t* a, int32_t scalar, unsigned int N); - -/******************* Key exchange functions *******************/ - -// Alice's message encoding -void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m); - -// Alice's message decoding -void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed); - -// Bob's message encoding -void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m); - -// Bob's message decoding -void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec); - -// Partial message encoding/decoding (assembly optimized) -void encode_asm(const uint32_t* pk, unsigned char* m); -void decode_asm(const unsigned char* m, uint32_t *pk); - -// Reconciliation 
helper -CRYPTO_MSRLN_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction); - -// Partial reconciliation helper (assembly optimized) -void helprec_asm(const uint32_t* x, uint32_t* rvec, unsigned char* random_bits); - -// Reconciliation -void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key); -void rec_asm(const uint32_t *x, const uint32_t* rvec, unsigned char *key); - -// Error sampling -CRYPTO_MSRLN_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction); - -// Partial error sampling (assembly optimized) -void error_sampling_asm(unsigned char* stream, int32_t* e); - -// Generation of parameter a -CRYPTO_MSRLN_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction); - - -#ifdef __cplusplus -} -#endif - - -#endif +#ifndef __MSRLN_priv_H__ +#define __MSRLN_priv_H__ + +// For C++ +#ifdef __cplusplus +extern "C" { +#endif + +#include "msrln.h" + +// Basic constants +#define PARAMETER_N 1024 +#define PARAMETER_Q 12289 +#define SEED_BYTES 256/8 +#define ERROR_SEED_BYTES 256/8 +#define NONCE_SEED_BYTES 256/8 +#define PARAMETER_Q4 3073 +#define PARAMETER_3Q4 9217 +#define PARAMETER_5Q4 15362 +#define PARAMETER_7Q4 21506 +#define PARAMETER_Q2 6145 +#define PARAMETER_3Q2 18434 + + +// Macro definitions + +#define NBITS_TO_NWORDS(nbits) (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8)) // Conversion macro from number of bits to number of computer words +#define NBYTES_TO_NWORDS(nbytes) (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t)) // Conversion macro from number of bytes to number of computer words + +// Macro to avoid compiler warnings when detecting unreferenced parameters +#ifndef UNREFERENCED_PARAMETER +#define UNREFERENCED_PARAMETER(PAR) ((void)PAR) +#endif + + +/******************** Function prototypes *******************/ +/******************* Polynomial functions 
*******************/ + +// Forward NTT +void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N); +void NTT_CT_std2rev_12289_asm(int32_t* a, const int32_t* psi_rev, unsigned int N); + +// Inverse NTT +void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N); +void INTT_GS_rev2std_12289_asm(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N); + +// Reduction modulo q +int32_t reduce12289(int64_t a); + +// Two merged reductions modulo q +int32_t reduce12289_2x(int64_t a); + +// Two consecutive reductions modulo q +void two_reduce12289(int32_t* a, unsigned int N); +void two_reduce12289_asm(int32_t* a, unsigned int N); + +// Correction modulo q +void correction(int32_t* a, int32_t p, unsigned int N); + +// Component-wise multiplication +void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N); +void pmul_asm(int32_t* a, int32_t* b, int32_t* c, unsigned int N); + +// Component-wise multiplication and addition +void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N); +void pmuladd_asm(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N); + +// Component-wise multiplication with scalar +void smul(int32_t* a, int32_t scalar, unsigned int N); + +/******************* Key exchange functions *******************/ + +// Alice's message encoding +void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m); + +// Alice's message decoding +void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed); + +// Bob's message encoding +void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m); + +// Bob's message decoding +void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec); + +// Partial message encoding/decoding (assembly optimized) +void encode_asm(const uint32_t* pk, unsigned char* m); +void decode_asm(const unsigned char* m, uint32_t *pk); 
+ +// Reconciliation helper +CRYPTO_MSRLN_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction); + +// Partial reconciliation helper (assembly optimized) +void helprec_asm(const uint32_t* x, uint32_t* rvec, unsigned char* random_bits); + +// Reconciliation +void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key); +void rec_asm(const uint32_t *x, const uint32_t* rvec, unsigned char *key); + +// Error sampling +CRYPTO_MSRLN_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction); + +// Partial error sampling (assembly optimized) +void error_sampling_asm(unsigned char* stream, int32_t* e); + +// Generation of parameter a +CRYPTO_MSRLN_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction); + + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/dap-sdk/crypto/src/msrln/random.c b/dap-sdk/crypto/src/msrln/random.c index ab2b129f84..eaea6a1170 100755 --- a/dap-sdk/crypto/src/msrln/random.c +++ b/dap-sdk/crypto/src/msrln/random.c @@ -1,90 +1,90 @@ -#include "msrln_priv.h" - -//#include "KeccakHash.h" -//#include "SimpleFIPS202.h" - -#define LOG_TAG "RANDOM" - -CRYPTO_MSRLN_STATUS MSRLN_generate_a(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* a) -{ - // Generation of parameter a - unsigned int pos = 0, ctr = 0; - uint16_t val; - unsigned int nblocks = 16; - uint8_t buf[SHAKE128_RATE * 16]; // was * nblocks, but VS doesn't like this buf init - //Keccak_HashInstance ks; - - uint64_t state[SHA3_STATESIZE]; - shake128_absorb(state, seed, seed_nbytes); - shake128_squeezeblocks((unsigned char *) buf, nblocks, state); - - /*Keccak_HashInitialize_SHAKE128(&ks); - Keccak_HashUpdate( &ks, seed, seed_nbytes * 8 ); - Keccak_HashFinal( &ks, seed ); - Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );*/ - - while (ctr < array_ndigits) { - 
val = (buf[pos] | ((uint16_t) buf[pos + 1] << 8)) & 0x3fff; - if (val < PARAMETER_Q) { - a[ctr++] = val; - } - pos += 2; - if (pos > SHAKE128_RATE * nblocks - 2) { - nblocks = 1; - shake128_squeezeblocks((unsigned char *) buf, nblocks, state); -// Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 ); - pos = 0; - } - } - return CRYPTO_MSRLN_SUCCESS; -} - -CRYPTO_MSRLN_STATUS MSRLN_get_error(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array) -{ - UNREFERENCED_PARAMETER(seed); - UNREFERENCED_PARAMETER(seed_nbytes); - UNREFERENCED_PARAMETER(nonce); - UNREFERENCED_PARAMETER(nonce_nbytes); - - randombytes( stream_array, array_nbytes); - - return CRYPTO_MSRLN_SUCCESS; -} - -CRYPTO_MSRLN_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction) -{ // Output "nbytes" of random values. - // It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array". - // The caller is responsible for providing the "RandomBytesFunction" function passing random values as octets. - - if (random_array == NULL || RandomBytesFunction == NULL || nbytes == 0) { - return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER; - } - - return (RandomBytesFunction)(random_array, nbytes); -} - - -CRYPTO_MSRLN_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction) -{ // Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes". - // It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array". - // The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits. 
- - if (seed == NULL || extended_array == NULL || ExtendableOutputFunction == NULL || seed_nbytes == 0 || array_ndigits == 0) { - return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER; - } - - return (ExtendableOutputFunction)(seed, seed_nbytes, array_ndigits, extended_array); -} - - -CRYPTO_MSRLN_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction) -{ // Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes". - // It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array". - // The caller is responsible for providing the "StreamOutputFunction" function passing values as octets. - - if (seed == NULL || stream_array == NULL || StreamOutputFunction == NULL || seed_nbytes == 0 || nonce_nbytes == 0 || array_nbytes == 0) { - return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER; - } - - return (StreamOutputFunction)(seed, seed_nbytes, nonce, nonce_nbytes, array_nbytes, stream_array); -} +#include "msrln_priv.h" + +//#include "KeccakHash.h" +//#include "SimpleFIPS202.h" + +#define LOG_TAG "RANDOM" + +CRYPTO_MSRLN_STATUS MSRLN_generate_a(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* a) +{ + // Generation of parameter a + unsigned int pos = 0, ctr = 0; + uint16_t val; + unsigned int nblocks = 16; + uint8_t buf[SHAKE128_RATE * 16]; // was * nblocks, but VS doesn't like this buf init + //Keccak_HashInstance ks; + + uint64_t state[SHA3_STATESIZE]; + shake128_absorb(state, seed, seed_nbytes); + shake128_squeezeblocks((unsigned char *) buf, nblocks, state); + + /*Keccak_HashInitialize_SHAKE128(&ks); + Keccak_HashUpdate( &ks, seed, seed_nbytes * 8 ); + Keccak_HashFinal( &ks, seed ); + Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 );*/ + + while (ctr < array_ndigits) { + 
val = (buf[pos] | ((uint16_t) buf[pos + 1] << 8)) & 0x3fff; + if (val < PARAMETER_Q) { + a[ctr++] = val; + } + pos += 2; + if (pos > SHAKE128_RATE * nblocks - 2) { + nblocks = 1; + shake128_squeezeblocks((unsigned char *) buf, nblocks, state); +// Keccak_HashSqueeze( &ks, (unsigned char *) buf, nblocks * 8 * 8 ); + pos = 0; + } + } + return CRYPTO_MSRLN_SUCCESS; +} + +CRYPTO_MSRLN_STATUS MSRLN_get_error(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array) +{ + UNREFERENCED_PARAMETER(seed); + UNREFERENCED_PARAMETER(seed_nbytes); + UNREFERENCED_PARAMETER(nonce); + UNREFERENCED_PARAMETER(nonce_nbytes); + + randombytes( stream_array, array_nbytes); + + return CRYPTO_MSRLN_SUCCESS; +} + +CRYPTO_MSRLN_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction) +{ // Output "nbytes" of random values. + // It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array". + // The caller is responsible for providing the "RandomBytesFunction" function passing random values as octets. + + if (random_array == NULL || RandomBytesFunction == NULL || nbytes == 0) { + return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER; + } + + return (RandomBytesFunction)(random_array, nbytes); +} + + +CRYPTO_MSRLN_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction) +{ // Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes". + // It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array". + // The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits. 
+ + if (seed == NULL || extended_array == NULL || ExtendableOutputFunction == NULL || seed_nbytes == 0 || array_ndigits == 0) { + return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER; + } + + return (ExtendableOutputFunction)(seed, seed_nbytes, array_ndigits, extended_array); +} + + +CRYPTO_MSRLN_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction) +{ // Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes". + // It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array". + // The caller is responsible for providing the "StreamOutputFunction" function passing values as octets. + + if (seed == NULL || stream_array == NULL || StreamOutputFunction == NULL || seed_nbytes == 0 || nonce_nbytes == 0 || array_nbytes == 0) { + return CRYPTO_MSRLN_ERROR_INVALID_PARAMETER; + } + + return (StreamOutputFunction)(seed, seed_nbytes, nonce, nonce_nbytes, array_nbytes, stream_array); +} diff --git a/dap-sdk/crypto/src/sig_picnic/picnic_impl.c b/dap-sdk/crypto/src/sig_picnic/picnic_impl.c index 2a0e25ffe8..4e9b1a329b 100755 --- a/dap-sdk/crypto/src/sig_picnic/picnic_impl.c +++ b/dap-sdk/crypto/src/sig_picnic/picnic_impl.c @@ -1,998 +1,998 @@ -/*! @file picnic_impl.c - * @brief This is the main file of the signature scheme. All of the LowMC MPC - * code is here as well as lower-level versions of sign and verify that are - * called by the signature API. - * - * This file is part of the reference implementation of the Picnic signature scheme. - * See the accompanying documentation for complete details. - * - * The code is provided under the MIT license, see LICENSE for - * more details. 
- * SPDX-License-Identifier: MIT - */ - -#include <stdbool.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <assert.h> -#if defined (__WIN32) - #include <windows.h> - #include <bcrypt.h> -#elif defined (__APPLE__) - #include "macos_specific_endian.h" -#else - #include <endian.h> -#endif - -#include "picnic_impl.h" -#include "picnic.h" -#include "platform.h" -#include "lowmc_constants.h" -#include "hash.h" -#include "picnic_types.h" -#include "dap_common.h" - - -#define VIEW_OUTPUTS(i, j) viewOutputs[(i) * 3 + (j)] - - -/* Helper functions */ -uint16_t toLittleEndian(uint16_t x) -{ -#if defined(__WIN32) - #if BYTE_ORDER == LITTLE_ENDIAN - return x; - #else - return __builtin_bswap16(x); - #endif -#else - return htole16(x); -#endif -} - -/* Get one bit from a byte array */ -uint8_t getBit(const uint8_t* array, uint32_t bitNumber) -{ - return (array[bitNumber / 8] >> (7 - (bitNumber % 8))) & 0x01; -} - -/* Get one bit from a 32-bit int array */ -uint8_t getBitFromWordArray(const uint32_t* array, uint32_t bitNumber) -{ - return getBit((uint8_t*)array, bitNumber); -} - -/* Set a specific bit in a byte array to a given value */ -void setBit(uint8_t* bytes, uint32_t bitNumber, uint8_t val) -{ - bytes[bitNumber / 8] = (bytes[bitNumber >> 3] - & ~(1 << (7 - (bitNumber % 8)))) | (val << (7 - (bitNumber % 8))); -} - -/* Set a specific bit in a byte array to a given value */ -void setBitInWordArray(uint32_t* array, uint32_t bitNumber, uint8_t val) -{ - setBit((uint8_t*)array, bitNumber, val); -} - -static uint8_t parity(uint32_t* data, size_t len) -{ - uint32_t x = data[0]; - size_t i; - for (i = 1; i < len; i++) { - x ^= data[i]; - } - - /* Compute parity of x using code from Section 5-2 of - * H.S. Warren, *Hacker's Delight*, Pearson Education, 2003. 
- * http://www.hackersdelight.org/hdcodetxt/parity.c.txt - */ - uint32_t y = x ^ (x >> 1); - y ^= (y >> 2); - y ^= (y >> 4); - y ^= (y >> 8); - y ^= (y >> 16); - return y & 1; -} - -uint32_t numBytes(uint32_t numBits) -{ - return (numBits == 0) ? 0 : ((numBits - 1) / 8 + 1); -} - -static void xor_array(const uint32_t * in1, const uint32_t * in2, uint32_t * out, uint32_t numBytes) -{ - uint32_t i; - for (i = 0; i < numBytes; i++) { - out[i] = in1[i] ^ in2[i]; - } -} - -static void matrix_mul( - uint32_t* state, - const uint32_t* matrix, - uint32_t* output, - paramset_t* params) -{ - // Use temp to correctly handle the case when state = output - uint32_t prod[LOWMC_MAX_STATE_SIZE]; - uint32_t temp[LOWMC_MAX_STATE_SIZE]; - - uint32_t i, j; - for (i = 0; i < params->stateSizeBits; i++) { - for (j = 0; j < params->stateSizeWords; j++) { - size_t index = i * params->stateSizeWords + j; - prod[j] = (state[j] & matrix[index]); - } - setBit((uint8_t*)temp, i, parity(&prod[0], params->stateSizeWords)); - - } - memcpy(output, &temp, params->stateSizeWords * sizeof(uint32_t)); -} - -static void substitution(uint32_t* state, paramset_t* params) -{ - uint32_t i; - for (i = 0; i < params->numSboxes * 3; i += 3) { - uint8_t a = getBitFromWordArray(state, i + 2); - uint8_t b = getBitFromWordArray(state, i + 1); - uint8_t c = getBitFromWordArray(state, i); - - setBitInWordArray(state, i + 2, a ^ (b & c)); - setBitInWordArray(state, i + 1, a ^ b ^ (a & c)); - setBitInWordArray(state, i, a ^ b ^ c ^ (a & b)); - } -} - -void LowMCEnc(const uint32_t* plaintext, uint32_t* output, uint32_t* key, paramset_t* params) -{ - uint32_t roundKey[LOWMC_MAX_STATE_SIZE / sizeof(uint32_t)]; - - if (plaintext != output) { - /* output will hold the intermediate state */ - memcpy(output, plaintext, params->stateSizeBytes); - } - - matrix_mul(key, KMatrix(0, params), roundKey, params); - xor_array(output, roundKey, output, params->stateSizeWords); - - uint32_t r; - for (r = 1; r <= params->numRounds; 
r++) { - matrix_mul(key, KMatrix(r, params), roundKey, params); - substitution(output, params); - matrix_mul(output, LMatrix(r - 1, params), output, params); - xor_array(output, RConstant(r - 1, params), output, params->stateSizeWords); - xor_array(output, roundKey, output, params->stateSizeWords); - } - -} - -bool createRandomTape(const uint8_t* seed, uint8_t* tape, - uint32_t tapeLengthBytes, paramset_t* params) -{ - HashInstance ctx; - - if (tapeLengthBytes < params->digestSizeBytes) { - return false; - } - - /* Hash the seed and a constant, store the result in tape. */ - HashInit(&ctx, params, HASH_PREFIX_2); - HashUpdate(&ctx, seed, params->seedSizeBytes); - HashFinal(&ctx); - HashSqueeze(&ctx, tape, params->digestSizeBytes); - - /* Expand the hashed seed and output length to create the tape. */ - HashInit(&ctx, params, HASH_PREFIX_NONE); - HashUpdate(&ctx, tape, params->digestSizeBytes); - uint16_t outputBytesLE = toLittleEndian(tapeLengthBytes); - HashUpdate(&ctx, (uint8_t*)&outputBytesLE, sizeof(uint16_t)); - HashFinal(&ctx); - HashSqueeze(&ctx, tape, tapeLengthBytes); - - return true; -} - -void mpc_xor(uint32_t* state[3], uint32_t* in[3], uint32_t len, int players) -{ - uint8_t i; - for (i = 0; i < players; i++) { - xor_array(state[i], in[i], state[i], len); - } -} - -/* Compute the XOR of in with the first state vectors. 
*/ -void mpc_xor_constant(uint32_t* state[3], const uint32_t* in, uint32_t len) -{ - xor_array(state[0], in, state[0], len); -} - -void mpc_xor_constant_verify(uint32_t* state[2], const uint32_t* in, uint32_t len, uint8_t challenge) -{ - /* During verify, where the first share is stored in state depends on the challenge */ - if (challenge == 0) { - xor_array(state[0], in, state[0], len); - } - else if (challenge == 2) { - xor_array(state[1], in, state[1], len); - } -} - - -void Commit(const uint8_t* seed, const view_t view, - uint8_t* hash, paramset_t* params) -{ - HashInstance ctx; - - /* Hash the seed, store result in `hash` */ - HashInit(&ctx, params, HASH_PREFIX_4); - HashUpdate(&ctx, seed, params->seedSizeBytes); - HashFinal(&ctx); - HashSqueeze(&ctx, hash, params->digestSizeBytes); - - /* Compute H_0(H_4(seed), view) */ - HashInit(&ctx, params, HASH_PREFIX_0); - HashUpdate(&ctx, hash, params->digestSizeBytes); - HashUpdate(&ctx, (uint8_t*)view.inputShare, params->stateSizeBytes); - HashUpdate(&ctx, (uint8_t*)view.communicatedBits, params->andSizeBytes); - HashUpdate(&ctx, (uint8_t*)view.outputShare, params->stateSizeBytes); - HashFinal(&ctx); - HashSqueeze(&ctx, hash, params->digestSizeBytes); -} - -/* This is the random "permuatation" function G for Unruh's transform */ -void G(uint8_t viewNumber, const uint8_t* seed, view_t* view, uint8_t* output, paramset_t* params) -{ - HashInstance ctx; - uint16_t outputBytes = params->seedSizeBytes + params->andSizeBytes; - - /* Hash the seed with H_5, store digest in output */ - HashInit(&ctx, params, HASH_PREFIX_5); - HashUpdate(&ctx, seed, params->seedSizeBytes); - HashFinal(&ctx); - HashSqueeze(&ctx, output, params->digestSizeBytes); - - /* Hash H_5(seed), the view, and the length */ - HashInit(&ctx, params, HASH_PREFIX_NONE); - HashUpdate(&ctx, output, params->digestSizeBytes); - if (viewNumber == 2) { - HashUpdate(&ctx, (uint8_t*)view->inputShare, params->stateSizeBytes); - outputBytes += 
(uint16_t)params->stateSizeBytes; - } - HashUpdate(&ctx, view->communicatedBits, params->andSizeBytes); - - uint16_t outputBytesLE = toLittleEndian(outputBytes); - HashUpdate(&ctx, (uint8_t*)&outputBytesLE, sizeof(uint16_t)); - HashFinal(&ctx); - HashSqueeze(&ctx, output, outputBytes); -} - -void setChallenge(uint8_t* challenge, size_t round, uint8_t trit) -{ - /* challenge must have length numBytes(numZKBRounds*2) - * 0 <= index < numZKBRounds - * trit must be in {0,1,2} */ - uint32_t roundU32 = (uint32_t)round; - - setBit(challenge, 2 * roundU32, trit & 1); - setBit(challenge, 2 * roundU32 + 1, (trit >> 1) & 1); -} - -uint8_t getChallenge(const uint8_t* challenge, size_t round) -{ - uint32_t roundU32 = (uint32_t)round; - - return (getBit(challenge, 2 * roundU32 + 1) << 1) | getBit(challenge, 2 * roundU32); -} - -void H3(const uint32_t* circuitOutput, const uint32_t* plaintext, uint32_t** viewOutputs, - commitments_t* as, - uint8_t* challengeBits, const uint8_t* message, size_t messageByteLength, - g_commitments_t* gs, paramset_t* params) -{ - uint8_t* hash = malloc(params->digestSizeBytes); - - HashInstance ctx; - - /* Depending on the number of rounds, we might not set part of the last - * byte, make sure it's always zero. */ - challengeBits[numBytes(params->numZKBRounds * 2) - 1] = 0; - - /* Hash input data */ - HashInit(&ctx, params, HASH_PREFIX_1); - - /* Hash the output share from each view */ - uint32_t i; - int j; - for (i = 0; i < params->numZKBRounds; i++) { - for (j = 0; j < 3; j++) { - HashUpdate(&ctx, (uint8_t*)VIEW_OUTPUTS(i, j), params->stateSizeBytes); - } - } - - /* Hash all the commitments C */ - for (i = 0; i < params->numZKBRounds; i++) { - for (j = 0; j < 3; j++) { - HashUpdate(&ctx, as[i].hashes[j], params->digestSizeBytes); - } - } - - /* Hash all the commitments G */ - if (params->transform == TRANSFORM_UR) { - for (i = 0; i < params->numZKBRounds; i++) { - for (j = 0; j < 3; j++) { - size_t view3UnruhLength = (j == 2) ? 
params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes; - HashUpdate(&ctx, gs[i].G[j], view3UnruhLength); - } - } - } - - HashUpdate(&ctx, (uint8_t*)circuitOutput, params->stateSizeBytes); - HashUpdate(&ctx, (uint8_t*)plaintext, params->stateSizeBytes); - HashUpdate(&ctx, message, messageByteLength); - - HashFinal(&ctx); - HashSqueeze(&ctx, hash, params->digestSizeBytes); - - /* Convert hash to a packed string of values in {0,1,2} */ - size_t byte_count, round = 0; - while (1) { - for (byte_count = 0; byte_count < params->digestSizeBytes; byte_count++) { - uint8_t byte = hash[byte_count]; - /* iterate over each pair of bits in the byte */ - for (j = 0; j < 8; j += 2) { - uint8_t bitPair = ((byte >> (6 - j)) & 0x03); - if (bitPair < 3) { - setChallenge(challengeBits, round, bitPair); - round++; - if (round == params->numZKBRounds) { - goto done; - } - } - } - } - - /* We need more bits; hash set hash = H_1(hash) */ - HashInit(&ctx, params, HASH_PREFIX_1); - HashUpdate(&ctx, hash, params->digestSizeBytes); - HashFinal(&ctx); - HashSqueeze(&ctx, hash, params->digestSizeBytes); - } - -done: - - free(hash); - return; -} - -/* Caller must allocate the first parameter */ -void prove(proof_t* proof, uint8_t challenge, seeds_t* seeds, - view_t views[3], commitments_t* commitments, g_commitments_t* gs, paramset_t* params) -{ - if (challenge == 0) { - memcpy(proof->seed1, seeds->seed0, params->seedSizeBytes); - memcpy(proof->seed2, seeds->seed1, params->seedSizeBytes); - } - else if (challenge == 1) { - memcpy(proof->seed1, seeds->seed1, params->seedSizeBytes); - memcpy(proof->seed2, seeds->seed2, params->seedSizeBytes); - } - else if (challenge == 2) { - memcpy(proof->seed1, seeds->seed2, params->seedSizeBytes); - memcpy(proof->seed2, seeds->seed0, params->seedSizeBytes); - } - else { - assert(!"Invalid challenge"); - } - - if (challenge == 1 || challenge == 2) { - memcpy(proof->inputShare, views[2].inputShare, params->stateSizeBytes); - } - 
memcpy(proof->communicatedBits, views[(challenge + 1) % 3].communicatedBits, params->andSizeBytes); - - memcpy(proof->view3Commitment, commitments->hashes[(challenge + 2) % 3], params->digestSizeBytes); - if (params->transform == TRANSFORM_UR) { - size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes; - memcpy(proof->view3UnruhG, gs->G[(challenge + 2) % 3], view3UnruhLength); - } -} - -void mpc_AND_verify(uint8_t in1[2], uint8_t in2[2], uint8_t out[2], - randomTape_t* rand, view_t* view1, view_t* view2) -{ - uint8_t r[2] = { getBit(rand->tape[0], rand->pos), getBit(rand->tape[1], rand->pos) }; - - out[0] = (in1[0] & in2[1]) ^ (in1[1] & in2[0]) ^ (in1[0] & in2[0]) ^ r[0] ^ r[1]; - setBit(view1->communicatedBits, rand->pos, out[0]); - out[1] = getBit(view2->communicatedBits, rand->pos); - - (rand->pos)++; -} - -void mpc_substitution_verify(uint32_t* state[2], randomTape_t* rand, view_t* view1, - view_t* view2, paramset_t* params) -{ - uint32_t i; - for (i = 0; i < params->numSboxes * 3; i += 3) { - - uint8_t a[2]; - uint8_t b[2]; - uint8_t c[2]; - - uint8_t j; - for (j = 0; j < 2; j++) { - a[j] = getBitFromWordArray(state[j], i + 2); - b[j] = getBitFromWordArray(state[j], i + 1); - c[j] = getBitFromWordArray(state[j], i); - } - - uint8_t ab[2]; - uint8_t bc[2]; - uint8_t ca[2]; - - mpc_AND_verify(a, b, ab, rand, view1, view2); - mpc_AND_verify(b, c, bc, rand, view1, view2); - mpc_AND_verify(c, a, ca, rand, view1, view2); - - for (j = 0; j < 2; j++) { - setBitInWordArray(state[j], i + 2, a[j] ^ (bc[j])); - setBitInWordArray(state[j], i + 1, a[j] ^ b[j] ^ (ca[j])); - setBitInWordArray(state[j], i, a[j] ^ b[j] ^ c[j] ^ (ab[j])); - } - } -} - -void mpc_matrix_mul(uint32_t* state[3], const uint32_t* matrix, - uint32_t* output[3], paramset_t* params, size_t players) -{ - uint32_t player; - for (player = 0; player < players; player++) { - matrix_mul(state[player], matrix, output[player], params); - } -} - -void 
mpc_LowMC_verify(view_t* view1, view_t* view2, - randomTape_t* tapes, uint32_t* tmp, - const uint32_t* plaintext, paramset_t* params, uint8_t challenge) -{ - uint32_t* state[2]; - uint32_t* keyShares[2]; - uint32_t* roundKey[2]; - - roundKey[0] = tmp; - roundKey[1] = roundKey[0] + params->stateSizeWords; - state[0] = roundKey[1] + params->stateSizeWords; - state[1] = state[0] + params->stateSizeWords; - - // initialize both roundkeys to 0. they are contingent - memset(roundKey[0], 0, 2 * params->stateSizeBytes); - - uint32_t i, r; - for (i = 0; i < 2; i++) { - memset(state[i], 0x00, params->stateSizeBytes); - } - mpc_xor_constant_verify(state, plaintext, params->stateSizeWords, challenge); - - keyShares[0] = view1->inputShare; - keyShares[1] = view2->inputShare; - - mpc_matrix_mul(keyShares, KMatrix(0, params), roundKey, params, 2); - mpc_xor(state, roundKey, params->stateSizeWords, 2); - - for (r = 1; r <= params->numRounds; ++r) { - mpc_matrix_mul(keyShares, KMatrix(r, params), roundKey, params, 2); - mpc_substitution_verify(state, tapes, view1, view2, params); - mpc_matrix_mul(state, LMatrix(r - 1, params), state, params, 2); - mpc_xor_constant_verify(state, RConstant(r - 1, params), params->stateSizeWords, challenge); - mpc_xor(state, roundKey, params->stateSizeWords, 2); - } - - memcpy(view1->outputShare, state[0], params->stateSizeBytes); - memcpy(view2->outputShare, state[1], params->stateSizeBytes); -} - -void verifyProof(const proof_t* proof, view_t* view1, view_t* view2, - uint8_t challenge, uint8_t* tmp, - const uint32_t* plaintext, randomTape_t* tape, paramset_t* params) -{ - memcpy(view2->communicatedBits, proof->communicatedBits, params->andSizeBytes); - tape->pos = 0; - - bool status = false; - switch (challenge) { - case 0: - // in this case, both views' inputs are derivable from the input share - - status = createRandomTape(proof->seed1, tmp, params->stateSizeBytes + params->andSizeBytes, params); - memcpy(view1->inputShare, tmp, 
params->stateSizeBytes); - memcpy(tape->tape[0], tmp + params->stateSizeBytes, params->andSizeBytes); - status = status && createRandomTape(proof->seed2, tmp, params->stateSizeBytes + params->andSizeBytes, params); - if (!status) { - break; - } - memcpy(view2->inputShare, tmp, params->stateSizeBytes); - memcpy(tape->tape[1], tmp + params->stateSizeBytes, params->andSizeBytes); - break; - - case 1: - // in this case view2's input share was already given to us explicitly as - // it is not computable from the seed. We just need to compute view1's input from - // its seed - status = createRandomTape(proof->seed1, tmp, params->stateSizeBytes + params->andSizeBytes, params); - memcpy(view1->inputShare, tmp, params->stateSizeBytes); - memcpy(tape->tape[0], tmp + params->stateSizeBytes, params->andSizeBytes); - status = status && createRandomTape(proof->seed2, tape->tape[1], params->andSizeBytes, params); - if (!status) { - break; - } - memcpy(view2->inputShare, proof->inputShare, params->stateSizeBytes); - break; - - case 2: - // in this case view1's input share was already given to us explicitly as - // it is not computable from the seed. 
We just need to compute view2's input from - // its seed - status = createRandomTape(proof->seed1, tape->tape[0], params->andSizeBytes, params); - memcpy(view1->inputShare, proof->inputShare, params->stateSizeBytes); - status = status && createRandomTape(proof->seed2, tmp, params->stateSizeBytes + params->andSizeBytes, params); - if (!status) { - break; - } - memcpy(view2->inputShare, tmp, params->stateSizeBytes); - memcpy(tape->tape[1], tmp + params->stateSizeBytes, params->andSizeBytes); - break; - - default: - break; - } - - mpc_LowMC_verify(view1, view2, tape, (uint32_t*)tmp, plaintext, params, challenge); -} - -int verify(signature_t* sig, const uint32_t* pubKey, const uint32_t* plaintext, - const uint8_t* message, size_t messageByteLength, paramset_t* params) -{ - commitments_t* as = allocateCommitments(params); - g_commitments_t* gs = allocateGCommitments(params); - - uint32_t** viewOutputs = malloc(params->numZKBRounds * 3 * sizeof(uint32_t*)); - const proof_t* proofs = sig->proofs; - - const uint8_t* received_challengebits = sig->challengeBits; - int status = EXIT_SUCCESS; - uint8_t* computed_challengebits = NULL; - uint32_t* view3Slab = NULL; - - uint8_t* tmp = malloc(MAX(6 * params->stateSizeBytes, params->stateSizeBytes + params->andSizeBytes)); - - randomTape_t* tape = (randomTape_t*)malloc(sizeof(randomTape_t)); - - allocateRandomTape(tape, params); - - view_t* view1s = malloc(params->numZKBRounds * sizeof(view_t)); - view_t* view2s = malloc(params->numZKBRounds * sizeof(view_t)); - - /* Allocate a slab of memory for the 3rd view's output in each round */ - view3Slab = malloc(params->stateSizeBytes * params->numZKBRounds); - uint32_t* view3Output = view3Slab; /* pointer into the slab to the current 3rd view */ - - size_t i, j; - for (i = 0; i < params->numZKBRounds; i++) { - allocateView(&view1s[i], params); - allocateView(&view2s[i], params); - - // last bits of communicatedBits may not be set so zero them - 
view1s[i].communicatedBits[params->andSizeBytes - 1] = 0; - - verifyProof(&proofs[i], &view1s[i], &view2s[i], - getChallenge(received_challengebits, i), - tmp, plaintext, tape, params); - - // create ordered array of commitments with order computed based on the challenge - // check commitments of the two opened views - uint8_t challenge = getChallenge(received_challengebits, i); - Commit(proofs[i].seed1, view1s[i], as[i].hashes[challenge], params); - Commit(proofs[i].seed2, view2s[i], as[i].hashes[(challenge + 1) % 3], params); - memcpy(as[i].hashes[(challenge + 2) % 3], proofs[i].view3Commitment, params->digestSizeBytes); - - if (params->transform == TRANSFORM_UR) { - G(challenge, proofs[i].seed1, &view1s[i], gs[i].G[challenge], params); - G((challenge + 1) % 3, proofs[i].seed2, &view2s[i], gs[i].G[(challenge + 1) % 3], params); - size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes; - memcpy(gs[i].G[(challenge + 2) % 3], proofs[i].view3UnruhG, view3UnruhLength); - } - - VIEW_OUTPUTS(i, challenge) = view1s[i].outputShare; - VIEW_OUTPUTS(i, (challenge + 1) % 3) = view2s[i].outputShare; - for (j = 0; j < params->stateSizeWords; j++) { - view3Output[j] = view1s[i].outputShare[j] ^ view2s[i].outputShare[j] - ^ pubKey[j]; - } - VIEW_OUTPUTS(i, (challenge + 2) % 3) = view3Output; - view3Output += params->stateSizeWords; - } - - computed_challengebits = malloc(numBytes(2 * params->numZKBRounds)); - - H3(pubKey, plaintext, viewOutputs, as, - computed_challengebits, message, messageByteLength, gs, params); - - if (computed_challengebits != NULL && - memcmp(received_challengebits, computed_challengebits, - numBytes(2 * params->numZKBRounds)) != 0) { - status = EXIT_FAILURE; - } - - free(computed_challengebits); - free(view3Slab); - - freeCommitments(as); - for (i = 0; i < params->numZKBRounds; i++) { - freeView(&view1s[i]); - freeView(&view2s[i]); - } - free(view1s); - free(view2s); - free(tmp); - freeRandomTape(tape); 
- free(tape); - freeGCommitments(gs); - free(viewOutputs); - - return status; -} - -/*** Functions implementing Sign ***/ - -void mpc_AND(uint8_t in1[3], uint8_t in2[3], uint8_t out[3], randomTape_t* rand, - view_t views[3]) -{ - uint8_t r[3] = { getBit(rand->tape[0], rand->pos), getBit(rand->tape[1], rand->pos), getBit(rand->tape[2], rand->pos) }; - - uint8_t i; - for (i = 0; i < 3; i++) { - out[i] = (in1[i] & in2[(i + 1) % 3]) ^ (in1[(i + 1) % 3] & in2[i]) - ^ (in1[i] & in2[i]) ^ r[i] ^ r[(i + 1) % 3]; - - setBit(views[i].communicatedBits, rand->pos, out[i]); - } - - (rand->pos)++; -} - -void mpc_substitution(uint32_t* state[3], randomTape_t* rand, view_t views[3], - paramset_t* params) -{ - uint8_t a[3]; - uint8_t b[3]; - uint8_t c[3]; - - uint8_t ab[3]; - uint8_t bc[3]; - uint8_t ca[3]; - - uint32_t i; - for (i = 0; i < params->numSboxes * 3; i += 3) { - - uint8_t j; - for (j = 0; j < 3; j++) { - a[j] = getBitFromWordArray(state[j], i + 2); - b[j] = getBitFromWordArray(state[j], i + 1); - c[j] = getBitFromWordArray(state[j], i); - } - - mpc_AND(a, b, ab, rand, views); - mpc_AND(b, c, bc, rand, views); - mpc_AND(c, a, ca, rand, views); - - for (j = 0; j < 3; j++) { - setBitInWordArray(state[j], i + 2, a[j] ^ (bc[j])); - setBitInWordArray(state[j], i + 1, a[j] ^ b[j] ^ (ca[j])); - setBitInWordArray(state[j], i, a[j] ^ b[j] ^ c[j] ^ (ab[j])); - } - } -} - -void mpc_LowMC(randomTape_t* tapes, view_t views[3], - const uint32_t* plaintext, uint32_t* slab, paramset_t* params) -{ - uint32_t* keyShares[3]; - uint32_t* state[3]; - uint32_t* roundKey[3]; - - roundKey[0] = slab; - roundKey[1] = slab + params->stateSizeWords; - roundKey[2] = roundKey[1] + params->stateSizeWords; - state[0] = roundKey[2] + params->stateSizeWords; - state[1] = state[0] + params->stateSizeWords; - state[2] = state[1] + params->stateSizeWords; - - memset(roundKey[0], 0, 3 * params->stateSizeBytes); - int i; - for (i = 0; i < 3; i++) { - keyShares[i] = views[i].inputShare; - memset(state[i], 
0x00, params->stateSizeBytes); - } - mpc_xor_constant(state, plaintext, params->stateSizeWords); - - mpc_matrix_mul(keyShares, KMatrix(0, params), roundKey, params, 3); - mpc_xor(state, roundKey, params->stateSizeWords, 3); - - uint32_t r; - for (r = 1; r <= params->numRounds; r++) { - mpc_matrix_mul(keyShares, KMatrix(r, params), roundKey, params, 3); - mpc_substitution(state, tapes, views, params); - mpc_matrix_mul(state, LMatrix(r - 1, params), state, params, 3); - mpc_xor_constant(state, RConstant(r - 1, params), params->stateSizeWords); - mpc_xor(state, roundKey, params->stateSizeWords, 3); - } - - for (i = 0; i < 3; i++) { - memcpy(views[i].outputShare, state[i], params->stateSizeBytes); - } - -} - -void runMPC(view_t views[3], randomTape_t* rand, - uint32_t* plaintext, uint32_t* slab, paramset_t* params) -{ - rand->pos = 0; - mpc_LowMC(rand, views, plaintext, slab, params); -} - - -seeds_t* computeSeeds(uint32_t* privateKey, uint32_t* - publicKey, uint32_t* plaintext, const uint8_t* message, size_t messageByteLength, paramset_t* params) -{ - HashInstance ctx; - seeds_t* allSeeds = allocateSeeds(params); - - HashInit(&ctx, params, HASH_PREFIX_NONE); - HashUpdate(&ctx, (uint8_t*)privateKey, params->stateSizeBytes); - HashUpdate(&ctx, message, messageByteLength); - HashUpdate(&ctx, (uint8_t*)publicKey, params->stateSizeBytes); - HashUpdate(&ctx, (uint8_t*)plaintext, params->stateSizeBytes); - uint16_t stateSizeBitsLE = toLittleEndian((uint16_t)params->stateSizeBits); - HashUpdate(&ctx, ((uint8_t*)&stateSizeBitsLE), sizeof(uint16_t)); - HashFinal(&ctx); - - HashSqueeze(&ctx, getSeed(allSeeds, 0, 0), params->seedSizeBytes * 3 * params->numZKBRounds); - - return allSeeds; -} - -int sign(uint32_t* privateKey, uint32_t* pubKey, uint32_t* plaintext, const uint8_t* message, - size_t messageByteLength, signature_t* sig, paramset_t* params) -{ - bool status; - - /* Allocate views and commitments for all parallel iterations */ - view_t** views = allocateViews(params); - 
commitments_t* as = allocateCommitments(params); - g_commitments_t* gs = allocateGCommitments(params); - - /* Compute seeds for all parallel iterations */ - seeds_t* seeds = computeSeeds(privateKey, pubKey, plaintext, message, messageByteLength, params); - - //Allocate a random tape (re-used per parallel iteration), and a temporary buffer - randomTape_t tape; - - allocateRandomTape(&tape, params); - uint8_t* tmp = malloc( MAX(9 * params->stateSizeBytes, params->stateSizeBytes + params->andSizeBytes)); - - uint32_t k; - for (k = 0; k < params->numZKBRounds; k++) { - // for first two players get all tape INCLUDING INPUT SHARE from seed - int j; - for (j = 0; j < 2; j++) { - status = createRandomTape(getSeed(seeds, k, j), tmp, params->stateSizeBytes + params->andSizeBytes, params); - if (!status) { - return EXIT_FAILURE; - } - - memcpy(views[k][j].inputShare, tmp, params->stateSizeBytes); - memcpy(tape.tape[j], tmp + params->stateSizeBytes, params->andSizeBytes); - } - // Now set third party's wires. 
The random bits are from the seed, the input is - // the XOR of other two inputs and the private key - status = createRandomTape(getSeed(seeds, k, 2), tape.tape[2], params->andSizeBytes, params); - if (!status) { - return EXIT_FAILURE; - } - uint32_t j1; - for (j1 = 0; j1 < params->stateSizeWords; j1++) { - views[k][2].inputShare[j1] = privateKey[j1] - ^ views[k][0].inputShare[j1] - ^ views[k][1].inputShare[j1]; - } - - runMPC(views[k], &tape, plaintext, (uint32_t*)tmp, params); - - //Committing - Commit(getSeed(seeds, k, 0), views[k][0], as[k].hashes[0], params); - Commit(getSeed(seeds, k, 1), views[k][1], as[k].hashes[1], params); - Commit(getSeed(seeds, k, 2), views[k][2], as[k].hashes[2], params); - - if (params->transform == TRANSFORM_UR) { - G(0, getSeed(seeds, k, 0), &views[k][0], gs[k].G[0], params); - G(1, getSeed(seeds, k, 1), &views[k][1], gs[k].G[1], params); - G(2, getSeed(seeds, k, 2), &views[k][2], gs[k].G[2], params); - } - } - - //Generating challenges - uint32_t** viewOutputs = malloc(params->numZKBRounds * 3 * sizeof(uint32_t*)); - - size_t ii, jj; - for (ii = 0; ii < params->numZKBRounds; ii++) - for (jj = 0; jj < 3; jj++) - VIEW_OUTPUTS(ii, jj) = views[ii][jj].outputShare; - - - uint32_t output[LOWMC_MAX_STATE_SIZE]; - uint32_t j; - for (j = 0; j < params->stateSizeWords; j++) - output[j] = (VIEW_OUTPUTS(0, 0))[j] ^ (VIEW_OUTPUTS(0, 1))[j] ^ (VIEW_OUTPUTS(0, 2))[j]; - - - H3(output, plaintext, viewOutputs, as, - sig->challengeBits, message, messageByteLength, gs, params); - - //Packing Z - size_t i; - for (i = 0; i < params->numZKBRounds; i++) { - proof_t* proof = &sig->proofs[i]; - prove(proof, getChallenge(sig->challengeBits, i), &seeds[i], - views[i], &as[i], (gs == NULL) ? 
NULL : &gs[i], params); - } - - free(tmp); - - freeViews(views, params); - freeCommitments(as); - freeRandomTape(&tape); - freeGCommitments(gs); - free(viewOutputs); - freeSeeds(seeds); - - return EXIT_SUCCESS; -} - -/*** Serialization functions ***/ - -int serializeSignature(const signature_t* sig, uint8_t* sigBytes, size_t sigBytesLen, paramset_t* params) -{ - const proof_t* proofs = sig->proofs; - const uint8_t* challengeBits = sig->challengeBits; - - /* Validate input buffer is large enough */ - size_t bytesRequired = numBytes(2 * params->numZKBRounds) + - params->numZKBRounds * (2 * params->seedSizeBytes + params->stateSizeBytes + params->andSizeBytes + params->digestSizeBytes); - - if (params->transform == TRANSFORM_UR) { - bytesRequired += params->UnruhGWithoutInputBytes * params->numZKBRounds; - } - - if (sigBytesLen < bytesRequired) { - return -1; - } - - uint8_t* sigBytesBase = sigBytes; - - memcpy(sigBytes, challengeBits, numBytes(2 * params->numZKBRounds)); - sigBytes += numBytes(2 * params->numZKBRounds); - - size_t i; - for (i = 0; i < params->numZKBRounds; i++) { - - uint8_t challenge = getChallenge(challengeBits, i); - - memcpy(sigBytes, proofs[i].view3Commitment, params->digestSizeBytes); - sigBytes += params->digestSizeBytes; - - if (params->transform == TRANSFORM_UR) { - size_t view3UnruhLength = (challenge == 0) ? 
params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes; - memcpy(sigBytes, proofs[i].view3UnruhG, view3UnruhLength); - sigBytes += view3UnruhLength; - } - - memcpy(sigBytes, proofs[i].communicatedBits, params->andSizeBytes); - sigBytes += params->andSizeBytes; - - memcpy(sigBytes, proofs[i].seed1, params->seedSizeBytes); - sigBytes += params->seedSizeBytes; - - memcpy(sigBytes, proofs[i].seed2, params->seedSizeBytes); - sigBytes += params->seedSizeBytes; - - if (challenge == 1 || challenge == 2) { - memcpy(sigBytes, proofs[i].inputShare, params->stateSizeBytes); - sigBytes += params->stateSizeBytes; - } - - - } - - return (int)(sigBytes - sigBytesBase); -} - - -static size_t computeInputShareSize(const uint8_t* challengeBits, size_t stateSizeBytes, paramset_t* params) -{ - /* When the FS transform is used, the input share is included in the proof - * only when the challenge is 1 or 2. When dersializing, to compute the - * number of bytes expected, we must check how many challenge values are 1 - * or 2. The parameter stateSizeBytes is the size of an input share. 
*/ - size_t inputShareSize = 0; - - size_t i; - for (i = 0; i < params->numZKBRounds; i++) { - uint8_t challenge = getChallenge(challengeBits, i); - if (challenge == 1 || challenge == 2) { - inputShareSize += stateSizeBytes; - } - } - return inputShareSize; -} - -int deserializeSignature(signature_t* sig, const uint8_t* sigBytes, - size_t sigBytesLen, paramset_t* params) -{ - proof_t* proofs = sig->proofs; - uint8_t* challengeBits = sig->challengeBits; - - /* Validate input buffer is large enough */ - if (sigBytesLen < numBytes(2 * params->numZKBRounds)) { /* ensure the input has at least the challenge */ - return EXIT_FAILURE; - } - size_t inputShareSize = computeInputShareSize(sigBytes, params->stateSizeBytes, params); - size_t bytesExpected = numBytes(2 * params->numZKBRounds) + - params->numZKBRounds * (2 * params->seedSizeBytes + params->andSizeBytes + params->digestSizeBytes) + inputShareSize; - if (params->transform == TRANSFORM_UR) { - bytesExpected += params->UnruhGWithoutInputBytes * params->numZKBRounds; - } - if (sigBytesLen < bytesExpected) { - return EXIT_FAILURE; - } - - memcpy(challengeBits, sigBytes, numBytes(2 * params->numZKBRounds)); - sigBytes += numBytes(2 * params->numZKBRounds); - - size_t i; - for (i = 0; i < params->numZKBRounds; i++) { - - uint8_t challenge = getChallenge(challengeBits, i); - - memcpy(proofs[i].view3Commitment, sigBytes, params->digestSizeBytes); - sigBytes += params->digestSizeBytes; - - if (params->transform == TRANSFORM_UR) { - size_t view3UnruhLength = (challenge == 0) ? 
params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes; - memcpy(proofs[i].view3UnruhG, sigBytes, view3UnruhLength); - sigBytes += view3UnruhLength; - } - - memcpy(proofs[i].communicatedBits, sigBytes, params->andSizeBytes); - sigBytes += params->andSizeBytes; - - memcpy(proofs[i].seed1, sigBytes, params->seedSizeBytes); - sigBytes += params->seedSizeBytes; - - memcpy(proofs[i].seed2, sigBytes, params->seedSizeBytes); - sigBytes += params->seedSizeBytes; - - if (challenge == 1 || challenge == 2) { - memcpy(proofs[i].inputShare, sigBytes, params->stateSizeBytes); - sigBytes += params->stateSizeBytes; - } - - } - - return EXIT_SUCCESS; -} - - - - +/*! @file picnic_impl.c + * @brief This is the main file of the signature scheme. All of the LowMC MPC + * code is here as well as lower-level versions of sign and verify that are + * called by the signature API. + * + * This file is part of the reference implementation of the Picnic signature scheme. + * See the accompanying documentation for complete details. + * + * The code is provided under the MIT license, see LICENSE for + * more details. 
+ * SPDX-License-Identifier: MIT + */ + +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#if defined (__WIN32) + #include <windows.h> + #include <bcrypt.h> +#elif defined (__APPLE__) + #include "macos_specific_endian.h" +#elif defined (DAP_OS_LINUX) + #include <endian.h> +#endif + +#include "picnic_impl.h" +#include "picnic.h" +#include "platform.h" +#include "lowmc_constants.h" +#include "hash.h" +#include "picnic_types.h" +#include "dap_common.h" + + +#define VIEW_OUTPUTS(i, j) viewOutputs[(i) * 3 + (j)] + + +/* Helper functions */ +uint16_t toLittleEndian(uint16_t x) +{ +#if defined(__WIN32) + #if BYTE_ORDER == LITTLE_ENDIAN + return x; + #else + return __builtin_bswap16(x); + #endif +#else + return htole16(x); +#endif +} + +/* Get one bit from a byte array */ +uint8_t getBit(const uint8_t* array, uint32_t bitNumber) +{ + return (array[bitNumber / 8] >> (7 - (bitNumber % 8))) & 0x01; +} + +/* Get one bit from a 32-bit int array */ +uint8_t getBitFromWordArray(const uint32_t* array, uint32_t bitNumber) +{ + return getBit((uint8_t*)array, bitNumber); +} + +/* Set a specific bit in a byte array to a given value */ +void setBit(uint8_t* bytes, uint32_t bitNumber, uint8_t val) +{ + bytes[bitNumber / 8] = (bytes[bitNumber >> 3] + & ~(1 << (7 - (bitNumber % 8)))) | (val << (7 - (bitNumber % 8))); +} + +/* Set a specific bit in a byte array to a given value */ +void setBitInWordArray(uint32_t* array, uint32_t bitNumber, uint8_t val) +{ + setBit((uint8_t*)array, bitNumber, val); +} + +static uint8_t parity(uint32_t* data, size_t len) +{ + uint32_t x = data[0]; + size_t i; + for (i = 1; i < len; i++) { + x ^= data[i]; + } + + /* Compute parity of x using code from Section 5-2 of + * H.S. Warren, *Hacker's Delight*, Pearson Education, 2003. 
+ * http://www.hackersdelight.org/hdcodetxt/parity.c.txt + */ + uint32_t y = x ^ (x >> 1); + y ^= (y >> 2); + y ^= (y >> 4); + y ^= (y >> 8); + y ^= (y >> 16); + return y & 1; +} + +uint32_t numBytes(uint32_t numBits) +{ + return (numBits == 0) ? 0 : ((numBits - 1) / 8 + 1); +} + +static void xor_array(const uint32_t * in1, const uint32_t * in2, uint32_t * out, uint32_t numBytes) +{ + uint32_t i; + for (i = 0; i < numBytes; i++) { + out[i] = in1[i] ^ in2[i]; + } +} + +static void matrix_mul( + uint32_t* state, + const uint32_t* matrix, + uint32_t* output, + paramset_t* params) +{ + // Use temp to correctly handle the case when state = output + uint32_t prod[LOWMC_MAX_STATE_SIZE]; + uint32_t temp[LOWMC_MAX_STATE_SIZE]; + + uint32_t i, j; + for (i = 0; i < params->stateSizeBits; i++) { + for (j = 0; j < params->stateSizeWords; j++) { + size_t index = i * params->stateSizeWords + j; + prod[j] = (state[j] & matrix[index]); + } + setBit((uint8_t*)temp, i, parity(&prod[0], params->stateSizeWords)); + + } + memcpy(output, &temp, params->stateSizeWords * sizeof(uint32_t)); +} + +static void substitution(uint32_t* state, paramset_t* params) +{ + uint32_t i; + for (i = 0; i < params->numSboxes * 3; i += 3) { + uint8_t a = getBitFromWordArray(state, i + 2); + uint8_t b = getBitFromWordArray(state, i + 1); + uint8_t c = getBitFromWordArray(state, i); + + setBitInWordArray(state, i + 2, a ^ (b & c)); + setBitInWordArray(state, i + 1, a ^ b ^ (a & c)); + setBitInWordArray(state, i, a ^ b ^ c ^ (a & b)); + } +} + +void LowMCEnc(const uint32_t* plaintext, uint32_t* output, uint32_t* key, paramset_t* params) +{ + uint32_t roundKey[LOWMC_MAX_STATE_SIZE / sizeof(uint32_t)]; + + if (plaintext != output) { + /* output will hold the intermediate state */ + memcpy(output, plaintext, params->stateSizeBytes); + } + + matrix_mul(key, KMatrix(0, params), roundKey, params); + xor_array(output, roundKey, output, params->stateSizeWords); + + uint32_t r; + for (r = 1; r <= params->numRounds; 
r++) { + matrix_mul(key, KMatrix(r, params), roundKey, params); + substitution(output, params); + matrix_mul(output, LMatrix(r - 1, params), output, params); + xor_array(output, RConstant(r - 1, params), output, params->stateSizeWords); + xor_array(output, roundKey, output, params->stateSizeWords); + } + +} + +bool createRandomTape(const uint8_t* seed, uint8_t* tape, + uint32_t tapeLengthBytes, paramset_t* params) +{ + HashInstance ctx; + + if (tapeLengthBytes < params->digestSizeBytes) { + return false; + } + + /* Hash the seed and a constant, store the result in tape. */ + HashInit(&ctx, params, HASH_PREFIX_2); + HashUpdate(&ctx, seed, params->seedSizeBytes); + HashFinal(&ctx); + HashSqueeze(&ctx, tape, params->digestSizeBytes); + + /* Expand the hashed seed and output length to create the tape. */ + HashInit(&ctx, params, HASH_PREFIX_NONE); + HashUpdate(&ctx, tape, params->digestSizeBytes); + uint16_t outputBytesLE = toLittleEndian(tapeLengthBytes); + HashUpdate(&ctx, (uint8_t*)&outputBytesLE, sizeof(uint16_t)); + HashFinal(&ctx); + HashSqueeze(&ctx, tape, tapeLengthBytes); + + return true; +} + +void mpc_xor(uint32_t* state[3], uint32_t* in[3], uint32_t len, int players) +{ + uint8_t i; + for (i = 0; i < players; i++) { + xor_array(state[i], in[i], state[i], len); + } +} + +/* Compute the XOR of in with the first state vectors. 
*/ +void mpc_xor_constant(uint32_t* state[3], const uint32_t* in, uint32_t len) +{ + xor_array(state[0], in, state[0], len); +} + +void mpc_xor_constant_verify(uint32_t* state[2], const uint32_t* in, uint32_t len, uint8_t challenge) +{ + /* During verify, where the first share is stored in state depends on the challenge */ + if (challenge == 0) { + xor_array(state[0], in, state[0], len); + } + else if (challenge == 2) { + xor_array(state[1], in, state[1], len); + } +} + + +void Commit(const uint8_t* seed, const view_t view, + uint8_t* hash, paramset_t* params) +{ + HashInstance ctx; + + /* Hash the seed, store result in `hash` */ + HashInit(&ctx, params, HASH_PREFIX_4); + HashUpdate(&ctx, seed, params->seedSizeBytes); + HashFinal(&ctx); + HashSqueeze(&ctx, hash, params->digestSizeBytes); + + /* Compute H_0(H_4(seed), view) */ + HashInit(&ctx, params, HASH_PREFIX_0); + HashUpdate(&ctx, hash, params->digestSizeBytes); + HashUpdate(&ctx, (uint8_t*)view.inputShare, params->stateSizeBytes); + HashUpdate(&ctx, (uint8_t*)view.communicatedBits, params->andSizeBytes); + HashUpdate(&ctx, (uint8_t*)view.outputShare, params->stateSizeBytes); + HashFinal(&ctx); + HashSqueeze(&ctx, hash, params->digestSizeBytes); +} + +/* This is the random "permuatation" function G for Unruh's transform */ +void G(uint8_t viewNumber, const uint8_t* seed, view_t* view, uint8_t* output, paramset_t* params) +{ + HashInstance ctx; + uint16_t outputBytes = params->seedSizeBytes + params->andSizeBytes; + + /* Hash the seed with H_5, store digest in output */ + HashInit(&ctx, params, HASH_PREFIX_5); + HashUpdate(&ctx, seed, params->seedSizeBytes); + HashFinal(&ctx); + HashSqueeze(&ctx, output, params->digestSizeBytes); + + /* Hash H_5(seed), the view, and the length */ + HashInit(&ctx, params, HASH_PREFIX_NONE); + HashUpdate(&ctx, output, params->digestSizeBytes); + if (viewNumber == 2) { + HashUpdate(&ctx, (uint8_t*)view->inputShare, params->stateSizeBytes); + outputBytes += 
(uint16_t)params->stateSizeBytes; + } + HashUpdate(&ctx, view->communicatedBits, params->andSizeBytes); + + uint16_t outputBytesLE = toLittleEndian(outputBytes); + HashUpdate(&ctx, (uint8_t*)&outputBytesLE, sizeof(uint16_t)); + HashFinal(&ctx); + HashSqueeze(&ctx, output, outputBytes); +} + +void setChallenge(uint8_t* challenge, size_t round, uint8_t trit) +{ + /* challenge must have length numBytes(numZKBRounds*2) + * 0 <= index < numZKBRounds + * trit must be in {0,1,2} */ + uint32_t roundU32 = (uint32_t)round; + + setBit(challenge, 2 * roundU32, trit & 1); + setBit(challenge, 2 * roundU32 + 1, (trit >> 1) & 1); +} + +uint8_t getChallenge(const uint8_t* challenge, size_t round) +{ + uint32_t roundU32 = (uint32_t)round; + + return (getBit(challenge, 2 * roundU32 + 1) << 1) | getBit(challenge, 2 * roundU32); +} + +void H3(const uint32_t* circuitOutput, const uint32_t* plaintext, uint32_t** viewOutputs, + commitments_t* as, + uint8_t* challengeBits, const uint8_t* message, size_t messageByteLength, + g_commitments_t* gs, paramset_t* params) +{ + uint8_t* hash = malloc(params->digestSizeBytes); + + HashInstance ctx; + + /* Depending on the number of rounds, we might not set part of the last + * byte, make sure it's always zero. */ + challengeBits[numBytes(params->numZKBRounds * 2) - 1] = 0; + + /* Hash input data */ + HashInit(&ctx, params, HASH_PREFIX_1); + + /* Hash the output share from each view */ + uint32_t i; + int j; + for (i = 0; i < params->numZKBRounds; i++) { + for (j = 0; j < 3; j++) { + HashUpdate(&ctx, (uint8_t*)VIEW_OUTPUTS(i, j), params->stateSizeBytes); + } + } + + /* Hash all the commitments C */ + for (i = 0; i < params->numZKBRounds; i++) { + for (j = 0; j < 3; j++) { + HashUpdate(&ctx, as[i].hashes[j], params->digestSizeBytes); + } + } + + /* Hash all the commitments G */ + if (params->transform == TRANSFORM_UR) { + for (i = 0; i < params->numZKBRounds; i++) { + for (j = 0; j < 3; j++) { + size_t view3UnruhLength = (j == 2) ? 
params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes; + HashUpdate(&ctx, gs[i].G[j], view3UnruhLength); + } + } + } + + HashUpdate(&ctx, (uint8_t*)circuitOutput, params->stateSizeBytes); + HashUpdate(&ctx, (uint8_t*)plaintext, params->stateSizeBytes); + HashUpdate(&ctx, message, messageByteLength); + + HashFinal(&ctx); + HashSqueeze(&ctx, hash, params->digestSizeBytes); + + /* Convert hash to a packed string of values in {0,1,2} */ + size_t byte_count, round = 0; + while (1) { + for (byte_count = 0; byte_count < params->digestSizeBytes; byte_count++) { + uint8_t byte = hash[byte_count]; + /* iterate over each pair of bits in the byte */ + for (j = 0; j < 8; j += 2) { + uint8_t bitPair = ((byte >> (6 - j)) & 0x03); + if (bitPair < 3) { + setChallenge(challengeBits, round, bitPair); + round++; + if (round == params->numZKBRounds) { + goto done; + } + } + } + } + + /* We need more bits; hash set hash = H_1(hash) */ + HashInit(&ctx, params, HASH_PREFIX_1); + HashUpdate(&ctx, hash, params->digestSizeBytes); + HashFinal(&ctx); + HashSqueeze(&ctx, hash, params->digestSizeBytes); + } + +done: + + free(hash); + return; +} + +/* Caller must allocate the first parameter */ +void prove(proof_t* proof, uint8_t challenge, seeds_t* seeds, + view_t views[3], commitments_t* commitments, g_commitments_t* gs, paramset_t* params) +{ + if (challenge == 0) { + memcpy(proof->seed1, seeds->seed0, params->seedSizeBytes); + memcpy(proof->seed2, seeds->seed1, params->seedSizeBytes); + } + else if (challenge == 1) { + memcpy(proof->seed1, seeds->seed1, params->seedSizeBytes); + memcpy(proof->seed2, seeds->seed2, params->seedSizeBytes); + } + else if (challenge == 2) { + memcpy(proof->seed1, seeds->seed2, params->seedSizeBytes); + memcpy(proof->seed2, seeds->seed0, params->seedSizeBytes); + } + else { + assert(!"Invalid challenge"); + } + + if (challenge == 1 || challenge == 2) { + memcpy(proof->inputShare, views[2].inputShare, params->stateSizeBytes); + } + 
memcpy(proof->communicatedBits, views[(challenge + 1) % 3].communicatedBits, params->andSizeBytes); + + memcpy(proof->view3Commitment, commitments->hashes[(challenge + 2) % 3], params->digestSizeBytes); + if (params->transform == TRANSFORM_UR) { + size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes; + memcpy(proof->view3UnruhG, gs->G[(challenge + 2) % 3], view3UnruhLength); + } +} + +void mpc_AND_verify(uint8_t in1[2], uint8_t in2[2], uint8_t out[2], + randomTape_t* rand, view_t* view1, view_t* view2) +{ + uint8_t r[2] = { getBit(rand->tape[0], rand->pos), getBit(rand->tape[1], rand->pos) }; + + out[0] = (in1[0] & in2[1]) ^ (in1[1] & in2[0]) ^ (in1[0] & in2[0]) ^ r[0] ^ r[1]; + setBit(view1->communicatedBits, rand->pos, out[0]); + out[1] = getBit(view2->communicatedBits, rand->pos); + + (rand->pos)++; +} + +void mpc_substitution_verify(uint32_t* state[2], randomTape_t* rand, view_t* view1, + view_t* view2, paramset_t* params) +{ + uint32_t i; + for (i = 0; i < params->numSboxes * 3; i += 3) { + + uint8_t a[2]; + uint8_t b[2]; + uint8_t c[2]; + + uint8_t j; + for (j = 0; j < 2; j++) { + a[j] = getBitFromWordArray(state[j], i + 2); + b[j] = getBitFromWordArray(state[j], i + 1); + c[j] = getBitFromWordArray(state[j], i); + } + + uint8_t ab[2]; + uint8_t bc[2]; + uint8_t ca[2]; + + mpc_AND_verify(a, b, ab, rand, view1, view2); + mpc_AND_verify(b, c, bc, rand, view1, view2); + mpc_AND_verify(c, a, ca, rand, view1, view2); + + for (j = 0; j < 2; j++) { + setBitInWordArray(state[j], i + 2, a[j] ^ (bc[j])); + setBitInWordArray(state[j], i + 1, a[j] ^ b[j] ^ (ca[j])); + setBitInWordArray(state[j], i, a[j] ^ b[j] ^ c[j] ^ (ab[j])); + } + } +} + +void mpc_matrix_mul(uint32_t* state[3], const uint32_t* matrix, + uint32_t* output[3], paramset_t* params, size_t players) +{ + uint32_t player; + for (player = 0; player < players; player++) { + matrix_mul(state[player], matrix, output[player], params); + } +} + +void 
mpc_LowMC_verify(view_t* view1, view_t* view2, + randomTape_t* tapes, uint32_t* tmp, + const uint32_t* plaintext, paramset_t* params, uint8_t challenge) +{ + uint32_t* state[2]; + uint32_t* keyShares[2]; + uint32_t* roundKey[2]; + + roundKey[0] = tmp; + roundKey[1] = roundKey[0] + params->stateSizeWords; + state[0] = roundKey[1] + params->stateSizeWords; + state[1] = state[0] + params->stateSizeWords; + + // initialize both roundkeys to 0. they are contingent + memset(roundKey[0], 0, 2 * params->stateSizeBytes); + + uint32_t i, r; + for (i = 0; i < 2; i++) { + memset(state[i], 0x00, params->stateSizeBytes); + } + mpc_xor_constant_verify(state, plaintext, params->stateSizeWords, challenge); + + keyShares[0] = view1->inputShare; + keyShares[1] = view2->inputShare; + + mpc_matrix_mul(keyShares, KMatrix(0, params), roundKey, params, 2); + mpc_xor(state, roundKey, params->stateSizeWords, 2); + + for (r = 1; r <= params->numRounds; ++r) { + mpc_matrix_mul(keyShares, KMatrix(r, params), roundKey, params, 2); + mpc_substitution_verify(state, tapes, view1, view2, params); + mpc_matrix_mul(state, LMatrix(r - 1, params), state, params, 2); + mpc_xor_constant_verify(state, RConstant(r - 1, params), params->stateSizeWords, challenge); + mpc_xor(state, roundKey, params->stateSizeWords, 2); + } + + memcpy(view1->outputShare, state[0], params->stateSizeBytes); + memcpy(view2->outputShare, state[1], params->stateSizeBytes); +} + +void verifyProof(const proof_t* proof, view_t* view1, view_t* view2, + uint8_t challenge, uint8_t* tmp, + const uint32_t* plaintext, randomTape_t* tape, paramset_t* params) +{ + memcpy(view2->communicatedBits, proof->communicatedBits, params->andSizeBytes); + tape->pos = 0; + + bool status = false; + switch (challenge) { + case 0: + // in this case, both views' inputs are derivable from the input share + + status = createRandomTape(proof->seed1, tmp, params->stateSizeBytes + params->andSizeBytes, params); + memcpy(view1->inputShare, tmp, 
params->stateSizeBytes); + memcpy(tape->tape[0], tmp + params->stateSizeBytes, params->andSizeBytes); + status = status && createRandomTape(proof->seed2, tmp, params->stateSizeBytes + params->andSizeBytes, params); + if (!status) { + break; + } + memcpy(view2->inputShare, tmp, params->stateSizeBytes); + memcpy(tape->tape[1], tmp + params->stateSizeBytes, params->andSizeBytes); + break; + + case 1: + // in this case view2's input share was already given to us explicitly as + // it is not computable from the seed. We just need to compute view1's input from + // its seed + status = createRandomTape(proof->seed1, tmp, params->stateSizeBytes + params->andSizeBytes, params); + memcpy(view1->inputShare, tmp, params->stateSizeBytes); + memcpy(tape->tape[0], tmp + params->stateSizeBytes, params->andSizeBytes); + status = status && createRandomTape(proof->seed2, tape->tape[1], params->andSizeBytes, params); + if (!status) { + break; + } + memcpy(view2->inputShare, proof->inputShare, params->stateSizeBytes); + break; + + case 2: + // in this case view1's input share was already given to us explicitly as + // it is not computable from the seed. 
We just need to compute view2's input from + // its seed + status = createRandomTape(proof->seed1, tape->tape[0], params->andSizeBytes, params); + memcpy(view1->inputShare, proof->inputShare, params->stateSizeBytes); + status = status && createRandomTape(proof->seed2, tmp, params->stateSizeBytes + params->andSizeBytes, params); + if (!status) { + break; + } + memcpy(view2->inputShare, tmp, params->stateSizeBytes); + memcpy(tape->tape[1], tmp + params->stateSizeBytes, params->andSizeBytes); + break; + + default: + break; + } + + mpc_LowMC_verify(view1, view2, tape, (uint32_t*)tmp, plaintext, params, challenge); +} + +int verify(signature_t* sig, const uint32_t* pubKey, const uint32_t* plaintext, + const uint8_t* message, size_t messageByteLength, paramset_t* params) +{ + commitments_t* as = allocateCommitments(params); + g_commitments_t* gs = allocateGCommitments(params); + + uint32_t** viewOutputs = malloc(params->numZKBRounds * 3 * sizeof(uint32_t*)); + const proof_t* proofs = sig->proofs; + + const uint8_t* received_challengebits = sig->challengeBits; + int status = EXIT_SUCCESS; + uint8_t* computed_challengebits = NULL; + uint32_t* view3Slab = NULL; + + uint8_t* tmp = malloc(MAX(6 * params->stateSizeBytes, params->stateSizeBytes + params->andSizeBytes)); + + randomTape_t* tape = (randomTape_t*)malloc(sizeof(randomTape_t)); + + allocateRandomTape(tape, params); + + view_t* view1s = malloc(params->numZKBRounds * sizeof(view_t)); + view_t* view2s = malloc(params->numZKBRounds * sizeof(view_t)); + + /* Allocate a slab of memory for the 3rd view's output in each round */ + view3Slab = malloc(params->stateSizeBytes * params->numZKBRounds); + uint32_t* view3Output = view3Slab; /* pointer into the slab to the current 3rd view */ + + size_t i, j; + for (i = 0; i < params->numZKBRounds; i++) { + allocateView(&view1s[i], params); + allocateView(&view2s[i], params); + + // last bits of communicatedBits may not be set so zero them + 
view1s[i].communicatedBits[params->andSizeBytes - 1] = 0; + + verifyProof(&proofs[i], &view1s[i], &view2s[i], + getChallenge(received_challengebits, i), + tmp, plaintext, tape, params); + + // create ordered array of commitments with order computed based on the challenge + // check commitments of the two opened views + uint8_t challenge = getChallenge(received_challengebits, i); + Commit(proofs[i].seed1, view1s[i], as[i].hashes[challenge], params); + Commit(proofs[i].seed2, view2s[i], as[i].hashes[(challenge + 1) % 3], params); + memcpy(as[i].hashes[(challenge + 2) % 3], proofs[i].view3Commitment, params->digestSizeBytes); + + if (params->transform == TRANSFORM_UR) { + G(challenge, proofs[i].seed1, &view1s[i], gs[i].G[challenge], params); + G((challenge + 1) % 3, proofs[i].seed2, &view2s[i], gs[i].G[(challenge + 1) % 3], params); + size_t view3UnruhLength = (challenge == 0) ? params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes; + memcpy(gs[i].G[(challenge + 2) % 3], proofs[i].view3UnruhG, view3UnruhLength); + } + + VIEW_OUTPUTS(i, challenge) = view1s[i].outputShare; + VIEW_OUTPUTS(i, (challenge + 1) % 3) = view2s[i].outputShare; + for (j = 0; j < params->stateSizeWords; j++) { + view3Output[j] = view1s[i].outputShare[j] ^ view2s[i].outputShare[j] + ^ pubKey[j]; + } + VIEW_OUTPUTS(i, (challenge + 2) % 3) = view3Output; + view3Output += params->stateSizeWords; + } + + computed_challengebits = malloc(numBytes(2 * params->numZKBRounds)); + + H3(pubKey, plaintext, viewOutputs, as, + computed_challengebits, message, messageByteLength, gs, params); + + if (computed_challengebits != NULL && + memcmp(received_challengebits, computed_challengebits, + numBytes(2 * params->numZKBRounds)) != 0) { + status = EXIT_FAILURE; + } + + free(computed_challengebits); + free(view3Slab); + + freeCommitments(as); + for (i = 0; i < params->numZKBRounds; i++) { + freeView(&view1s[i]); + freeView(&view2s[i]); + } + free(view1s); + free(view2s); + free(tmp); + freeRandomTape(tape); 
+ free(tape); + freeGCommitments(gs); + free(viewOutputs); + + return status; +} + +/*** Functions implementing Sign ***/ + +void mpc_AND(uint8_t in1[3], uint8_t in2[3], uint8_t out[3], randomTape_t* rand, + view_t views[3]) +{ + uint8_t r[3] = { getBit(rand->tape[0], rand->pos), getBit(rand->tape[1], rand->pos), getBit(rand->tape[2], rand->pos) }; + + uint8_t i; + for (i = 0; i < 3; i++) { + out[i] = (in1[i] & in2[(i + 1) % 3]) ^ (in1[(i + 1) % 3] & in2[i]) + ^ (in1[i] & in2[i]) ^ r[i] ^ r[(i + 1) % 3]; + + setBit(views[i].communicatedBits, rand->pos, out[i]); + } + + (rand->pos)++; +} + +void mpc_substitution(uint32_t* state[3], randomTape_t* rand, view_t views[3], + paramset_t* params) +{ + uint8_t a[3]; + uint8_t b[3]; + uint8_t c[3]; + + uint8_t ab[3]; + uint8_t bc[3]; + uint8_t ca[3]; + + uint32_t i; + for (i = 0; i < params->numSboxes * 3; i += 3) { + + uint8_t j; + for (j = 0; j < 3; j++) { + a[j] = getBitFromWordArray(state[j], i + 2); + b[j] = getBitFromWordArray(state[j], i + 1); + c[j] = getBitFromWordArray(state[j], i); + } + + mpc_AND(a, b, ab, rand, views); + mpc_AND(b, c, bc, rand, views); + mpc_AND(c, a, ca, rand, views); + + for (j = 0; j < 3; j++) { + setBitInWordArray(state[j], i + 2, a[j] ^ (bc[j])); + setBitInWordArray(state[j], i + 1, a[j] ^ b[j] ^ (ca[j])); + setBitInWordArray(state[j], i, a[j] ^ b[j] ^ c[j] ^ (ab[j])); + } + } +} + +void mpc_LowMC(randomTape_t* tapes, view_t views[3], + const uint32_t* plaintext, uint32_t* slab, paramset_t* params) +{ + uint32_t* keyShares[3]; + uint32_t* state[3]; + uint32_t* roundKey[3]; + + roundKey[0] = slab; + roundKey[1] = slab + params->stateSizeWords; + roundKey[2] = roundKey[1] + params->stateSizeWords; + state[0] = roundKey[2] + params->stateSizeWords; + state[1] = state[0] + params->stateSizeWords; + state[2] = state[1] + params->stateSizeWords; + + memset(roundKey[0], 0, 3 * params->stateSizeBytes); + int i; + for (i = 0; i < 3; i++) { + keyShares[i] = views[i].inputShare; + memset(state[i], 
0x00, params->stateSizeBytes); + } + mpc_xor_constant(state, plaintext, params->stateSizeWords); + + mpc_matrix_mul(keyShares, KMatrix(0, params), roundKey, params, 3); + mpc_xor(state, roundKey, params->stateSizeWords, 3); + + uint32_t r; + for (r = 1; r <= params->numRounds; r++) { + mpc_matrix_mul(keyShares, KMatrix(r, params), roundKey, params, 3); + mpc_substitution(state, tapes, views, params); + mpc_matrix_mul(state, LMatrix(r - 1, params), state, params, 3); + mpc_xor_constant(state, RConstant(r - 1, params), params->stateSizeWords); + mpc_xor(state, roundKey, params->stateSizeWords, 3); + } + + for (i = 0; i < 3; i++) { + memcpy(views[i].outputShare, state[i], params->stateSizeBytes); + } + +} + +void runMPC(view_t views[3], randomTape_t* rand, + uint32_t* plaintext, uint32_t* slab, paramset_t* params) +{ + rand->pos = 0; + mpc_LowMC(rand, views, plaintext, slab, params); +} + + +seeds_t* computeSeeds(uint32_t* privateKey, uint32_t* + publicKey, uint32_t* plaintext, const uint8_t* message, size_t messageByteLength, paramset_t* params) +{ + HashInstance ctx; + seeds_t* allSeeds = allocateSeeds(params); + + HashInit(&ctx, params, HASH_PREFIX_NONE); + HashUpdate(&ctx, (uint8_t*)privateKey, params->stateSizeBytes); + HashUpdate(&ctx, message, messageByteLength); + HashUpdate(&ctx, (uint8_t*)publicKey, params->stateSizeBytes); + HashUpdate(&ctx, (uint8_t*)plaintext, params->stateSizeBytes); + uint16_t stateSizeBitsLE = toLittleEndian((uint16_t)params->stateSizeBits); + HashUpdate(&ctx, ((uint8_t*)&stateSizeBitsLE), sizeof(uint16_t)); + HashFinal(&ctx); + + HashSqueeze(&ctx, getSeed(allSeeds, 0, 0), params->seedSizeBytes * 3 * params->numZKBRounds); + + return allSeeds; +} + +int sign(uint32_t* privateKey, uint32_t* pubKey, uint32_t* plaintext, const uint8_t* message, + size_t messageByteLength, signature_t* sig, paramset_t* params) +{ + bool status; + + /* Allocate views and commitments for all parallel iterations */ + view_t** views = allocateViews(params); + 
commitments_t* as = allocateCommitments(params); + g_commitments_t* gs = allocateGCommitments(params); + + /* Compute seeds for all parallel iterations */ + seeds_t* seeds = computeSeeds(privateKey, pubKey, plaintext, message, messageByteLength, params); + + //Allocate a random tape (re-used per parallel iteration), and a temporary buffer + randomTape_t tape; + + allocateRandomTape(&tape, params); + uint8_t* tmp = malloc( MAX(9 * params->stateSizeBytes, params->stateSizeBytes + params->andSizeBytes)); + + uint32_t k; + for (k = 0; k < params->numZKBRounds; k++) { + // for first two players get all tape INCLUDING INPUT SHARE from seed + int j; + for (j = 0; j < 2; j++) { + status = createRandomTape(getSeed(seeds, k, j), tmp, params->stateSizeBytes + params->andSizeBytes, params); + if (!status) { + return EXIT_FAILURE; + } + + memcpy(views[k][j].inputShare, tmp, params->stateSizeBytes); + memcpy(tape.tape[j], tmp + params->stateSizeBytes, params->andSizeBytes); + } + // Now set third party's wires. 
The random bits are from the seed, the input is + // the XOR of other two inputs and the private key + status = createRandomTape(getSeed(seeds, k, 2), tape.tape[2], params->andSizeBytes, params); + if (!status) { + return EXIT_FAILURE; + } + uint32_t j1; + for (j1 = 0; j1 < params->stateSizeWords; j1++) { + views[k][2].inputShare[j1] = privateKey[j1] + ^ views[k][0].inputShare[j1] + ^ views[k][1].inputShare[j1]; + } + + runMPC(views[k], &tape, plaintext, (uint32_t*)tmp, params); + + //Committing + Commit(getSeed(seeds, k, 0), views[k][0], as[k].hashes[0], params); + Commit(getSeed(seeds, k, 1), views[k][1], as[k].hashes[1], params); + Commit(getSeed(seeds, k, 2), views[k][2], as[k].hashes[2], params); + + if (params->transform == TRANSFORM_UR) { + G(0, getSeed(seeds, k, 0), &views[k][0], gs[k].G[0], params); + G(1, getSeed(seeds, k, 1), &views[k][1], gs[k].G[1], params); + G(2, getSeed(seeds, k, 2), &views[k][2], gs[k].G[2], params); + } + } + + //Generating challenges + uint32_t** viewOutputs = malloc(params->numZKBRounds * 3 * sizeof(uint32_t*)); + + size_t ii, jj; + for (ii = 0; ii < params->numZKBRounds; ii++) + for (jj = 0; jj < 3; jj++) + VIEW_OUTPUTS(ii, jj) = views[ii][jj].outputShare; + + + uint32_t output[LOWMC_MAX_STATE_SIZE]; + uint32_t j; + for (j = 0; j < params->stateSizeWords; j++) + output[j] = (VIEW_OUTPUTS(0, 0))[j] ^ (VIEW_OUTPUTS(0, 1))[j] ^ (VIEW_OUTPUTS(0, 2))[j]; + + + H3(output, plaintext, viewOutputs, as, + sig->challengeBits, message, messageByteLength, gs, params); + + //Packing Z + size_t i; + for (i = 0; i < params->numZKBRounds; i++) { + proof_t* proof = &sig->proofs[i]; + prove(proof, getChallenge(sig->challengeBits, i), &seeds[i], + views[i], &as[i], (gs == NULL) ? 
NULL : &gs[i], params); + } + + free(tmp); + + freeViews(views, params); + freeCommitments(as); + freeRandomTape(&tape); + freeGCommitments(gs); + free(viewOutputs); + freeSeeds(seeds); + + return EXIT_SUCCESS; +} + +/*** Serialization functions ***/ + +int serializeSignature(const signature_t* sig, uint8_t* sigBytes, size_t sigBytesLen, paramset_t* params) +{ + const proof_t* proofs = sig->proofs; + const uint8_t* challengeBits = sig->challengeBits; + + /* Validate input buffer is large enough */ + size_t bytesRequired = numBytes(2 * params->numZKBRounds) + + params->numZKBRounds * (2 * params->seedSizeBytes + params->stateSizeBytes + params->andSizeBytes + params->digestSizeBytes); + + if (params->transform == TRANSFORM_UR) { + bytesRequired += params->UnruhGWithoutInputBytes * params->numZKBRounds; + } + + if (sigBytesLen < bytesRequired) { + return -1; + } + + uint8_t* sigBytesBase = sigBytes; + + memcpy(sigBytes, challengeBits, numBytes(2 * params->numZKBRounds)); + sigBytes += numBytes(2 * params->numZKBRounds); + + size_t i; + for (i = 0; i < params->numZKBRounds; i++) { + + uint8_t challenge = getChallenge(challengeBits, i); + + memcpy(sigBytes, proofs[i].view3Commitment, params->digestSizeBytes); + sigBytes += params->digestSizeBytes; + + if (params->transform == TRANSFORM_UR) { + size_t view3UnruhLength = (challenge == 0) ? 
params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes; + memcpy(sigBytes, proofs[i].view3UnruhG, view3UnruhLength); + sigBytes += view3UnruhLength; + } + + memcpy(sigBytes, proofs[i].communicatedBits, params->andSizeBytes); + sigBytes += params->andSizeBytes; + + memcpy(sigBytes, proofs[i].seed1, params->seedSizeBytes); + sigBytes += params->seedSizeBytes; + + memcpy(sigBytes, proofs[i].seed2, params->seedSizeBytes); + sigBytes += params->seedSizeBytes; + + if (challenge == 1 || challenge == 2) { + memcpy(sigBytes, proofs[i].inputShare, params->stateSizeBytes); + sigBytes += params->stateSizeBytes; + } + + + } + + return (int)(sigBytes - sigBytesBase); +} + + +static size_t computeInputShareSize(const uint8_t* challengeBits, size_t stateSizeBytes, paramset_t* params) +{ + /* When the FS transform is used, the input share is included in the proof + * only when the challenge is 1 or 2. When dersializing, to compute the + * number of bytes expected, we must check how many challenge values are 1 + * or 2. The parameter stateSizeBytes is the size of an input share. 
*/ + size_t inputShareSize = 0; + + size_t i; + for (i = 0; i < params->numZKBRounds; i++) { + uint8_t challenge = getChallenge(challengeBits, i); + if (challenge == 1 || challenge == 2) { + inputShareSize += stateSizeBytes; + } + } + return inputShareSize; +} + +int deserializeSignature(signature_t* sig, const uint8_t* sigBytes, + size_t sigBytesLen, paramset_t* params) +{ + proof_t* proofs = sig->proofs; + uint8_t* challengeBits = sig->challengeBits; + + /* Validate input buffer is large enough */ + if (sigBytesLen < numBytes(2 * params->numZKBRounds)) { /* ensure the input has at least the challenge */ + return EXIT_FAILURE; + } + size_t inputShareSize = computeInputShareSize(sigBytes, params->stateSizeBytes, params); + size_t bytesExpected = numBytes(2 * params->numZKBRounds) + + params->numZKBRounds * (2 * params->seedSizeBytes + params->andSizeBytes + params->digestSizeBytes) + inputShareSize; + if (params->transform == TRANSFORM_UR) { + bytesExpected += params->UnruhGWithoutInputBytes * params->numZKBRounds; + } + if (sigBytesLen < bytesExpected) { + return EXIT_FAILURE; + } + + memcpy(challengeBits, sigBytes, numBytes(2 * params->numZKBRounds)); + sigBytes += numBytes(2 * params->numZKBRounds); + + size_t i; + for (i = 0; i < params->numZKBRounds; i++) { + + uint8_t challenge = getChallenge(challengeBits, i); + + memcpy(proofs[i].view3Commitment, sigBytes, params->digestSizeBytes); + sigBytes += params->digestSizeBytes; + + if (params->transform == TRANSFORM_UR) { + size_t view3UnruhLength = (challenge == 0) ? 
params->UnruhGWithInputBytes : params->UnruhGWithoutInputBytes; + memcpy(proofs[i].view3UnruhG, sigBytes, view3UnruhLength); + sigBytes += view3UnruhLength; + } + + memcpy(proofs[i].communicatedBits, sigBytes, params->andSizeBytes); + sigBytes += params->andSizeBytes; + + memcpy(proofs[i].seed1, sigBytes, params->seedSizeBytes); + sigBytes += params->seedSizeBytes; + + memcpy(proofs[i].seed2, sigBytes, params->seedSizeBytes); + sigBytes += params->seedSizeBytes; + + if (challenge == 1 || challenge == 2) { + memcpy(proofs[i].inputShare, sigBytes, params->stateSizeBytes); + sigBytes += params->stateSizeBytes; + } + + } + + return EXIT_SUCCESS; +} + + + + -- GitLab