diff --git a/src/XKCP/lib/LowLevel.build b/src/XKCP/lib/LowLevel.build index f504190eedab500cd0a9291a3ef6a5956ef82534..d6d14985844c52d6b5a58834181623af25db2828 100755 --- a/src/XKCP/lib/LowLevel.build +++ b/src/XKCP/lib/LowLevel.build @@ -62,11 +62,9 @@ The fragments below allow to select the desired implementation of the permutatio * optimized1600lcu6: same as optimized1600u6 but using the lane complementing technique, which is useful for platforms that do not have a "and not" instruction * optimized1600lcufull: same as optimized1600lcu6 but with all rounds unrolled * optimized1600lcufullshld: same as optimized1600lcufull but with the rotation implementation with the 'shld' instruction, as it is faster on some platforms (e.g., SandyBridge) - * optimized1600AsmX86-64: an assembly-optimized implementation for x86_64 * optimized1600AsmX86-64shld: same as optimized1600AsmX86-64, but with the 'shld' instruction * optimized1600AsmX86-64Apple: same as optimized1600AsmX86-64, but with a syntax that works better on some Apple platforms - * optimized1600AVX2: an optimized implementation taking advantage of the AVX2 instruction set * optimized1600AVX512a: an optimized implementation taking advantage of the AVX512 instruction set (in assembler) * optimized1600AVX512c: an optimized implementation taking advantage of the AVX512 instruction set (in C) diff --git a/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.c b/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.c index 3336f68a36f9e6b8c66b909cc8f3187161becd7f..6dc94232cc3102ffaa11d2b01aaa824d9ebc1b9f 100755 --- a/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.c +++ b/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.c @@ -16,25 +16,6 @@ int SHAKE128(unsigned char *output, size_t outputByteLen, const unsigned char *i return KeccakWidth1600_Sponge(1344, 256, input, inputByteLen, 0x1F, output, outputByteLen); } -void SHAKE128_InitAbsorb(Keccak_HashInstance *ks, const unsigned char *input, size_t inputByteLen) -{ - Keccak_HashInitialize_SHAKE128( ks ); - Keccak_HashUpdate( &ks, input, inputByteLen * 8 ); - Keccak_HashFinal( &ks, input ); -} - -void SHAKE256_InitAbsorb(Keccak_HashInstance *ks, const unsigned char *input, size_t inputByteLen) -{ - Keccak_HashInitialize_SHAKE256( ks ); - Keccak_HashUpdate( &ks, input, inputByteLen * 8 ); - Keccak_HashFinal( &ks, input ); -} - -void KECCAK_HashSqueeze(Keccak_HashInstance *ks, const unsigned char *out, size_t outByteLen) -{ - Keccak_HashSqueeze( &ks, (unsigned char *)out, outByteLen * 8 ); -} - int SHAKE256(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen) { return KeccakWidth1600_Sponge(1088, 512, input, inputByteLen, 0x1F, output, outputByteLen); diff --git a/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.h b/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.h index a2fa94413903d51d0ccc5a70872ab1344e0d548b..b3c68ae842fc4f9667a4769a7b9f5e3d6f2deec8 100755 --- a/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.h +++ b/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.h @@ -12,7 +12,6 @@ http://creativecommons.org/publicdomain/zero/1.0/ #ifndef _SimpleFIPS202_h_ #define _SimpleFIPS202_h_ -#include "KeccakHash.h" #include "KeccakSpongeWidth1600.h" #include <string.h> @@ -23,6 +22,7 @@ http://creativecommons.org/publicdomain/zero/1.0/ * @param inputByteLen The length of the input message in bytes. * @return 0 if successful, 1 otherwise. */ +int SHAKE128(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen); /** Implementation of the SHAKE256 extendable output function (XOF) [FIPS 202]. * @param output Pointer to the output buffer. @@ -31,6 +31,7 @@ http://creativecommons.org/publicdomain/zero/1.0/ * @param inputByteLen The length of the input message in bytes. * @return 0 if successful, 1 otherwise. */ +int SHAKE256(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen); /** Implementation of SHA3-224 [FIPS 202]. * @param output Pointer to the output buffer (28 bytes). @@ -38,6 +39,7 @@ http://creativecommons.org/publicdomain/zero/1.0/ * @param inputByteLen The length of the input message in bytes. * @return 0 if successful, 1 otherwise. */ +int SHA3_224(unsigned char *output, const unsigned char *input, size_t inputByteLen); /** Implementation of SHA3-256 [FIPS 202]. * @param output Pointer to the output buffer (32 bytes). @@ -45,6 +47,7 @@ http://creativecommons.org/publicdomain/zero/1.0/ * @param inputByteLen The length of the input message in bytes. * @return 0 if successful, 1 otherwise. */ +int SHA3_256(unsigned char *output, const unsigned char *input, size_t inputByteLen); /** Implementation of SHA3-384 [FIPS 202]. * @param output Pointer to the output buffer (48 bytes). @@ -52,6 +55,7 @@ http://creativecommons.org/publicdomain/zero/1.0/ * @param inputByteLen The length of the input message in bytes. * @return 0 if successful, 1 otherwise. */ +int SHA3_384(unsigned char *output, const unsigned char *input, size_t inputByteLen); /** Implementation of SHA3-512 [FIPS 202]. * @param output Pointer to the output buffer (64 bytes). @@ -59,17 +63,6 @@ http://creativecommons.org/publicdomain/zero/1.0/ * @param inputByteLen The length of the input message in bytes. * @return 0 if successful, 1 otherwise. */ - -int SHAKE128(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen); -int SHAKE256(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen); -int SHA3_224(unsigned char *output, const unsigned char *input, size_t inputByteLen); -int SHA3_256(unsigned char *output, const unsigned char *input, size_t inputByteLen); -int SHA3_384(unsigned char *output, const unsigned char *input, size_t inputByteLen); int SHA3_512(unsigned char *output, const unsigned char *input, size_t inputByteLen); -void SHAKE128_InitAbsorb( Keccak_HashInstance *ks, const unsigned char *input, size_t inputByteLen ); -void SHAKE256_InitAbsorb( Keccak_HashInstance *ks, const unsigned char *input, size_t inputByteLen ); -void KECCAK_HashSqueeze(Keccak_HashInstance *ks, const unsigned char *out, size_t outByteLen); - - #endif diff --git a/src/XKCP/lib/high/Keccak/KeccakSponge-common.h b/src/XKCP/lib/high/Keccak/KeccakSponge-common.h index e2b73e896ab18284e31ce2b29694a0e150d143d9..2939f42c76150f8f2050a89a46fdae9713b0ece0 100755 --- a/src/XKCP/lib/high/Keccak/KeccakSponge-common.h +++ b/src/XKCP/lib/high/Keccak/KeccakSponge-common.h @@ -18,7 +18,7 @@ http://creativecommons.org/publicdomain/zero/1.0/ #include "align.h" #define KCP_DeclareSpongeStructure(prefix, size, alignment) \ - ALIGN(64) typedef struct prefix##_SpongeInstanceStruct { \ + ALIGN(alignment) typedef struct prefix##_SpongeInstanceStruct { \ unsigned char state[size]; \ unsigned int rate; \ unsigned int byteIOIndex; \ diff --git a/src/XKCP/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c b/src/XKCP/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c index 47d4ca5de60b39c77cdbdbea5fc8533de38e7eec..99e59c69604f3a7659e1a57ed3db12fb3c99cd5a 100755 --- a/src/XKCP/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c +++ b/src/XKCP/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c @@ -21,16 +21,6 @@ Please refer to LowLevel.build for the exact list of other files it must be comb #include <stdlib.h> #include <string.h> #include <x86intrin.h> -// #include <x86intrin.h> - -//#include <smmintrin.h> -//#include <wmmintrin.h> -//#include <immintrin.h> -//#include <emmintrin.h> - -// #include <mmintrin.h> -// #include <emmintrin.h> - #include "align.h" #include "KeccakP-1600-times2-SnP.h" #include "SIMD128-config.h" diff --git a/src/XKCP/lib/low/KeccakP-1600-times4/SIMD256/KeccakP-1600-times4-SIMD256.c b/src/XKCP/lib/low/KeccakP-1600-times4/SIMD256/KeccakP-1600-times4-SIMD256.c index 75c692ea26a8b4bd459480a77723e663f5157ed8..cea7584b2815439a8901d3490822c8bbe1874dbb 100755 --- a/src/XKCP/lib/low/KeccakP-1600-times4/SIMD256/KeccakP-1600-times4-SIMD256.c +++ b/src/XKCP/lib/low/KeccakP-1600-times4/SIMD256/KeccakP-1600-times4-SIMD256.c @@ -45,7 +45,7 @@ typedef __m256i V256; #if defined(KeccakP1600times4_useAVX2) #define ANDnu256(a, b) _mm256_andnot_si256(a, b) #define CONST256(a) _mm256_load_si256((const V256 *)&(a)) - #define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a)) + #define CONST256_64(a) _mm256_set1_epi64x(a) #define LOAD256(a) _mm256_load_si256((const V256 *)&(a)) #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a)) #define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d)) @@ -56,13 +56,13 @@ static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141 static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b) #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b) - #define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v) + #define STORE2_128(ah, al, v) _mm256_storeu2_m128i(&(ah), &(al), v) #define XOR256(a, b) _mm256_xor_si256(a, b) #define XOReq256(a, b) a = _mm256_xor_si256(a, b) #define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) #define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) - #define PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c) - #define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c) + #define PERM128( a, b, c ) _mm256_permute2f128_si256((a), (b), c) + #define SHUFFLE64( a, b, c ) _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), c)) #define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \ lanesH01 = UNPACKH( lanes0, lanes1 ), \ diff --git a/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forHaswell/KeccakP-1600-opt64.s b/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forHaswell/KeccakP-1600-opt64.s index 2712a48ae6b777e16ac992c375cd507712b3df60..31826dbb8231060d33d3316a026af1fe2ae381e0 100755 --- a/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forHaswell/KeccakP-1600-opt64.s +++ b/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forHaswell/KeccakP-1600-opt64.s @@ -13762,4 +13762,4 @@ KeccakF1600_FastLoop_Absorb: .size KeccakF1600_FastLoop_Absorb, .-KeccakF1600_FastLoop_Absorb .p2align 4,,15 .ident "GCC: (SUSE Linux) 4.7.4" -# .section .note.GNU-stack,"",@progbits + .section .note.GNU-stack,"",@progbits diff --git a/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forSandyBridge/KeccakP-1600-opt64.s b/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forSandyBridge/KeccakP-1600-opt64.s index be377b4c23a78b534c432309a7656f3bdd5754f5..d36c3fd0b3d0c58f3e94815b17f357a1846705d4 100755 --- a/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forSandyBridge/KeccakP-1600-opt64.s +++ b/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forSandyBridge/KeccakP-1600-opt64.s @@ -1,9 +1,8 @@ .file "KeccakP-1600-opt64.c" .text - .section .text$KeccakP1600_StaticInitialize,"x" .p2align 4,,15 .globl KeccakP1600_StaticInitialize - #.type KeccakP1600_StaticInitialize, @function + .type KeccakP1600_StaticInitialize, @function KeccakP1600_StaticInitialize: .LFB22: .cfi_startproc @@ -11,11 +10,10 @@ KeccakP1600_StaticInitialize: ret .cfi_endproc .LFE22: - #.size KeccakP1600_StaticInitialize, .-KeccakP1600_StaticInitialize - .section .text$KeccakP1600_Initialize,"x" + .size KeccakP1600_StaticInitialize, .-KeccakP1600_StaticInitialize .p2align 4,,15 .globl KeccakP1600_Initialize - #.type KeccakP1600_Initialize, @function + .type KeccakP1600_Initialize, @function KeccakP1600_Initialize: .LFB23: .cfi_startproc @@ -79,10 +77,10 @@ KeccakP1600_Initialize: jmp .L5 .cfi_endproc .LFE23: - #.size KeccakP1600_Initialize, .-KeccakP1600_Initialize + .size KeccakP1600_Initialize, .-KeccakP1600_Initialize .p2align 4,,15 .globl KeccakP1600_AddBytesInLane - #.type KeccakP1600_AddBytesInLane, @function + .type KeccakP1600_AddBytesInLane, @function KeccakP1600_AddBytesInLane: .LFB24: .cfi_startproc @@ -131,10 +129,10 @@ KeccakP1600_AddBytesInLane: jmp .L34 .cfi_endproc .LFE24: - #.size KeccakP1600_AddBytesInLane, .-KeccakP1600_AddBytesInLane + .size KeccakP1600_AddBytesInLane, .-KeccakP1600_AddBytesInLane .p2align 4,,15 .globl KeccakP1600_AddLanes - #.type KeccakP1600_AddLanes, @function + .type KeccakP1600_AddLanes, @function KeccakP1600_AddLanes: .LFB25: .cfi_startproc @@ -244,10 +242,10 @@ KeccakP1600_AddLanes: jmp .L45 .cfi_endproc .LFE25: - #.size KeccakP1600_AddLanes, .-KeccakP1600_AddLanes + .size KeccakP1600_AddLanes, .-KeccakP1600_AddLanes .p2align 4,,15 .globl KeccakP1600_AddByte - #.type KeccakP1600_AddByte, @function + .type KeccakP1600_AddByte, @function KeccakP1600_AddByte: .LFB26: .cfi_startproc @@ -261,10 +259,10 @@ KeccakP1600_AddByte: ret .cfi_endproc .LFE26: - #.size KeccakP1600_AddByte, .-KeccakP1600_AddByte + .size KeccakP1600_AddByte, .-KeccakP1600_AddByte .p2align 4,,15 .globl KeccakP1600_AddBytes - #.type KeccakP1600_AddBytes, @function + .type KeccakP1600_AddBytes, @function KeccakP1600_AddBytes: .LFB27: .cfi_startproc @@ -377,10 +375,10 @@ KeccakP1600_AddBytes: jmp .L66 .cfi_endproc .LFE27: - #.size KeccakP1600_AddBytes, .-KeccakP1600_AddBytes + .size KeccakP1600_AddBytes, .-KeccakP1600_AddBytes .p2align 4,,15 .globl KeccakP1600_OverwriteBytesInLane - #.type KeccakP1600_OverwriteBytesInLane, @function + .type KeccakP1600_OverwriteBytesInLane, @function KeccakP1600_OverwriteBytesInLane: .LFB28: .cfi_startproc @@ -430,10 +428,10 @@ KeccakP1600_OverwriteBytesInLane: jmp memcpy .cfi_endproc .LFE28: - #.size KeccakP1600_OverwriteBytesInLane, .-KeccakP1600_OverwriteBytesInLane + .size KeccakP1600_OverwriteBytesInLane, .-KeccakP1600_OverwriteBytesInLane .p2align 4,,15 .globl KeccakP1600_OverwriteLanes - #.type KeccakP1600_OverwriteLanes, @function + .type KeccakP1600_OverwriteLanes, @function KeccakP1600_OverwriteLanes: .LFB29: .cfi_startproc @@ -478,10 +476,10 @@ KeccakP1600_OverwriteLanes: ret .cfi_endproc .LFE29: - #.size KeccakP1600_OverwriteLanes, .-KeccakP1600_OverwriteLanes + .size KeccakP1600_OverwriteLanes, .-KeccakP1600_OverwriteLanes .p2align 4,,15 .globl KeccakP1600_OverwriteBytes - #.type KeccakP1600_OverwriteBytes, @function + .type KeccakP1600_OverwriteBytes, @function KeccakP1600_OverwriteBytes: .LFB30: .cfi_startproc @@ -695,10 +693,10 @@ KeccakP1600_OverwriteBytes: jmp .L110 .cfi_endproc .LFE30: - #.size KeccakP1600_OverwriteBytes, .-KeccakP1600_OverwriteBytes + .size KeccakP1600_OverwriteBytes, .-KeccakP1600_OverwriteBytes .p2align 4,,15 .globl KeccakP1600_OverwriteWithZeroes - #.type KeccakP1600_OverwriteWithZeroes, @function + .type KeccakP1600_OverwriteWithZeroes, @function KeccakP1600_OverwriteWithZeroes: .LFB31: .cfi_startproc @@ -774,10 +772,10 @@ KeccakP1600_OverwriteWithZeroes: ret .cfi_endproc .LFE31: - #.size KeccakP1600_OverwriteWithZeroes, .-KeccakP1600_OverwriteWithZeroes + .size KeccakP1600_OverwriteWithZeroes, .-KeccakP1600_OverwriteWithZeroes .p2align 4,,15 .globl KeccakP1600_Permute_24rounds - #.type KeccakP1600_Permute_24rounds, @function + .type KeccakP1600_Permute_24rounds, @function KeccakP1600_Permute_24rounds: .LFB32: .cfi_startproc @@ -8217,10 +8215,10 @@ KeccakP1600_Permute_24rounds: ret .cfi_endproc .LFE32: - #.size KeccakP1600_Permute_24rounds, .-KeccakP1600_Permute_24rounds + .size KeccakP1600_Permute_24rounds, .-KeccakP1600_Permute_24rounds .p2align 4,,15 .globl KeccakP1600_Permute_12rounds - #.type KeccakP1600_Permute_12rounds, @function + .type KeccakP1600_Permute_12rounds, @function KeccakP1600_Permute_12rounds: .LFB33: .cfi_startproc @@ -11945,10 +11943,10 @@ KeccakP1600_Permute_12rounds: ret .cfi_endproc .LFE33: - #.size KeccakP1600_Permute_12rounds, .-KeccakP1600_Permute_12rounds + .size KeccakP1600_Permute_12rounds, .-KeccakP1600_Permute_12rounds .p2align 4,,15 .globl KeccakP1600_ExtractBytesInLane - #.type KeccakP1600_ExtractBytesInLane, @function + .type KeccakP1600_ExtractBytesInLane, @function KeccakP1600_ExtractBytesInLane: .LFB34: .cfi_startproc @@ -11991,10 +11989,10 @@ KeccakP1600_ExtractBytesInLane: jmp .L162 .cfi_endproc .LFE34: - #.size KeccakP1600_ExtractBytesInLane, .-KeccakP1600_ExtractBytesInLane + .size KeccakP1600_ExtractBytesInLane, .-KeccakP1600_ExtractBytesInLane .p2align 4,,15 .globl KeccakP1600_ExtractLanes - #.type KeccakP1600_ExtractLanes, @function + .type KeccakP1600_ExtractLanes, @function KeccakP1600_ExtractLanes: .LFB35: .cfi_startproc @@ -12038,10 +12036,10 @@ KeccakP1600_ExtractLanes: ret .cfi_endproc .LFE35: - #.size KeccakP1600_ExtractLanes, .-KeccakP1600_ExtractLanes + .size KeccakP1600_ExtractLanes, .-KeccakP1600_ExtractLanes .p2align 4,,15 .globl KeccakP1600_ExtractBytes - #.type KeccakP1600_ExtractBytes, @function + .type KeccakP1600_ExtractBytes, @function KeccakP1600_ExtractBytes: .LFB36: .cfi_startproc @@ -12204,10 +12202,10 @@ KeccakP1600_ExtractBytes: jmp .L180 .cfi_endproc .LFE36: - #.size KeccakP1600_ExtractBytes, .-KeccakP1600_ExtractBytes + .size KeccakP1600_ExtractBytes, .-KeccakP1600_ExtractBytes .p2align 4,,15 .globl KeccakP1600_ExtractAndAddBytesInLane - #.type KeccakP1600_ExtractAndAddBytesInLane, @function + .type KeccakP1600_ExtractAndAddBytesInLane, @function KeccakP1600_ExtractAndAddBytesInLane: .LFB37: .cfi_startproc @@ -12263,10 +12261,10 @@ KeccakP1600_ExtractAndAddBytesInLane: jmp .L193 .cfi_endproc .LFE37: - #.size KeccakP1600_ExtractAndAddBytesInLane, .-KeccakP1600_ExtractAndAddBytesInLane + .size KeccakP1600_ExtractAndAddBytesInLane, .-KeccakP1600_ExtractAndAddBytesInLane .p2align 4,,15 .globl KeccakP1600_ExtractAndAddLanes - #.type KeccakP1600_ExtractAndAddLanes, @function + .type KeccakP1600_ExtractAndAddLanes, @function KeccakP1600_ExtractAndAddLanes: .LFB38: .cfi_startproc @@ -12352,10 +12350,10 @@ KeccakP1600_ExtractAndAddLanes: jmp .L213 .cfi_endproc .LFE38: - #.size KeccakP1600_ExtractAndAddLanes, .-KeccakP1600_ExtractAndAddLanes + .size KeccakP1600_ExtractAndAddLanes, .-KeccakP1600_ExtractAndAddLanes .p2align 4,,15 .globl KeccakP1600_ExtractAndAddBytes - #.type KeccakP1600_ExtractAndAddBytes, @function + .type KeccakP1600_ExtractAndAddBytes, @function KeccakP1600_ExtractAndAddBytes: .LFB39: .cfi_startproc @@ -12562,10 +12560,10 @@ KeccakP1600_ExtractAndAddBytes: jmp .L243 .cfi_endproc .LFE39: - #.size KeccakP1600_ExtractAndAddBytes, .-KeccakP1600_ExtractAndAddBytes + .size KeccakP1600_ExtractAndAddBytes, .-KeccakP1600_ExtractAndAddBytes .p2align 4,,15 .globl KeccakF1600_FastLoop_Absorb - #.type KeccakF1600_FastLoop_Absorb, @function + .type KeccakF1600_FastLoop_Absorb, @function KeccakF1600_FastLoop_Absorb: .LFB40: .cfi_startproc @@ -20324,6 +20322,6 @@ KeccakF1600_FastLoop_Absorb: jmp .L255 .cfi_endproc .LFE40: - #.size KeccakF1600_FastLoop_Absorb, .-KeccakF1600_FastLoop_Absorb -# .ident "GCC: (SUSE Linux) 4.7.4" -# .section .note.GNU-stack,"",@progbits + .size KeccakF1600_FastLoop_Absorb, .-KeccakF1600_FastLoop_Absorb + .ident "GCC: (SUSE Linux) 4.7.4" + .section .note.GNU-stack,"",@progbits diff --git a/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s b/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s index 2eea40444c356b7ec383ac27904ad9e789c1e77a..65aff45cb064fb1773bc06dea717e50cdd70c88d 100755 --- a/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s +++ b/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s @@ -22,7 +22,7 @@ .text # conditional assembly settings -.equ UseSIMD, 1 +.equ UseSIMD, 0 .equ InlinePerm, 1 # offsets in state @@ -474,7 +474,7 @@ .if InlinePerm == 1 mKeccakPermutation24 .else - callq KeccakP1600_Permute_24rounds + callq KeccakP1600_Permute_24rounds@PLT .endif .endm @@ -749,7 +749,7 @@ KeccakLaneComplementTable: .type KeccakP1600_OverwriteBytes, %function KeccakP1600_OverwriteBytes: addq arg3, arg1 - leaq KeccakLaneComplementTable, arg5 + leaq KeccakLaneComplementTable(%rip), arg5 addq arg3, arg5 subq $8, arg4 jc KeccakP1600_OverwriteBytes_Bytes @@ -786,7 +786,7 @@ KeccakP1600_OverwriteBytes_Exit: .global KeccakP1600_OverwriteWithZeroes .type KeccakP1600_OverwriteWithZeroes, %function KeccakP1600_OverwriteWithZeroes: - leaq KeccakLaneComplementTable, arg5 + leaq KeccakLaneComplementTable(%rip), arg5 subq $8, arg2 jc KeccakP1600_OverwriteWithZeroes_Bytes KeccakP1600_OverwriteWithZeroes_LanesLoop: @@ -821,7 +821,7 @@ KeccakP1600_OverwriteWithZeroes_Exit: .type KeccakP1600_ExtractBytes, %function KeccakP1600_ExtractBytes: addq arg3, arg1 - leaq KeccakLaneComplementTable, arg5 + leaq KeccakLaneComplementTable(%rip), arg5 addq arg3, arg5 subq $8, arg4 jc KeccakP1600_ExtractBytes_Bytes @@ -859,7 +859,7 @@ KeccakP1600_ExtractBytes_Exit: .type KeccakP1600_ExtractAndAddBytes, %function KeccakP1600_ExtractAndAddBytes: addq arg4, arg1 - leaq KeccakLaneComplementTable, arg6 + leaq KeccakLaneComplementTable(%rip), arg6 addq arg4, arg6 subq $8, arg5 jc KeccakP1600_ExtractAndAddBytes_Bytes @@ -980,7 +980,8 @@ KeccakP1600_Permute_Nrounds: movq rT2a, _su(rpStack) KeccakP1600_Permute_Nrounds_Dispatch: shlq $3, rT1 - jmp *KeccakP1600_Permute_NroundsTable-8(rT1) + leaq KeccakP1600_Permute_NroundsTable-8(%rip), rT2a + jmp *(rT1, rT2a) KeccakP1600_Permute_Nrounds24: mKeccakRound rpState, rpStack, 0x0000000000000001, 0 @@ -1174,13 +1175,13 @@ KeccakF1600_FastLoop_Absorb_VariableLaneCountLoop: shlq $3, arg4 movq arg3, arg2 # data pointer xorq arg3, arg3 # offset = 0 - callq KeccakP1600_AddBytes # (void *state, const unsigned char *data, unsigned int offset, unsigned int length) + callq KeccakP1600_AddBytes@PLT # (void *state, const unsigned char *data, unsigned int offset, unsigned int length) movq arg2, arg3 # updated data pointer movq 24(%rsp), rT1a # xor trailingBits xorq rT1a, (arg1) popq arg1 pushq arg3 - callq KeccakP1600_Permute_24rounds + callq KeccakP1600_Permute_24rounds@PLT popq arg3 popq arg2 popq arg4 diff --git a/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-shld-gas.s b/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-shld-gas.s index 9e3e573274c139f29f5301393330e64a3f9271e8..cf4a1b1e877e4864747521a93d60803fbe235d3e 100755 --- a/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-shld-gas.s +++ b/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-shld-gas.s @@ -474,7 +474,7 @@ .if InlinePerm == 1 mKeccakPermutation24 .else - callq KeccakP1600_Permute_24rounds + callq KeccakP1600_Permute_24rounds@PLT .endif .endm @@ -749,7 +749,7 @@ KeccakLaneComplementTable: .type KeccakP1600_OverwriteBytes, %function KeccakP1600_OverwriteBytes: addq arg3, arg1 - leaq KeccakLaneComplementTable, arg5 + leaq KeccakLaneComplementTable(%rip), arg5 addq arg3, arg5 subq $8, arg4 jc KeccakP1600_OverwriteBytes_Bytes @@ -786,7 +786,7 @@ KeccakP1600_OverwriteBytes_Exit: .global KeccakP1600_OverwriteWithZeroes .type KeccakP1600_OverwriteWithZeroes, %function KeccakP1600_OverwriteWithZeroes: - leaq KeccakLaneComplementTable, arg5 + leaq KeccakLaneComplementTable(%rip), arg5 subq $8, arg2 jc KeccakP1600_OverwriteWithZeroes_Bytes KeccakP1600_OverwriteWithZeroes_LanesLoop: @@ -821,7 +821,7 @@ KeccakP1600_OverwriteWithZeroes_Exit: .type KeccakP1600_ExtractBytes, %function KeccakP1600_ExtractBytes: addq arg3, arg1 - leaq KeccakLaneComplementTable, arg5 + leaq KeccakLaneComplementTable(%rip), arg5 addq arg3, arg5 subq $8, arg4 jc KeccakP1600_ExtractBytes_Bytes @@ -859,7 +859,7 @@ KeccakP1600_ExtractBytes_Exit: .type KeccakP1600_ExtractAndAddBytes, %function KeccakP1600_ExtractAndAddBytes: addq arg4, arg1 - leaq KeccakLaneComplementTable, arg6 + leaq KeccakLaneComplementTable(%rip), arg6 addq arg4, arg6 subq $8, arg5 jc KeccakP1600_ExtractAndAddBytes_Bytes @@ -980,7 +980,8 @@ KeccakP1600_Permute_Nrounds: movq rT2a, _su(rpStack) KeccakP1600_Permute_Nrounds_Dispatch: shlq $3, rT1 - jmp *KeccakP1600_Permute_NroundsTable-8(rT1) + leaq KeccakP1600_Permute_NroundsTable-8(%rip), rT2a + jmp *(rT1, rT2a) KeccakP1600_Permute_Nrounds24: mKeccakRound rpState, rpStack, 0x0000000000000001, 0 @@ -1174,13 +1175,13 @@ KeccakF1600_FastLoop_Absorb_VariableLaneCountLoop: shlq $3, arg4 movq arg3, arg2 # data pointer xorq arg3, arg3 # offset = 0 - callq KeccakP1600_AddBytes # (void *state, const unsigned char *data, unsigned int offset, unsigned int length) + callq KeccakP1600_AddBytes@PLT # (void *state, const unsigned char *data, unsigned int offset, unsigned int length) movq arg2, arg3 # updated data pointer movq 24(%rsp), rT1a # xor trailingBits xorq rT1a, (arg1) popq arg1 pushq arg3 - callq KeccakP1600_Permute_24rounds + callq KeccakP1600_Permute_24rounds@PLT popq arg3 popq arg2 popq arg4 diff --git a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-armcc.s b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-armcc.s index 403058ef294de23e976125af6ba24781b07d46b1..b0e6dc83048ba6abed8344483d02a7ea3a584d55 100755 --- a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-armcc.s +++ b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-armcc.s @@ -224,7 +224,7 @@ Xoodoo_ExtractAndAddBytes_Exit Xoodoo_Permute_6rounds PROC vpush {q4-q5} vldmia r0, {q0-q2} - ldr r1, = _rc6 + adr r1, _rc6 mRound mRound mRound @@ -260,7 +260,7 @@ _rc6 Xoodoo_Permute_12rounds PROC vpush {q4-q5} vldmia r0, {q0-q2} - ldr r1, = _rc12 + adr r1, _rc12 mRound mRound mRound @@ -371,7 +371,7 @@ Xoofff_CompressFastLoop PROC vld1.32 {q11}, [r1] Xoofff_CompressFastLoop_Loop vld1.32 {q0,q1}, [r2]! ; get input - ldr r1, = _rc6 + adr r1, _rc6b vld1.32 {q2}, [r2]! veor.32 q0, q0, q6 veor.32 q1, q1, q7 @@ -403,7 +403,15 @@ Xoofff_CompressFastLoop_Loop sub r0, r2, r6 ; return number of bytes processed vpop {q4-q7} pop {r4,r5,r6,pc} + LTORG align 8 +_rc6b + dcq 0x00000060 + dcq 0x0000002C + dcq 0x00000380 + dcq 0x000000F0 + dcq 0x000001A0 + dcq 0x00000012 ENDP ; ---------------------------------------------------------------------------- @@ -425,7 +433,7 @@ Xoofff_ExpandFastLoop_Loop vmov q0, q9 vmov q1, q10 vmov q2, q11 - ldr r1, = _rc6 + adr r1, _rc6b mRound ; permutation mRound mRound diff --git a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-gcc.s b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-gcc.s index bd6e75667e60f22dab156e082418644f5b265002..9b523ca0030607ac4e6c4a91498e22fc462cfc76 100755 --- a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-gcc.s +++ b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-gcc.s @@ -230,7 +230,7 @@ Xoodoo_ExtractAndAddBytes_Exit: Xoodoo_Permute_6rounds: vpush {q4-q5} vldmia r0, {q0-q2} - ldr r1, = _rc6 + adr r1, _rc6 mRound mRound mRound @@ -267,7 +267,7 @@ _rc6: Xoodoo_Permute_12rounds: vpush {q4-q5} vldmia r0, {q0-q2} - ldr r1, = _rc12 + adr r1, _rc12 mRound mRound mRound @@ -380,7 +380,7 @@ Xoofff_CompressFastLoop: vld1.32 {q11}, [r1] Xoofff_CompressFastLoop_Loop: vld1.32 {q0,q1}, [r2]! @ get input - ldr r1, = _rc6 + adr r1, _rc6b vld1.32 {q2}, [r2]! veor.32 q0, q0, q6 veor.32 q1, q1, q7 @@ -412,7 +412,15 @@ Xoofff_CompressFastLoop_Loop: sub r0, r2, r6 @ return number of bytes processed vpop {q4-q7} pop {r4,r5,r6,pc} + .ltorg .align 8 +_rc6b: + .quad 0x00000060 + .quad 0x0000002C + .quad 0x00000380 + .quad 0x000000F0 + .quad 0x000001A0 + .quad 0x00000012 @ ---------------------------------------------------------------------------- @@ -435,7 +443,7 @@ Xoofff_ExpandFastLoop_Loop: vmov q0, q9 vmov q1, q10 vmov q2, q11 - ldr r1, = _rc6 + adr r1, _rc6b mRound @ permutation mRound mRound diff --git a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-armcc.s b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-armcc.s index df5c5d31aaaa765ac8e477d9bf5ffa7c7dcc5b24..b4937f0af08e9c6264a3560d564de008dd1a8ac7 100755 --- a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-armcc.s +++ b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-armcc.s @@ -64,7 +64,7 @@ Xoodyak_Rhash equ 16 ; align 8 Xoodoo_Permute_12roundsAsm PROC - ldr r1, =_rc12 + adr r1, _rc12 mRound mRound mRound diff --git a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-gcc.s b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-gcc.s index 2b9b018f057b36ed1a85b07dd71012bad6e6182c..d701e08b1c9b029524a9faa71a303c0b65c95090 100755 --- a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-gcc.s +++ b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-gcc.s @@ -64,7 +64,7 @@ .align 8 .type Xoodoo_Permute_12roundsAsm, %function; Xoodoo_Permute_12roundsAsm: - ldr r1, =_rc12 + adr r1, _rc12 mRound mRound mRound diff --git a/src/XKCP/tests/UnitTests/timing.h b/src/XKCP/tests/UnitTests/timing.h index 2fdd0c141389272e7a51de54120517f2463efcb2..eed7291989f370aaea701be158ddd54f4d9a883b 100755 --- a/src/XKCP/tests/UnitTests/timing.h +++ b/src/XKCP/tests/UnitTests/timing.h @@ -80,7 +80,7 @@ static uint_32t HiResTime(void) /* return the current value of time st uint32_t cycle_count; asm volatile("MRS %0, pmevcntr0_el0" : "=r" (cycle_count)); return cycle_count; - #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A_) + #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) /* Implement Aarch32 bits on PMU : Tested on Cortex a-53 */ uint32_t cycle_count; asm volatile("MRC p15, 0, %0, c9, c13, 0 \t\n" : "=r"(cycle_count)); @@ -115,10 +115,10 @@ static uint_32t calibrate() uint32_t r = 0; asm volatile("mrs %0, pmcntenset_el0" : "=r" (r)); asm volatile("msr pmcntenset_el0, %0" : : "r" (r|1)); - #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A_) + #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) /* Implement Aarch32 bits on PMU : Tested on Cortex a-53 */ /* Enable counters in Control Register and reset cycle count and event count */ - printf("PMU32 Enable.. \n"); + printf("PMU32 Enabled... \n"); asm volatile("MCR p15, 0, %0, c9, c12, 0" : : "r"(0x00000007)); /* Event counter selection register, which counter to access */ asm volatile("MCR p15, 0, %0, c9, c12, 5" : : "r"(0x0));