diff --git a/src/XKCP/lib/LowLevel.build b/src/XKCP/lib/LowLevel.build
index f504190eedab500cd0a9291a3ef6a5956ef82534..d6d14985844c52d6b5a58834181623af25db2828 100755
--- a/src/XKCP/lib/LowLevel.build
+++ b/src/XKCP/lib/LowLevel.build
@@ -62,11 +62,9 @@ The fragments below allow to select the desired implementation of the permutatio
 * optimized1600lcu6: same as optimized1600u6 but using the lane complementing technique, which is useful for platforms that do not have a "and not" instruction
 * optimized1600lcufull: same as optimized1600lcu6 but with all rounds unrolled
 * optimized1600lcufullshld: same as optimized1600lcufull but with the rotation implementation with the 'shld' instruction, as it is faster on some platforms (e.g., SandyBridge)
-
 * optimized1600AsmX86-64: an assembly-optimized implementation for x86_64
 * optimized1600AsmX86-64shld: same as optimized1600AsmX86-64, but with the 'shld' instruction
 * optimized1600AsmX86-64Apple: same as optimized1600AsmX86-64, but with a syntax that works better on some Apple platforms
-
 * optimized1600AVX2: an optimized implementation taking advantage of the AVX2 instruction set
 * optimized1600AVX512a: an optimized implementation taking advantage of the AVX512 instruction set (in assembler)
 * optimized1600AVX512c: an optimized implementation taking advantage of the AVX512 instruction set (in C)
diff --git a/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.c b/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.c
index 3336f68a36f9e6b8c66b909cc8f3187161becd7f..6dc94232cc3102ffaa11d2b01aaa824d9ebc1b9f 100755
--- a/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.c
+++ b/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.c
@@ -16,25 +16,6 @@ int SHAKE128(unsigned char *output, size_t outputByteLen, const unsigned char *i
     return KeccakWidth1600_Sponge(1344, 256, input, inputByteLen, 0x1F, output, outputByteLen);
 }
 
-void SHAKE128_InitAbsorb(Keccak_HashInstance *ks, const unsigned char *input, size_t inputByteLen)
-{
-    Keccak_HashInitialize_SHAKE128( ks );
-    Keccak_HashUpdate( &ks, input, inputByteLen * 8 );
-    Keccak_HashFinal( &ks, input );
-}
-
-void SHAKE256_InitAbsorb(Keccak_HashInstance *ks, const unsigned char *input, size_t inputByteLen)
-{
-    Keccak_HashInitialize_SHAKE256( ks );
-    Keccak_HashUpdate( &ks, input, inputByteLen * 8 );
-    Keccak_HashFinal( &ks, input );
-}
-
-void KECCAK_HashSqueeze(Keccak_HashInstance *ks, const unsigned char *out, size_t outByteLen)
-{
-    Keccak_HashSqueeze( &ks, (unsigned char *)out, outByteLen * 8 );
-}
-
 int SHAKE256(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen)
 {
     return KeccakWidth1600_Sponge(1088, 512, input, inputByteLen, 0x1F, output, outputByteLen);
diff --git a/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.h b/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.h
index a2fa94413903d51d0ccc5a70872ab1344e0d548b..b3c68ae842fc4f9667a4769a7b9f5e3d6f2deec8 100755
--- a/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.h
+++ b/src/XKCP/lib/high/Keccak/FIPS202/SimpleFIPS202.h
@@ -12,7 +12,6 @@ http://creativecommons.org/publicdomain/zero/1.0/
 #ifndef _SimpleFIPS202_h_
 #define _SimpleFIPS202_h_
 
-#include "KeccakHash.h"
 #include "KeccakSpongeWidth1600.h"
 #include <string.h>
 
@@ -23,6 +22,7 @@ http://creativecommons.org/publicdomain/zero/1.0/
   * @param  inputByteLen    The length of the input message in bytes.
   * @return 0 if successful, 1 otherwise.
   */
+int SHAKE128(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen);
 
 /** Implementation of the SHAKE256 extendable output function (XOF) [FIPS 202].
   * @param  output          Pointer to the output buffer.
@@ -31,6 +31,7 @@ http://creativecommons.org/publicdomain/zero/1.0/
   * @param  inputByteLen    The length of the input message in bytes.
   * @return 0 if successful, 1 otherwise.
   */
+int SHAKE256(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen);
 
 /** Implementation of SHA3-224 [FIPS 202].
   * @param  output          Pointer to the output buffer (28 bytes).
@@ -38,6 +39,7 @@ http://creativecommons.org/publicdomain/zero/1.0/
   * @param  inputByteLen    The length of the input message in bytes.
   * @return 0 if successful, 1 otherwise.
   */
+int SHA3_224(unsigned char *output, const unsigned char *input, size_t inputByteLen);
 
 /** Implementation of SHA3-256 [FIPS 202].
   * @param  output          Pointer to the output buffer (32 bytes).
@@ -45,6 +47,7 @@ http://creativecommons.org/publicdomain/zero/1.0/
   * @param  inputByteLen    The length of the input message in bytes.
   * @return 0 if successful, 1 otherwise.
   */
+int SHA3_256(unsigned char *output, const unsigned char *input, size_t inputByteLen);
 
 /** Implementation of SHA3-384 [FIPS 202].
   * @param  output          Pointer to the output buffer (48 bytes).
@@ -52,6 +55,7 @@ http://creativecommons.org/publicdomain/zero/1.0/
   * @param  inputByteLen    The length of the input message in bytes.
   * @return 0 if successful, 1 otherwise.
   */
+int SHA3_384(unsigned char *output, const unsigned char *input, size_t inputByteLen);
 
 /** Implementation of SHA3-512 [FIPS 202].
   * @param  output          Pointer to the output buffer (64 bytes).
@@ -59,17 +63,6 @@ http://creativecommons.org/publicdomain/zero/1.0/
   * @param  inputByteLen    The length of the input message in bytes.
   * @return 0 if successful, 1 otherwise.
   */
-
-int SHAKE128(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen);
-int SHAKE256(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen);
-int SHA3_224(unsigned char *output, const unsigned char *input, size_t inputByteLen);
-int SHA3_256(unsigned char *output, const unsigned char *input, size_t inputByteLen);
-int SHA3_384(unsigned char *output, const unsigned char *input, size_t inputByteLen);
 int SHA3_512(unsigned char *output, const unsigned char *input, size_t inputByteLen);
 
-void SHAKE128_InitAbsorb( Keccak_HashInstance *ks, const unsigned char *input, size_t inputByteLen );
-void SHAKE256_InitAbsorb( Keccak_HashInstance *ks, const unsigned char *input, size_t inputByteLen );
-void KECCAK_HashSqueeze(Keccak_HashInstance *ks, const unsigned char *out, size_t outByteLen);
-
-
 #endif
diff --git a/src/XKCP/lib/high/Keccak/KeccakSponge-common.h b/src/XKCP/lib/high/Keccak/KeccakSponge-common.h
index e2b73e896ab18284e31ce2b29694a0e150d143d9..2939f42c76150f8f2050a89a46fdae9713b0ece0 100755
--- a/src/XKCP/lib/high/Keccak/KeccakSponge-common.h
+++ b/src/XKCP/lib/high/Keccak/KeccakSponge-common.h
@@ -18,7 +18,7 @@ http://creativecommons.org/publicdomain/zero/1.0/
 #include "align.h"
 
 #define KCP_DeclareSpongeStructure(prefix, size, alignment) \
-    ALIGN(64) typedef struct prefix##_SpongeInstanceStruct { \
+    ALIGN(alignment) typedef struct prefix##_SpongeInstanceStruct { \
         unsigned char state[size]; \
         unsigned int rate; \
         unsigned int byteIOIndex; \
diff --git a/src/XKCP/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c b/src/XKCP/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c
index 47d4ca5de60b39c77cdbdbea5fc8533de38e7eec..99e59c69604f3a7659e1a57ed3db12fb3c99cd5a 100755
--- a/src/XKCP/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c
+++ b/src/XKCP/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c
@@ -21,16 +21,6 @@ Please refer to LowLevel.build for the exact list of other files it must be comb
 #include <stdlib.h>
 #include <string.h>
 #include <x86intrin.h>
-//    #include <x86intrin.h>
-
-//#include <smmintrin.h>
-//#include <wmmintrin.h>
-//#include <immintrin.h>
-//#include <emmintrin.h>
-
-//    #include <mmintrin.h>
-//    #include <emmintrin.h>
-
 #include "align.h"
 #include "KeccakP-1600-times2-SnP.h"
 #include "SIMD128-config.h"
diff --git a/src/XKCP/lib/low/KeccakP-1600-times4/SIMD256/KeccakP-1600-times4-SIMD256.c b/src/XKCP/lib/low/KeccakP-1600-times4/SIMD256/KeccakP-1600-times4-SIMD256.c
index 75c692ea26a8b4bd459480a77723e663f5157ed8..cea7584b2815439a8901d3490822c8bbe1874dbb 100755
--- a/src/XKCP/lib/low/KeccakP-1600-times4/SIMD256/KeccakP-1600-times4-SIMD256.c
+++ b/src/XKCP/lib/low/KeccakP-1600-times4/SIMD256/KeccakP-1600-times4-SIMD256.c
@@ -45,7 +45,7 @@ typedef __m256i V256;
 #if defined(KeccakP1600times4_useAVX2)
     #define ANDnu256(a, b)          _mm256_andnot_si256(a, b)
     #define CONST256(a)             _mm256_load_si256((const V256 *)&(a))
-    #define CONST256_64(a)          (V256)_mm256_broadcast_sd((const double*)(&a))
+    #define CONST256_64(a)          _mm256_set1_epi64x(a)
     #define LOAD256(a)              _mm256_load_si256((const V256 *)&(a))
     #define LOAD256u(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD4_64(a, b, c, d)    _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
@@ -56,13 +56,13 @@ static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141
 static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
     #define STORE256(a, b)          _mm256_store_si256((V256 *)&(a), b)
     #define STORE256u(a, b)         _mm256_storeu_si256((V256 *)&(a), b)
-    #define STORE2_128(ah, al, v)   _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v)
+    #define STORE2_128(ah, al, v)   _mm256_storeu2_m128i(&(ah), &(al), v)
     #define XOR256(a, b)            _mm256_xor_si256(a, b)
     #define XOReq256(a, b)          a = _mm256_xor_si256(a, b)
     #define UNPACKL( a, b )         _mm256_unpacklo_epi64((a), (b))
     #define UNPACKH( a, b )         _mm256_unpackhi_epi64((a), (b))
-    #define PERM128( a, b, c )      (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c)
-    #define SHUFFLE64( a, b, c )    (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c)
+    #define PERM128( a, b, c )      _mm256_permute2f128_si256((a), (b), c)
+    #define SHUFFLE64( a, b, c )    _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), c))
 
     #define UNINTLEAVE()            lanesL01 = UNPACKL( lanes0, lanes1 ),                   \
                                     lanesH01 = UNPACKH( lanes0, lanes1 ),                   \
diff --git a/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forHaswell/KeccakP-1600-opt64.s b/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forHaswell/KeccakP-1600-opt64.s
index 2712a48ae6b777e16ac992c375cd507712b3df60..31826dbb8231060d33d3316a026af1fe2ae381e0 100755
--- a/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forHaswell/KeccakP-1600-opt64.s
+++ b/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forHaswell/KeccakP-1600-opt64.s
@@ -13762,4 +13762,4 @@ KeccakF1600_FastLoop_Absorb:
     .size   KeccakF1600_FastLoop_Absorb, .-KeccakF1600_FastLoop_Absorb
     .p2align 4,,15
     .ident  "GCC: (SUSE Linux) 4.7.4"
-#    .section    .note.GNU-stack,"",@progbits
+    .section    .note.GNU-stack,"",@progbits
diff --git a/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forSandyBridge/KeccakP-1600-opt64.s b/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forSandyBridge/KeccakP-1600-opt64.s
index be377b4c23a78b534c432309a7656f3bdd5754f5..d36c3fd0b3d0c58f3e94815b17f357a1846705d4 100755
--- a/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forSandyBridge/KeccakP-1600-opt64.s
+++ b/src/XKCP/lib/low/KeccakP-1600/Optimized64/CompiledByGCC474forSandyBridge/KeccakP-1600-opt64.s
@@ -1,9 +1,8 @@
     .file   "KeccakP-1600-opt64.c"
     .text
-   	.section	.text$KeccakP1600_StaticInitialize,"x"
     .p2align 4,,15
     .globl  KeccakP1600_StaticInitialize
-    #.type   KeccakP1600_StaticInitialize, @function
+    .type   KeccakP1600_StaticInitialize, @function
 KeccakP1600_StaticInitialize:
 .LFB22:
     .cfi_startproc
@@ -11,11 +10,10 @@ KeccakP1600_StaticInitialize:
     ret
     .cfi_endproc
 .LFE22:
-    #.size   KeccakP1600_StaticInitialize, .-KeccakP1600_StaticInitialize
-   	.section	.text$KeccakP1600_Initialize,"x"
+    .size   KeccakP1600_StaticInitialize, .-KeccakP1600_StaticInitialize
     .p2align 4,,15
     .globl  KeccakP1600_Initialize
-    #.type   KeccakP1600_Initialize, @function
+    .type   KeccakP1600_Initialize, @function
 KeccakP1600_Initialize:
 .LFB23:
     .cfi_startproc
@@ -79,10 +77,10 @@ KeccakP1600_Initialize:
     jmp .L5
     .cfi_endproc
 .LFE23:
-    #.size   KeccakP1600_Initialize, .-KeccakP1600_Initialize
+    .size   KeccakP1600_Initialize, .-KeccakP1600_Initialize
     .p2align 4,,15
     .globl  KeccakP1600_AddBytesInLane
-    #.type   KeccakP1600_AddBytesInLane, @function
+    .type   KeccakP1600_AddBytesInLane, @function
 KeccakP1600_AddBytesInLane:
 .LFB24:
     .cfi_startproc
@@ -131,10 +129,10 @@ KeccakP1600_AddBytesInLane:
     jmp .L34
     .cfi_endproc
 .LFE24:
-    #.size   KeccakP1600_AddBytesInLane, .-KeccakP1600_AddBytesInLane
+    .size   KeccakP1600_AddBytesInLane, .-KeccakP1600_AddBytesInLane
     .p2align 4,,15
     .globl  KeccakP1600_AddLanes
-    #.type   KeccakP1600_AddLanes, @function
+    .type   KeccakP1600_AddLanes, @function
 KeccakP1600_AddLanes:
 .LFB25:
     .cfi_startproc
@@ -244,10 +242,10 @@ KeccakP1600_AddLanes:
     jmp .L45
     .cfi_endproc
 .LFE25:
-    #.size   KeccakP1600_AddLanes, .-KeccakP1600_AddLanes
+    .size   KeccakP1600_AddLanes, .-KeccakP1600_AddLanes
     .p2align 4,,15
     .globl  KeccakP1600_AddByte
-    #.type   KeccakP1600_AddByte, @function
+    .type   KeccakP1600_AddByte, @function
 KeccakP1600_AddByte:
 .LFB26:
     .cfi_startproc
@@ -261,10 +259,10 @@ KeccakP1600_AddByte:
     ret
     .cfi_endproc
 .LFE26:
-    #.size   KeccakP1600_AddByte, .-KeccakP1600_AddByte
+    .size   KeccakP1600_AddByte, .-KeccakP1600_AddByte
     .p2align 4,,15
     .globl  KeccakP1600_AddBytes
-    #.type   KeccakP1600_AddBytes, @function
+    .type   KeccakP1600_AddBytes, @function
 KeccakP1600_AddBytes:
 .LFB27:
     .cfi_startproc
@@ -377,10 +375,10 @@ KeccakP1600_AddBytes:
     jmp .L66
     .cfi_endproc
 .LFE27:
-    #.size   KeccakP1600_AddBytes, .-KeccakP1600_AddBytes
+    .size   KeccakP1600_AddBytes, .-KeccakP1600_AddBytes
     .p2align 4,,15
     .globl  KeccakP1600_OverwriteBytesInLane
-    #.type   KeccakP1600_OverwriteBytesInLane, @function
+    .type   KeccakP1600_OverwriteBytesInLane, @function
 KeccakP1600_OverwriteBytesInLane:
 .LFB28:
     .cfi_startproc
@@ -430,10 +428,10 @@ KeccakP1600_OverwriteBytesInLane:
     jmp memcpy
     .cfi_endproc
 .LFE28:
-    #.size   KeccakP1600_OverwriteBytesInLane, .-KeccakP1600_OverwriteBytesInLane
+    .size   KeccakP1600_OverwriteBytesInLane, .-KeccakP1600_OverwriteBytesInLane
     .p2align 4,,15
     .globl  KeccakP1600_OverwriteLanes
-    #.type   KeccakP1600_OverwriteLanes, @function
+    .type   KeccakP1600_OverwriteLanes, @function
 KeccakP1600_OverwriteLanes:
 .LFB29:
     .cfi_startproc
@@ -478,10 +476,10 @@ KeccakP1600_OverwriteLanes:
     ret
     .cfi_endproc
 .LFE29:
-    #.size   KeccakP1600_OverwriteLanes, .-KeccakP1600_OverwriteLanes
+    .size   KeccakP1600_OverwriteLanes, .-KeccakP1600_OverwriteLanes
     .p2align 4,,15
     .globl  KeccakP1600_OverwriteBytes
-    #.type   KeccakP1600_OverwriteBytes, @function
+    .type   KeccakP1600_OverwriteBytes, @function
 KeccakP1600_OverwriteBytes:
 .LFB30:
     .cfi_startproc
@@ -695,10 +693,10 @@ KeccakP1600_OverwriteBytes:
     jmp .L110
     .cfi_endproc
 .LFE30:
-    #.size   KeccakP1600_OverwriteBytes, .-KeccakP1600_OverwriteBytes
+    .size   KeccakP1600_OverwriteBytes, .-KeccakP1600_OverwriteBytes
     .p2align 4,,15
     .globl  KeccakP1600_OverwriteWithZeroes
-    #.type   KeccakP1600_OverwriteWithZeroes, @function
+    .type   KeccakP1600_OverwriteWithZeroes, @function
 KeccakP1600_OverwriteWithZeroes:
 .LFB31:
     .cfi_startproc
@@ -774,10 +772,10 @@ KeccakP1600_OverwriteWithZeroes:
     ret
     .cfi_endproc
 .LFE31:
-    #.size   KeccakP1600_OverwriteWithZeroes, .-KeccakP1600_OverwriteWithZeroes
+    .size   KeccakP1600_OverwriteWithZeroes, .-KeccakP1600_OverwriteWithZeroes
     .p2align 4,,15
     .globl  KeccakP1600_Permute_24rounds
-    #.type   KeccakP1600_Permute_24rounds, @function
+    .type   KeccakP1600_Permute_24rounds, @function
 KeccakP1600_Permute_24rounds:
 .LFB32:
     .cfi_startproc
@@ -8217,10 +8215,10 @@ KeccakP1600_Permute_24rounds:
     ret
     .cfi_endproc
 .LFE32:
-    #.size   KeccakP1600_Permute_24rounds, .-KeccakP1600_Permute_24rounds
+    .size   KeccakP1600_Permute_24rounds, .-KeccakP1600_Permute_24rounds
     .p2align 4,,15
     .globl  KeccakP1600_Permute_12rounds
-    #.type   KeccakP1600_Permute_12rounds, @function
+    .type   KeccakP1600_Permute_12rounds, @function
 KeccakP1600_Permute_12rounds:
 .LFB33:
     .cfi_startproc
@@ -11945,10 +11943,10 @@ KeccakP1600_Permute_12rounds:
     ret
     .cfi_endproc
 .LFE33:
-    #.size   KeccakP1600_Permute_12rounds, .-KeccakP1600_Permute_12rounds
+    .size   KeccakP1600_Permute_12rounds, .-KeccakP1600_Permute_12rounds
     .p2align 4,,15
     .globl  KeccakP1600_ExtractBytesInLane
-    #.type   KeccakP1600_ExtractBytesInLane, @function
+    .type   KeccakP1600_ExtractBytesInLane, @function
 KeccakP1600_ExtractBytesInLane:
 .LFB34:
     .cfi_startproc
@@ -11991,10 +11989,10 @@ KeccakP1600_ExtractBytesInLane:
     jmp .L162
     .cfi_endproc
 .LFE34:
-    #.size   KeccakP1600_ExtractBytesInLane, .-KeccakP1600_ExtractBytesInLane
+    .size   KeccakP1600_ExtractBytesInLane, .-KeccakP1600_ExtractBytesInLane
     .p2align 4,,15
     .globl  KeccakP1600_ExtractLanes
-    #.type   KeccakP1600_ExtractLanes, @function
+    .type   KeccakP1600_ExtractLanes, @function
 KeccakP1600_ExtractLanes:
 .LFB35:
     .cfi_startproc
@@ -12038,10 +12036,10 @@ KeccakP1600_ExtractLanes:
     ret
     .cfi_endproc
 .LFE35:
-    #.size   KeccakP1600_ExtractLanes, .-KeccakP1600_ExtractLanes
+    .size   KeccakP1600_ExtractLanes, .-KeccakP1600_ExtractLanes
     .p2align 4,,15
     .globl  KeccakP1600_ExtractBytes
-    #.type   KeccakP1600_ExtractBytes, @function
+    .type   KeccakP1600_ExtractBytes, @function
 KeccakP1600_ExtractBytes:
 .LFB36:
     .cfi_startproc
@@ -12204,10 +12202,10 @@ KeccakP1600_ExtractBytes:
     jmp .L180
     .cfi_endproc
 .LFE36:
-    #.size   KeccakP1600_ExtractBytes, .-KeccakP1600_ExtractBytes
+    .size   KeccakP1600_ExtractBytes, .-KeccakP1600_ExtractBytes
     .p2align 4,,15
     .globl  KeccakP1600_ExtractAndAddBytesInLane
-    #.type   KeccakP1600_ExtractAndAddBytesInLane, @function
+    .type   KeccakP1600_ExtractAndAddBytesInLane, @function
 KeccakP1600_ExtractAndAddBytesInLane:
 .LFB37:
     .cfi_startproc
@@ -12263,10 +12261,10 @@ KeccakP1600_ExtractAndAddBytesInLane:
     jmp .L193
     .cfi_endproc
 .LFE37:
-    #.size   KeccakP1600_ExtractAndAddBytesInLane, .-KeccakP1600_ExtractAndAddBytesInLane
+    .size   KeccakP1600_ExtractAndAddBytesInLane, .-KeccakP1600_ExtractAndAddBytesInLane
     .p2align 4,,15
     .globl  KeccakP1600_ExtractAndAddLanes
-    #.type   KeccakP1600_ExtractAndAddLanes, @function
+    .type   KeccakP1600_ExtractAndAddLanes, @function
 KeccakP1600_ExtractAndAddLanes:
 .LFB38:
     .cfi_startproc
@@ -12352,10 +12350,10 @@ KeccakP1600_ExtractAndAddLanes:
     jmp .L213
     .cfi_endproc
 .LFE38:
-    #.size   KeccakP1600_ExtractAndAddLanes, .-KeccakP1600_ExtractAndAddLanes
+    .size   KeccakP1600_ExtractAndAddLanes, .-KeccakP1600_ExtractAndAddLanes
     .p2align 4,,15
     .globl  KeccakP1600_ExtractAndAddBytes
-    #.type   KeccakP1600_ExtractAndAddBytes, @function
+    .type   KeccakP1600_ExtractAndAddBytes, @function
 KeccakP1600_ExtractAndAddBytes:
 .LFB39:
     .cfi_startproc
@@ -12562,10 +12560,10 @@ KeccakP1600_ExtractAndAddBytes:
     jmp .L243
     .cfi_endproc
 .LFE39:
-    #.size   KeccakP1600_ExtractAndAddBytes, .-KeccakP1600_ExtractAndAddBytes
+    .size   KeccakP1600_ExtractAndAddBytes, .-KeccakP1600_ExtractAndAddBytes
     .p2align 4,,15
     .globl  KeccakF1600_FastLoop_Absorb
-    #.type   KeccakF1600_FastLoop_Absorb, @function
+    .type   KeccakF1600_FastLoop_Absorb, @function
 KeccakF1600_FastLoop_Absorb:
 .LFB40:
     .cfi_startproc
@@ -20324,6 +20322,6 @@ KeccakF1600_FastLoop_Absorb:
     jmp .L255
     .cfi_endproc
 .LFE40:
-    #.size   KeccakF1600_FastLoop_Absorb, .-KeccakF1600_FastLoop_Absorb
-#    .ident  "GCC: (SUSE Linux) 4.7.4"
-#    .section    .note.GNU-stack,"",@progbits
+    .size   KeccakF1600_FastLoop_Absorb, .-KeccakF1600_FastLoop_Absorb
+    .ident  "GCC: (SUSE Linux) 4.7.4"
+    .section    .note.GNU-stack,"",@progbits
diff --git a/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s b/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s
index 2eea40444c356b7ec383ac27904ad9e789c1e77a..65aff45cb064fb1773bc06dea717e50cdd70c88d 100755
--- a/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s
+++ b/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-gas.s
@@ -22,7 +22,7 @@
     .text
 
 # conditional assembly settings
-.equ UseSIMD,    1
+.equ UseSIMD,    0
 .equ InlinePerm, 1
 
 # offsets in state
@@ -474,7 +474,7 @@
     .if     InlinePerm == 1
     mKeccakPermutation24
     .else
-    callq   KeccakP1600_Permute_24rounds
+    callq   KeccakP1600_Permute_24rounds@PLT
     .endif
     .endm
 
@@ -749,7 +749,7 @@ KeccakLaneComplementTable:
     .type   KeccakP1600_OverwriteBytes, %function
 KeccakP1600_OverwriteBytes:
     addq    arg3, arg1
-    leaq    KeccakLaneComplementTable, arg5
+    leaq    KeccakLaneComplementTable(%rip), arg5
     addq    arg3, arg5
     subq    $8, arg4
     jc      KeccakP1600_OverwriteBytes_Bytes
@@ -786,7 +786,7 @@ KeccakP1600_OverwriteBytes_Exit:
     .global KeccakP1600_OverwriteWithZeroes
     .type   KeccakP1600_OverwriteWithZeroes, %function
 KeccakP1600_OverwriteWithZeroes:
-    leaq    KeccakLaneComplementTable, arg5
+    leaq    KeccakLaneComplementTable(%rip), arg5
     subq    $8, arg2
     jc      KeccakP1600_OverwriteWithZeroes_Bytes
 KeccakP1600_OverwriteWithZeroes_LanesLoop:
@@ -821,7 +821,7 @@ KeccakP1600_OverwriteWithZeroes_Exit:
     .type   KeccakP1600_ExtractBytes, %function
 KeccakP1600_ExtractBytes:
     addq    arg3, arg1
-    leaq    KeccakLaneComplementTable, arg5
+    leaq    KeccakLaneComplementTable(%rip), arg5
     addq    arg3, arg5
     subq    $8, arg4
     jc      KeccakP1600_ExtractBytes_Bytes
@@ -859,7 +859,7 @@ KeccakP1600_ExtractBytes_Exit:
     .type   KeccakP1600_ExtractAndAddBytes, %function
 KeccakP1600_ExtractAndAddBytes:
     addq    arg4, arg1
-    leaq    KeccakLaneComplementTable, arg6
+    leaq    KeccakLaneComplementTable(%rip), arg6
     addq    arg4, arg6
     subq    $8, arg5
     jc      KeccakP1600_ExtractAndAddBytes_Bytes
@@ -980,7 +980,8 @@ KeccakP1600_Permute_Nrounds:
     movq    rT2a, _su(rpStack)
 KeccakP1600_Permute_Nrounds_Dispatch:
     shlq    $3, rT1
-    jmp     *KeccakP1600_Permute_NroundsTable-8(rT1)
+    leaq    KeccakP1600_Permute_NroundsTable-8(%rip), rT2a
+    jmp     *(rT1, rT2a)
 
 KeccakP1600_Permute_Nrounds24:
     mKeccakRound    rpState, rpStack, 0x0000000000000001, 0
@@ -1174,13 +1175,13 @@ KeccakF1600_FastLoop_Absorb_VariableLaneCountLoop:
     shlq    $3, arg4
     movq    arg3, arg2                  # data pointer
     xorq    arg3, arg3                  # offset = 0
-    callq   KeccakP1600_AddBytes   #  (void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+    callq   KeccakP1600_AddBytes@PLT    #  (void *state, const unsigned char *data, unsigned int offset, unsigned int length)
     movq    arg2, arg3                  # updated data pointer
     movq    24(%rsp), rT1a              # xor trailingBits
     xorq    rT1a, (arg1)
     popq    arg1
     pushq   arg3
-    callq   KeccakP1600_Permute_24rounds
+    callq   KeccakP1600_Permute_24rounds@PLT
     popq    arg3
     popq    arg2
     popq    arg4
diff --git a/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-shld-gas.s b/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-shld-gas.s
index 9e3e573274c139f29f5301393330e64a3f9271e8..cf4a1b1e877e4864747521a93d60803fbe235d3e 100755
--- a/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-shld-gas.s
+++ b/src/XKCP/lib/low/KeccakP-1600/OptimizedAsmX86-64/KeccakP-1600-x86-64-shld-gas.s
@@ -474,7 +474,7 @@
     .if     InlinePerm == 1
     mKeccakPermutation24
     .else
-    callq   KeccakP1600_Permute_24rounds
+    callq   KeccakP1600_Permute_24rounds@PLT
     .endif
     .endm
 
@@ -749,7 +749,7 @@ KeccakLaneComplementTable:
     .type   KeccakP1600_OverwriteBytes, %function
 KeccakP1600_OverwriteBytes:
     addq    arg3, arg1
-    leaq    KeccakLaneComplementTable, arg5
+    leaq    KeccakLaneComplementTable(%rip), arg5
     addq    arg3, arg5
     subq    $8, arg4
     jc      KeccakP1600_OverwriteBytes_Bytes
@@ -786,7 +786,7 @@ KeccakP1600_OverwriteBytes_Exit:
     .global KeccakP1600_OverwriteWithZeroes
     .type   KeccakP1600_OverwriteWithZeroes, %function
 KeccakP1600_OverwriteWithZeroes:
-    leaq    KeccakLaneComplementTable, arg5
+    leaq    KeccakLaneComplementTable(%rip), arg5
     subq    $8, arg2
     jc      KeccakP1600_OverwriteWithZeroes_Bytes
 KeccakP1600_OverwriteWithZeroes_LanesLoop:
@@ -821,7 +821,7 @@ KeccakP1600_OverwriteWithZeroes_Exit:
     .type   KeccakP1600_ExtractBytes, %function
 KeccakP1600_ExtractBytes:
     addq    arg3, arg1
-    leaq    KeccakLaneComplementTable, arg5
+    leaq    KeccakLaneComplementTable(%rip), arg5
     addq    arg3, arg5
     subq    $8, arg4
     jc      KeccakP1600_ExtractBytes_Bytes
@@ -859,7 +859,7 @@ KeccakP1600_ExtractBytes_Exit:
     .type   KeccakP1600_ExtractAndAddBytes, %function
 KeccakP1600_ExtractAndAddBytes:
     addq    arg4, arg1
-    leaq    KeccakLaneComplementTable, arg6
+    leaq    KeccakLaneComplementTable(%rip), arg6
     addq    arg4, arg6
     subq    $8, arg5
     jc      KeccakP1600_ExtractAndAddBytes_Bytes
@@ -980,7 +980,8 @@ KeccakP1600_Permute_Nrounds:
     movq    rT2a, _su(rpStack)
 KeccakP1600_Permute_Nrounds_Dispatch:
     shlq    $3, rT1
-    jmp     *KeccakP1600_Permute_NroundsTable-8(rT1)
+    leaq    KeccakP1600_Permute_NroundsTable-8(%rip), rT2a
+    jmp     *(rT1, rT2a)
 
 KeccakP1600_Permute_Nrounds24:
     mKeccakRound    rpState, rpStack, 0x0000000000000001, 0
@@ -1174,13 +1175,13 @@ KeccakF1600_FastLoop_Absorb_VariableLaneCountLoop:
     shlq    $3, arg4
     movq    arg3, arg2                  # data pointer
     xorq    arg3, arg3                  # offset = 0
-    callq   KeccakP1600_AddBytes   #  (void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+    callq   KeccakP1600_AddBytes@PLT    #  (void *state, const unsigned char *data, unsigned int offset, unsigned int length)
     movq    arg2, arg3                  # updated data pointer
     movq    24(%rsp), rT1a              # xor trailingBits
     xorq    rT1a, (arg1)
     popq    arg1
     pushq   arg3
-    callq   KeccakP1600_Permute_24rounds
+    callq   KeccakP1600_Permute_24rounds@PLT
     popq    arg3
     popq    arg2
     popq    arg4
diff --git a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-armcc.s b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-armcc.s
index 403058ef294de23e976125af6ba24781b07d46b1..b0e6dc83048ba6abed8344483d02a7ea3a584d55 100755
--- a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-armcc.s
+++ b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-armcc.s
@@ -224,7 +224,7 @@ Xoodoo_ExtractAndAddBytes_Exit
 Xoodoo_Permute_6rounds   PROC
     vpush       {q4-q5}
     vldmia      r0, {q0-q2}
-    ldr         r1, = _rc6
+    adr         r1, _rc6
     mRound
     mRound
     mRound
@@ -260,7 +260,7 @@ _rc6
 Xoodoo_Permute_12rounds   PROC
     vpush       {q4-q5}
     vldmia      r0, {q0-q2}
-    ldr         r1, = _rc12
+    adr         r1, _rc12
     mRound
     mRound
     mRound
@@ -371,7 +371,7 @@ Xoofff_CompressFastLoop   PROC
     vld1.32     {q11}, [r1]
 Xoofff_CompressFastLoop_Loop
     vld1.32     {q0,q1}, [r2]!            ; get input
-    ldr         r1, = _rc6
+    adr         r1, _rc6b
     vld1.32     {q2}, [r2]!
     veor.32     q0, q0, q6
     veor.32     q1, q1, q7
@@ -403,7 +403,15 @@ Xoofff_CompressFastLoop_Loop
     sub         r0, r2, r6                  ; return number of bytes processed
     vpop        {q4-q7}
     pop         {r4,r5,r6,pc}
+    LTORG
     align       8
+_rc6b
+    dcq         0x00000060
+    dcq         0x0000002C
+    dcq         0x00000380
+    dcq         0x000000F0
+    dcq         0x000001A0
+    dcq         0x00000012
     ENDP
 
 ; ----------------------------------------------------------------------------
@@ -425,7 +433,7 @@ Xoofff_ExpandFastLoop_Loop
     vmov        q0, q9
     vmov        q1, q10
     vmov        q2, q11
-    ldr         r1, = _rc6
+    adr         r1, _rc6b
     mRound                                  ; permutation
     mRound
     mRound
diff --git a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-gcc.s b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-gcc.s
index bd6e75667e60f22dab156e082418644f5b265002..9b523ca0030607ac4e6c4a91498e22fc462cfc76 100755
--- a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-gcc.s
+++ b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodoo-uf-armv7a-neon-le-gcc.s
@@ -230,7 +230,7 @@ Xoodoo_ExtractAndAddBytes_Exit:
 Xoodoo_Permute_6rounds:
     vpush       {q4-q5}
     vldmia      r0, {q0-q2}
-    ldr         r1, = _rc6
+    adr         r1, _rc6
     mRound
     mRound
     mRound
@@ -267,7 +267,7 @@ _rc6:
 Xoodoo_Permute_12rounds:
     vpush       {q4-q5}
     vldmia      r0, {q0-q2}
-    ldr         r1, = _rc12
+    adr         r1, _rc12
     mRound
     mRound
     mRound
@@ -380,7 +380,7 @@ Xoofff_CompressFastLoop:
     vld1.32     {q11}, [r1]
 Xoofff_CompressFastLoop_Loop:
     vld1.32     {q0,q1}, [r2]!            @ get input
-    ldr         r1, = _rc6
+    adr         r1, _rc6b
     vld1.32     {q2}, [r2]!
     veor.32     q0, q0, q6
     veor.32     q1, q1, q7
@@ -412,7 +412,15 @@ Xoofff_CompressFastLoop_Loop:
     sub         r0, r2, r6                  @ return number of bytes processed
     vpop        {q4-q7}
     pop         {r4,r5,r6,pc}
+    .ltorg
     .align  8
+_rc6b:
+    .quad          0x00000060
+    .quad          0x0000002C
+    .quad          0x00000380
+    .quad          0x000000F0
+    .quad          0x000001A0
+    .quad          0x00000012
 
 
 @ ----------------------------------------------------------------------------
@@ -435,7 +443,7 @@ Xoofff_ExpandFastLoop_Loop:
     vmov        q0, q9
     vmov        q1, q10
     vmov        q2, q11
-    ldr         r1, = _rc6
+    adr         r1, _rc6b
     mRound                                  @ permutation
     mRound
     mRound
diff --git a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-armcc.s b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-armcc.s
index df5c5d31aaaa765ac8e477d9bf5ffa7c7dcc5b24..b4937f0af08e9c6264a3560d564de008dd1a8ac7 100755
--- a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-armcc.s
+++ b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-armcc.s
@@ -64,7 +64,7 @@ Xoodyak_Rhash   equ     16
 ;
     align   8
 Xoodoo_Permute_12roundsAsm   PROC
-    ldr         r1, =_rc12
+    adr         r1, _rc12
     mRound
     mRound
     mRound
diff --git a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-gcc.s b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-gcc.s
index 2b9b018f057b36ed1a85b07dd71012bad6e6182c..d701e08b1c9b029524a9faa71a303c0b65c95090 100755
--- a/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-gcc.s
+++ b/src/XKCP/lib/low/Xoodoo/OptimizedAsmARMv7A/Xoodyak-uf-armv7a-neon-le-gcc.s
@@ -64,7 +64,7 @@
     .align  8
 .type	Xoodoo_Permute_12roundsAsm, %function;
 Xoodoo_Permute_12roundsAsm:
-    ldr         r1, =_rc12
+    adr         r1, _rc12
     mRound
     mRound
     mRound
diff --git a/src/XKCP/tests/UnitTests/timing.h b/src/XKCP/tests/UnitTests/timing.h
index 2fdd0c141389272e7a51de54120517f2463efcb2..eed7291989f370aaea701be158ddd54f4d9a883b 100755
--- a/src/XKCP/tests/UnitTests/timing.h
+++ b/src/XKCP/tests/UnitTests/timing.h
@@ -80,7 +80,7 @@ static uint_32t HiResTime(void)           /* return the current value of time st
         uint32_t cycle_count;
         asm volatile("MRS %0, pmevcntr0_el0" : "=r" (cycle_count));
         return cycle_count;
-    #elif  defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A_)
+    #elif  defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__)
     /* Implement Aarch32 bits on PMU : Tested on Cortex a-53 */
         uint32_t cycle_count;
         asm volatile("MRC p15, 0, %0, c9, c13, 0 \t\n" : "=r"(cycle_count));
@@ -115,10 +115,10 @@ static uint_32t calibrate()
             uint32_t r = 0;
             asm volatile("mrs %0, pmcntenset_el0" : "=r" (r));
             asm volatile("msr pmcntenset_el0, %0" : : "r" (r|1));
-        #elif  defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A_)
+        #elif  defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__)
         /* Implement Aarch32 bits on PMU : Tested on Cortex a-53 */
             /* Enable counters in Control Register and reset cycle count and event count */
-            printf("PMU32 Enable.. \n");
+            printf("PMU32 Enabled... \n");
             asm volatile("MCR   p15, 0, %0, c9, c12, 0" : : "r"(0x00000007));
             /* Event counter selection register, which counter to access */
             asm volatile("MCR   p15, 0, %0, c9, c12, 5" : : "r"(0x0));