diff -ruw 240402_AIMer/Additional_Implementation/avx2/aimer128f/field.c 240402_AIMer.patched/Additional_Implementation/avx2/aimer128f/field.c
--- 240402_AIMer/Additional_Implementation/avx2/aimer128f/field.c	2024-05-02 05:21:30.822316398 -0500
+++ 240402_AIMer.patched/Additional_Implementation/avx2/aimer128f/field.c	2024-05-02 05:21:46.370685161 -0500
@@ -84,12 +84,12 @@
 {
   __m128i x[2], y, z[2], zhi[2], t[4];
   __m128i irr = _mm_set_epi64x(0x0, 0x87);
-  y = _mm_load_si128((const __m128i*)b);
+  y = _mm_loadu_si128((const __m128i*)b);
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 2)
   {
-    x[0] = _mm_load_si128((const __m128i*)a[party]);
-    x[1] = _mm_load_si128((const __m128i*)a[party + 1]);
+    x[0] = _mm_loadu_si128((const __m128i*)a[party]);
+    x[1] = _mm_loadu_si128((const __m128i*)a[party + 1]);
 
     // polynomial multiplication x2
     t[0] = _mm_clmulepi64_si128(x[0], y, 0x01);
@@ -126,8 +126,8 @@
     z[0] = _mm_xor_si128(z[0], t[0]);
     z[1] = _mm_xor_si128(z[1], t[1]);
 
-    _mm_store_si128((__m128i*)c[party], z[0]);
-    _mm_store_si128((__m128i*)c[party + 1], z[1]);
+    _mm_storeu_si128((__m128i*)c[party], z[0]);
+    _mm_storeu_si128((__m128i*)c[party + 1], z[1]);
   }
 }
 
@@ -171,12 +171,12 @@
 {
   __m128i x[2], y, z[2], zhi[2], t[4];
   __m128i irr = _mm_set_epi64x(0x0, 0x87);
-  y = _mm_load_si128((const __m128i*)b);
+  y = _mm_loadu_si128((const __m128i*)b);
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 2)
   {
-    x[0] = _mm_load_si128((const __m128i*)a[party]);
-    x[1] = _mm_load_si128((const __m128i*)a[party + 1]);
+    x[0] = _mm_loadu_si128((const __m128i*)a[party]);
+    x[1] = _mm_loadu_si128((const __m128i*)a[party + 1]);
 
     // polynomial multiplication x2
     t[0] = _mm_clmulepi64_si128(x[0], y, 0x01);
@@ -198,8 +198,8 @@
     zhi[1] = _mm_xor_si128(zhi[1], _mm_srli_si128(t[1], 8));
 
     // load c
-    x[0] = _mm_load_si128((const __m128i*)c[party]);
-    x[1] = _mm_load_si128((const __m128i*)c[party + 1]);
+    x[0] = _mm_loadu_si128((const __m128i*)c[party]);
+    x[1] = _mm_loadu_si128((const __m128i*)c[party + 1]);
 
     // modular reduction x2
     t[0] = _mm_clmulepi64_si128(zhi[0], irr, 0x01);
@@ -219,8 +219,8 @@
     z[0] = _mm_xor_si128(z[0], t[0]);
     z[1] = _mm_xor_si128(z[1], t[1]);
 
-    _mm_store_si128((__m128i*)c[party], z[0]);
-    _mm_store_si128((__m128i*)c[party + 1], z[1]);
+    _mm_storeu_si128((__m128i*)c[party], z[0]);
+    _mm_storeu_si128((__m128i*)c[party + 1], z[1]);
   }
 }
 
@@ -246,7 +246,7 @@
   temp_c[0] = _mm_xor_si128(temp_c[0], _mm_clmulepi64_si128(temp[2], irr, 0x00));
   temp_c[0] = _mm_xor_si128(temp_c[0], temp[1]);
 
-  _mm_store_si128((__m128i*)c, temp_c[0]);
+  _mm_storeu_si128((__m128i*)c, temp_c[0]);
 }
 
 void GF_sqr_N(const GF a[AIMER_NUM_MPC_PARTIES],
@@ -257,10 +257,10 @@
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 4)
   {
-    t[0] = _mm_load_si128((const __m128i*)a[party]);
-    t[1] = _mm_load_si128((const __m128i*)a[party + 1]);
-    t[2] = _mm_load_si128((const __m128i*)a[party + 2]);
-    t[3] = _mm_load_si128((const __m128i*)a[party + 3]);
+    t[0] = _mm_loadu_si128((const __m128i*)a[party]);
+    t[1] = _mm_loadu_si128((const __m128i*)a[party + 1]);
+    t[2] = _mm_loadu_si128((const __m128i*)a[party + 2]);
+    t[3] = _mm_loadu_si128((const __m128i*)a[party + 3]);
 
     // polynomial squaring x4
     z[0] = _mm_clmulepi64_si128(t[0], t[0], 0x00);
@@ -299,10 +299,10 @@
     z[2] = _mm_xor_si128(z[2], t[2]);
     z[3] = _mm_xor_si128(z[3], t[3]);
 
-    _mm_store_si128((__m128i*)c[party], z[0]);
-    _mm_store_si128((__m128i*)c[party + 1], z[1]);
-    _mm_store_si128((__m128i*)c[party + 2], z[2]);
-    _mm_store_si128((__m128i*)c[party + 3], z[3]);
+    _mm_storeu_si128((__m128i*)c[party], z[0]);
+    _mm_storeu_si128((__m128i*)c[party + 1], z[1]);
+    _mm_storeu_si128((__m128i*)c[party + 2], z[2]);
+    _mm_storeu_si128((__m128i*)c[party + 3], z[3]);
   }
 }
 
@@ -360,10 +360,10 @@
     const __m256i shift = _mm256_set_epi64x(0, 0, 1, 1);
     __m256i a0, a1, a2, a3, c0, c1, c2, c3;
 
-    m0 = _mm256_load_si256((const __m256i*)a[party]);
-    m1 = _mm256_load_si256((const __m256i*)a[party + 2]);
-    m2 = _mm256_load_si256((const __m256i*)c[party]);
-    m3 = _mm256_load_si256((const __m256i*)c[party + 2]);
+    m0 = _mm256_loadu_si256((const __m256i*)a[party]);
+    m1 = _mm256_loadu_si256((const __m256i*)a[party + 2]);
+    m2 = _mm256_loadu_si256((const __m256i*)c[party]);
+    m3 = _mm256_loadu_si256((const __m256i*)c[party + 2]);
 
     // ai = a[party + i].hi x4
     a0 = _mm256_permute4x64_epi64(m0, 0x55);
@@ -384,7 +384,7 @@
 
     for (size_t row = 128; row > 64; row -= 2)
     {
-      t = _mm256_load_si256((const __m256i*)b[row - 2]);
+      t = _mm256_loadu_si256((const __m256i*)b[row - 2]);
 
       // mi[w] = (ai[w].msb) ^ 64
       m0 = _mm256_cmpgt_epi64(zero, a0);
@@ -408,8 +408,8 @@
       c3 = _mm256_xor_si256(c3, m3);
     }
 
-    m0 = _mm256_load_si256((const __m256i*)a[party]);
-    m1 = _mm256_load_si256((const __m256i*)a[party + 2]);
+    m0 = _mm256_loadu_si256((const __m256i*)a[party]);
+    m1 = _mm256_loadu_si256((const __m256i*)a[party + 2]);
 
     // ai = a[party + i].lo x4
     a0 = _mm256_permute4x64_epi64(m0, 0x00);
@@ -424,7 +424,7 @@
 
     for (size_t row = 64; row > 0; row -= 2)
     {
-      t = _mm256_load_si256((const __m256i*)b[row - 2]);
+      t = _mm256_loadu_si256((const __m256i*)b[row - 2]);
 
       // mi[w] = (ai[w].msb) ^ 64
       m0 = _mm256_cmpgt_epi64(zero, a0);
@@ -456,8 +456,8 @@
     c0 = _mm256_xor_si256(a0, a1);
     c2 = _mm256_xor_si256(a2, a3);
 
-    _mm256_store_si256((__m256i*)c[party], c0);
-    _mm256_store_si256((__m256i*)c[party + 2], c2);
+    _mm256_storeu_si256((__m256i*)c[party], c0);
+    _mm256_storeu_si256((__m256i*)c[party + 2], c2);
   }
 }
 
@@ -466,12 +466,12 @@
                     GF hi[AIMER_NUM_MPC_PARTIES])
 {
   __m128i x[2], y, z[2], zhi[2], t[4];
-  y = _mm_load_si128((const __m128i*)b);
+  y = _mm_loadu_si128((const __m128i*)b);
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 2)
   {
-    x[0] = _mm_load_si128((const __m128i*)a[party]);
-    x[1] = _mm_load_si128((const __m128i*)a[party + 1]);
+    x[0] = _mm_loadu_si128((const __m128i*)a[party]);
+    x[1] = _mm_loadu_si128((const __m128i*)a[party + 1]);
 
     // polynomial multiplication x2
     t[0] = _mm_clmulepi64_si128(x[0], y, 0x01);
@@ -497,10 +497,10 @@
     zhi[0] = _mm_xor_si128(zhi[0], _mm_srli_si128(t[0], 8));
     zhi[1] = _mm_xor_si128(zhi[1], _mm_srli_si128(t[1], 8));
 
-    _mm_store_si128((__m128i*)lo[party], z[0]);
-    _mm_store_si128((__m128i*)lo[party + 1], z[1]);
-    _mm_store_si128((__m128i*)hi[party], zhi[0]);
-    _mm_store_si128((__m128i*)hi[party + 1], zhi[1]);
+    _mm_storeu_si128((__m128i*)lo[party], z[0]);
+    _mm_storeu_si128((__m128i*)lo[party + 1], z[1]);
+    _mm_storeu_si128((__m128i*)hi[party], zhi[0]);
+    _mm_storeu_si128((__m128i*)hi[party + 1], zhi[1]);
   }
 }
 
@@ -512,15 +512,15 @@
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 4)
   {
-    z[0] = _mm_load_si128((const __m128i*)lo[party]);
-    z[1] = _mm_load_si128((const __m128i*)lo[party + 1]);
-    z[2] = _mm_load_si128((const __m128i*)lo[party + 2]);
-    z[3] = _mm_load_si128((const __m128i*)lo[party + 3]);
-
-    zhi[0] = _mm_load_si128((const __m128i*)hi[party]);
-    zhi[1] = _mm_load_si128((const __m128i*)hi[party + 1]);
-    zhi[2] = _mm_load_si128((const __m128i*)hi[party + 2]);
-    zhi[3] = _mm_load_si128((const __m128i*)hi[party + 3]);
+    z[0] = _mm_loadu_si128((const __m128i*)lo[party]);
+    z[1] = _mm_loadu_si128((const __m128i*)lo[party + 1]);
+    z[2] = _mm_loadu_si128((const __m128i*)lo[party + 2]);
+    z[3] = _mm_loadu_si128((const __m128i*)lo[party + 3]);
+
+    zhi[0] = _mm_loadu_si128((const __m128i*)hi[party]);
+    zhi[1] = _mm_loadu_si128((const __m128i*)hi[party + 1]);
+    zhi[2] = _mm_loadu_si128((const __m128i*)hi[party + 2]);
+    zhi[3] = _mm_loadu_si128((const __m128i*)hi[party + 3]);
 
     // modular reduction x4
     t[0] = _mm_clmulepi64_si128(zhi[0], irr, 0x01);
@@ -548,9 +548,9 @@
     z[2] = _mm_xor_si128(z[2], t[2]);
     z[3] = _mm_xor_si128(z[3], t[3]);
 
-    _mm_store_si128((__m128i*)lo[party], z[0]);
-    _mm_store_si128((__m128i*)lo[party + 1], z[1]);
-    _mm_store_si128((__m128i*)lo[party + 2], z[2]);
-    _mm_store_si128((__m128i*)lo[party + 3], z[3]);
+    _mm_storeu_si128((__m128i*)lo[party], z[0]);
+    _mm_storeu_si128((__m128i*)lo[party + 1], z[1]);
+    _mm_storeu_si128((__m128i*)lo[party + 2], z[2]);
+    _mm_storeu_si128((__m128i*)lo[party + 3], z[3]);
   }
 }
Only in 240402_AIMer.patched/Additional_Implementation/avx2/aimer128f: field.c.orig
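
Note on the field.c hunks above: every aligned 128-/256-bit load and store on GF operands is replaced by its unaligned counterpart. _mm_load_si128/_mm_store_si128 (and the _mm256 forms) require the pointer to be 16- resp. 32-byte aligned and fault otherwise, while the loadu/storeu forms accept any address and cost essentially the same on aligned data on current x86 cores. Below is a minimal sketch of the distinction, assuming a 128-bit GF element laid out as uint64_t[2] as in the aimer128 parameter sets; the typedef and helper are illustrative only, not part of the patch.

#include <stdint.h>
#include <emmintrin.h>   /* SSE2: _mm_loadu_si128 / _mm_storeu_si128 */

typedef uint64_t GF[2];  /* assumption: 128-bit field element, as in aimer128f/s */

/* Copy one field element through an XMM register.
 * The unaligned intrinsics tolerate any address; the aligned ones
 * (_mm_load_si128 / _mm_store_si128) would fault when the GF arrays do
 * not sit on 16-byte boundaries, which uint64_t[2] does not guarantee. */
static inline void gf_copy(GF dst, const GF src)
{
  __m128i v = _mm_loadu_si128((const __m128i *)src);
  _mm_storeu_si128((__m128i *)dst, v);
}
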
diff -ruw 240402_AIMer/Additional_Implementation/avx2/aimer128f/shake/KeccakP-1600-times4-SIMD256.c 240402_AIMer.patched/Additional_Implementation/avx2/aimer128f/shake/KeccakP-1600-times4-SIMD256.c
--- 240402_AIMer/Additional_Implementation/avx2/aimer128f/shake/KeccakP-1600-times4-SIMD256.c	2024-04-03 13:11:16.000000000 -0500
+++ 240402_AIMer.patched/Additional_Implementation/avx2/aimer128f/shake/KeccakP-1600-times4-SIMD256.c	2024-05-02 05:21:46.370685161 -0500
@@ -45,9 +45,9 @@
 
 #if defined(KeccakP1600times4_useAVX2)
     #define ANDnu256(a, b)          _mm256_andnot_si256(a, b)
-    #define CONST256(a)             _mm256_load_si256((const V256 *)&(a))
+    #define CONST256(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define CONST256_64(a)          _mm256_set1_epi64x(a)
-    #define LOAD256(a)              _mm256_load_si256((const V256 *)&(a))
+    #define LOAD256(a)              _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD256u(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD4_64(a, b, c, d)    _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d))
     #define ROL64in256(d, a, o)     d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
@@ -55,7 +55,7 @@
     #define ROL64in256_56(d, a)     d = _mm256_shuffle_epi8(a, CONST256(rho56))
 static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
 static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
-    #define STORE256(a, b)          _mm256_store_si256((V256 *)&(a), b)
+    #define STORE256(a, b)          _mm256_storeu_si256((V256 *)&(a), b)
     #define STORE256u(a, b)         _mm256_storeu_si256((V256 *)&(a), b)
     #define STORE2_128(ah, al, v)   _mm256_storeu2_m128i(&(ah), &(al), v)
     #define XOR256(a, b)            _mm256_xor_si256(a, b)
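
The Keccak-p[1600]×4 backend gets the same treatment through its macro layer: CONST256, LOAD256 and STORE256 now expand to the unaligned intrinsics, matching the already-unaligned LOAD256u/STORE256u. A hedged sketch of how the patched macros behave on one interleaved state lane, assuming V256 is a typedef for __m256i as in KeccakP-1600-times4-SIMD256.c; the helper function is illustrative only.

#include <stdint.h>
#include <immintrin.h>

typedef __m256i V256;   /* assumption: matches the V256 used by the Keccak code */

#define LOAD256(a)      _mm256_loadu_si256((const V256 *)&(a))
#define STORE256(a, b)  _mm256_storeu_si256((V256 *)&(a), b)
#define XOR256(a, b)    _mm256_xor_si256(a, b)

/* XOR four 64-bit input words into one 4-way state lane.
 * With the unaligned expansions this also works when the caller's
 * state buffer is not 32-byte aligned. */
static inline void lane_xor(uint64_t lane[4], const uint64_t in[4])
{
  STORE256(lane[0], XOR256(LOAD256(lane[0]), LOAD256(in[0])));
}
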
diff -ruw 240402_AIMer/Additional_Implementation/avx2/aimer128s/field.c 240402_AIMer.patched/Additional_Implementation/avx2/aimer128s/field.c
--- 240402_AIMer/Additional_Implementation/avx2/aimer128s/field.c	2024-05-02 05:21:30.822316398 -0500
+++ 240402_AIMer.patched/Additional_Implementation/avx2/aimer128s/field.c	2024-05-02 05:21:46.370685161 -0500
@@ -84,12 +84,12 @@
 {
   __m128i x[2], y, z[2], zhi[2], t[4];
   __m128i irr = _mm_set_epi64x(0x0, 0x87);
-  y = _mm_load_si128((const __m128i*)b);
+  y = _mm_loadu_si128((const __m128i*)b);
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 2)
   {
-    x[0] = _mm_load_si128((const __m128i*)a[party]);
-    x[1] = _mm_load_si128((const __m128i*)a[party + 1]);
+    x[0] = _mm_loadu_si128((const __m128i*)a[party]);
+    x[1] = _mm_loadu_si128((const __m128i*)a[party + 1]);
 
     // polynomial multiplication x2
     t[0] = _mm_clmulepi64_si128(x[0], y, 0x01);
@@ -126,8 +126,8 @@
     z[0] = _mm_xor_si128(z[0], t[0]);
     z[1] = _mm_xor_si128(z[1], t[1]);
 
-    _mm_store_si128((__m128i*)c[party], z[0]);
-    _mm_store_si128((__m128i*)c[party + 1], z[1]);
+    _mm_storeu_si128((__m128i*)c[party], z[0]);
+    _mm_storeu_si128((__m128i*)c[party + 1], z[1]);
   }
 }
 
@@ -171,12 +171,12 @@
 {
   __m128i x[2], y, z[2], zhi[2], t[4];
   __m128i irr = _mm_set_epi64x(0x0, 0x87);
-  y = _mm_load_si128((const __m128i*)b);
+  y = _mm_loadu_si128((const __m128i*)b);
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 2)
   {
-    x[0] = _mm_load_si128((const __m128i*)a[party]);
-    x[1] = _mm_load_si128((const __m128i*)a[party + 1]);
+    x[0] = _mm_loadu_si128((const __m128i*)a[party]);
+    x[1] = _mm_loadu_si128((const __m128i*)a[party + 1]);
 
     // polynomial multiplication x2
     t[0] = _mm_clmulepi64_si128(x[0], y, 0x01);
@@ -198,8 +198,8 @@
     zhi[1] = _mm_xor_si128(zhi[1], _mm_srli_si128(t[1], 8));
 
     // load c
-    x[0] = _mm_load_si128((const __m128i*)c[party]);
-    x[1] = _mm_load_si128((const __m128i*)c[party + 1]);
+    x[0] = _mm_loadu_si128((const __m128i*)c[party]);
+    x[1] = _mm_loadu_si128((const __m128i*)c[party + 1]);
 
     // modular reduction x2
     t[0] = _mm_clmulepi64_si128(zhi[0], irr, 0x01);
@@ -219,8 +219,8 @@
     z[0] = _mm_xor_si128(z[0], t[0]);
     z[1] = _mm_xor_si128(z[1], t[1]);
 
-    _mm_store_si128((__m128i*)c[party], z[0]);
-    _mm_store_si128((__m128i*)c[party + 1], z[1]);
+    _mm_storeu_si128((__m128i*)c[party], z[0]);
+    _mm_storeu_si128((__m128i*)c[party + 1], z[1]);
   }
 }
 
@@ -246,7 +246,7 @@
   temp_c[0] = _mm_xor_si128(temp_c[0], _mm_clmulepi64_si128(temp[2], irr, 0x00));
   temp_c[0] = _mm_xor_si128(temp_c[0], temp[1]);
 
-  _mm_store_si128((__m128i*)c, temp_c[0]);
+  _mm_storeu_si128((__m128i*)c, temp_c[0]);
 }
 
 void GF_sqr_N(const GF a[AIMER_NUM_MPC_PARTIES],
@@ -257,10 +257,10 @@
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 4)
   {
-    t[0] = _mm_load_si128((const __m128i*)a[party]);
-    t[1] = _mm_load_si128((const __m128i*)a[party + 1]);
-    t[2] = _mm_load_si128((const __m128i*)a[party + 2]);
-    t[3] = _mm_load_si128((const __m128i*)a[party + 3]);
+    t[0] = _mm_loadu_si128((const __m128i*)a[party]);
+    t[1] = _mm_loadu_si128((const __m128i*)a[party + 1]);
+    t[2] = _mm_loadu_si128((const __m128i*)a[party + 2]);
+    t[3] = _mm_loadu_si128((const __m128i*)a[party + 3]);
 
     // polynomial squaring x4
     z[0] = _mm_clmulepi64_si128(t[0], t[0], 0x00);
@@ -299,10 +299,10 @@
     z[2] = _mm_xor_si128(z[2], t[2]);
     z[3] = _mm_xor_si128(z[3], t[3]);
 
-    _mm_store_si128((__m128i*)c[party], z[0]);
-    _mm_store_si128((__m128i*)c[party + 1], z[1]);
-    _mm_store_si128((__m128i*)c[party + 2], z[2]);
-    _mm_store_si128((__m128i*)c[party + 3], z[3]);
+    _mm_storeu_si128((__m128i*)c[party], z[0]);
+    _mm_storeu_si128((__m128i*)c[party + 1], z[1]);
+    _mm_storeu_si128((__m128i*)c[party + 2], z[2]);
+    _mm_storeu_si128((__m128i*)c[party + 3], z[3]);
   }
 }
 
@@ -360,10 +360,10 @@
     const __m256i shift = _mm256_set_epi64x(0, 0, 1, 1);
     __m256i a0, a1, a2, a3, c0, c1, c2, c3;
 
-    m0 = _mm256_load_si256((const __m256i*)a[party]);
-    m1 = _mm256_load_si256((const __m256i*)a[party + 2]);
-    m2 = _mm256_load_si256((const __m256i*)c[party]);
-    m3 = _mm256_load_si256((const __m256i*)c[party + 2]);
+    m0 = _mm256_loadu_si256((const __m256i*)a[party]);
+    m1 = _mm256_loadu_si256((const __m256i*)a[party + 2]);
+    m2 = _mm256_loadu_si256((const __m256i*)c[party]);
+    m3 = _mm256_loadu_si256((const __m256i*)c[party + 2]);
 
     // ai = a[party + i].hi x4
     a0 = _mm256_permute4x64_epi64(m0, 0x55);
@@ -384,7 +384,7 @@
 
     for (size_t row = 128; row > 64; row -= 2)
     {
-      t = _mm256_load_si256((const __m256i*)b[row - 2]);
+      t = _mm256_loadu_si256((const __m256i*)b[row - 2]);
 
       // mi[w] = (ai[w].msb) ^ 64
       m0 = _mm256_cmpgt_epi64(zero, a0);
@@ -408,8 +408,8 @@
       c3 = _mm256_xor_si256(c3, m3);
     }
 
-    m0 = _mm256_load_si256((const __m256i*)a[party]);
-    m1 = _mm256_load_si256((const __m256i*)a[party + 2]);
+    m0 = _mm256_loadu_si256((const __m256i*)a[party]);
+    m1 = _mm256_loadu_si256((const __m256i*)a[party + 2]);
 
     // ai = a[party + i].lo x4
     a0 = _mm256_permute4x64_epi64(m0, 0x00);
@@ -424,7 +424,7 @@
 
     for (size_t row = 64; row > 0; row -= 2)
     {
-      t = _mm256_load_si256((const __m256i*)b[row - 2]);
+      t = _mm256_loadu_si256((const __m256i*)b[row - 2]);
 
       // mi[w] = (ai[w].msb) ^ 64
       m0 = _mm256_cmpgt_epi64(zero, a0);
@@ -456,8 +456,8 @@
     c0 = _mm256_xor_si256(a0, a1);
     c2 = _mm256_xor_si256(a2, a3);
 
-    _mm256_store_si256((__m256i*)c[party], c0);
-    _mm256_store_si256((__m256i*)c[party + 2], c2);
+    _mm256_storeu_si256((__m256i*)c[party], c0);
+    _mm256_storeu_si256((__m256i*)c[party + 2], c2);
   }
 }
 
@@ -466,12 +466,12 @@
                     GF hi[AIMER_NUM_MPC_PARTIES])
 {
   __m128i x[2], y, z[2], zhi[2], t[4];
-  y = _mm_load_si128((const __m128i*)b);
+  y = _mm_loadu_si128((const __m128i*)b);
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 2)
   {
-    x[0] = _mm_load_si128((const __m128i*)a[party]);
-    x[1] = _mm_load_si128((const __m128i*)a[party + 1]);
+    x[0] = _mm_loadu_si128((const __m128i*)a[party]);
+    x[1] = _mm_loadu_si128((const __m128i*)a[party + 1]);
 
     // polynomial multiplication x2
     t[0] = _mm_clmulepi64_si128(x[0], y, 0x01);
@@ -497,10 +497,10 @@
     zhi[0] = _mm_xor_si128(zhi[0], _mm_srli_si128(t[0], 8));
     zhi[1] = _mm_xor_si128(zhi[1], _mm_srli_si128(t[1], 8));
 
-    _mm_store_si128((__m128i*)lo[party], z[0]);
-    _mm_store_si128((__m128i*)lo[party + 1], z[1]);
-    _mm_store_si128((__m128i*)hi[party], zhi[0]);
-    _mm_store_si128((__m128i*)hi[party + 1], zhi[1]);
+    _mm_storeu_si128((__m128i*)lo[party], z[0]);
+    _mm_storeu_si128((__m128i*)lo[party + 1], z[1]);
+    _mm_storeu_si128((__m128i*)hi[party], zhi[0]);
+    _mm_storeu_si128((__m128i*)hi[party + 1], zhi[1]);
   }
 }
 
@@ -512,15 +512,15 @@
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 4)
   {
-    z[0] = _mm_load_si128((const __m128i*)lo[party]);
-    z[1] = _mm_load_si128((const __m128i*)lo[party + 1]);
-    z[2] = _mm_load_si128((const __m128i*)lo[party + 2]);
-    z[3] = _mm_load_si128((const __m128i*)lo[party + 3]);
-
-    zhi[0] = _mm_load_si128((const __m128i*)hi[party]);
-    zhi[1] = _mm_load_si128((const __m128i*)hi[party + 1]);
-    zhi[2] = _mm_load_si128((const __m128i*)hi[party + 2]);
-    zhi[3] = _mm_load_si128((const __m128i*)hi[party + 3]);
+    z[0] = _mm_loadu_si128((const __m128i*)lo[party]);
+    z[1] = _mm_loadu_si128((const __m128i*)lo[party + 1]);
+    z[2] = _mm_loadu_si128((const __m128i*)lo[party + 2]);
+    z[3] = _mm_loadu_si128((const __m128i*)lo[party + 3]);
+
+    zhi[0] = _mm_loadu_si128((const __m128i*)hi[party]);
+    zhi[1] = _mm_loadu_si128((const __m128i*)hi[party + 1]);
+    zhi[2] = _mm_loadu_si128((const __m128i*)hi[party + 2]);
+    zhi[3] = _mm_loadu_si128((const __m128i*)hi[party + 3]);
 
     // modular reduction x4
     t[0] = _mm_clmulepi64_si128(zhi[0], irr, 0x01);
@@ -548,9 +548,9 @@
     z[2] = _mm_xor_si128(z[2], t[2]);
     z[3] = _mm_xor_si128(z[3], t[3]);
 
-    _mm_store_si128((__m128i*)lo[party], z[0]);
-    _mm_store_si128((__m128i*)lo[party + 1], z[1]);
-    _mm_store_si128((__m128i*)lo[party + 2], z[2]);
-    _mm_store_si128((__m128i*)lo[party + 3], z[3]);
+    _mm_storeu_si128((__m128i*)lo[party], z[0]);
+    _mm_storeu_si128((__m128i*)lo[party + 1], z[1]);
+    _mm_storeu_si128((__m128i*)lo[party + 2], z[2]);
+    _mm_storeu_si128((__m128i*)lo[party + 3], z[3]);
   }
 }
Only in 240402_AIMer.patched/Additional_Implementation/avx2/aimer128s: field.c.orig
diff -ruw 240402_AIMer/Additional_Implementation/avx2/aimer128s/shake/KeccakP-1600-times4-SIMD256.c 240402_AIMer.patched/Additional_Implementation/avx2/aimer128s/shake/KeccakP-1600-times4-SIMD256.c
--- 240402_AIMer/Additional_Implementation/avx2/aimer128s/shake/KeccakP-1600-times4-SIMD256.c	2024-04-03 13:11:18.000000000 -0500
+++ 240402_AIMer.patched/Additional_Implementation/avx2/aimer128s/shake/KeccakP-1600-times4-SIMD256.c	2024-05-02 05:21:46.370685161 -0500
@@ -45,9 +45,9 @@
 
 #if defined(KeccakP1600times4_useAVX2)
     #define ANDnu256(a, b)          _mm256_andnot_si256(a, b)
-    #define CONST256(a)             _mm256_load_si256((const V256 *)&(a))
+    #define CONST256(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define CONST256_64(a)          _mm256_set1_epi64x(a)
-    #define LOAD256(a)              _mm256_load_si256((const V256 *)&(a))
+    #define LOAD256(a)              _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD256u(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD4_64(a, b, c, d)    _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d))
     #define ROL64in256(d, a, o)     d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
@@ -55,7 +55,7 @@
     #define ROL64in256_56(d, a)     d = _mm256_shuffle_epi8(a, CONST256(rho56))
 static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
 static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
-    #define STORE256(a, b)          _mm256_store_si256((V256 *)&(a), b)
+    #define STORE256(a, b)          _mm256_storeu_si256((V256 *)&(a), b)
     #define STORE256u(a, b)         _mm256_storeu_si256((V256 *)&(a), b)
     #define STORE2_128(ah, al, v)   _mm256_storeu2_m128i(&(ah), &(al), v)
     #define XOR256(a, b)            _mm256_xor_si256(a, b)
diff -ruw 240402_AIMer/Additional_Implementation/avx2/aimer192f/shake/KeccakP-1600-times4-SIMD256.c 240402_AIMer.patched/Additional_Implementation/avx2/aimer192f/shake/KeccakP-1600-times4-SIMD256.c
--- 240402_AIMer/Additional_Implementation/avx2/aimer192f/shake/KeccakP-1600-times4-SIMD256.c	2024-04-03 13:11:18.000000000 -0500
+++ 240402_AIMer.patched/Additional_Implementation/avx2/aimer192f/shake/KeccakP-1600-times4-SIMD256.c	2024-05-02 05:21:46.370685161 -0500
@@ -45,9 +45,9 @@
 
 #if defined(KeccakP1600times4_useAVX2)
     #define ANDnu256(a, b)          _mm256_andnot_si256(a, b)
-    #define CONST256(a)             _mm256_load_si256((const V256 *)&(a))
+    #define CONST256(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define CONST256_64(a)          _mm256_set1_epi64x(a)
-    #define LOAD256(a)              _mm256_load_si256((const V256 *)&(a))
+    #define LOAD256(a)              _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD256u(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD4_64(a, b, c, d)    _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d))
     #define ROL64in256(d, a, o)     d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
@@ -55,7 +55,7 @@
     #define ROL64in256_56(d, a)     d = _mm256_shuffle_epi8(a, CONST256(rho56))
 static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
 static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
-    #define STORE256(a, b)          _mm256_store_si256((V256 *)&(a), b)
+    #define STORE256(a, b)          _mm256_storeu_si256((V256 *)&(a), b)
     #define STORE256u(a, b)         _mm256_storeu_si256((V256 *)&(a), b)
     #define STORE2_128(ah, al, v)   _mm256_storeu2_m128i(&(ah), &(al), v)
     #define XOR256(a, b)            _mm256_xor_si256(a, b)
diff -ruw 240402_AIMer/Additional_Implementation/avx2/aimer192s/shake/KeccakP-1600-times4-SIMD256.c 240402_AIMer.patched/Additional_Implementation/avx2/aimer192s/shake/KeccakP-1600-times4-SIMD256.c
--- 240402_AIMer/Additional_Implementation/avx2/aimer192s/shake/KeccakP-1600-times4-SIMD256.c	2024-04-03 13:11:20.000000000 -0500
+++ 240402_AIMer.patched/Additional_Implementation/avx2/aimer192s/shake/KeccakP-1600-times4-SIMD256.c	2024-05-02 05:21:46.370685161 -0500
@@ -45,9 +45,9 @@
 
 #if defined(KeccakP1600times4_useAVX2)
     #define ANDnu256(a, b)          _mm256_andnot_si256(a, b)
-    #define CONST256(a)             _mm256_load_si256((const V256 *)&(a))
+    #define CONST256(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define CONST256_64(a)          _mm256_set1_epi64x(a)
-    #define LOAD256(a)              _mm256_load_si256((const V256 *)&(a))
+    #define LOAD256(a)              _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD256u(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD4_64(a, b, c, d)    _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d))
     #define ROL64in256(d, a, o)     d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
@@ -55,7 +55,7 @@
     #define ROL64in256_56(d, a)     d = _mm256_shuffle_epi8(a, CONST256(rho56))
 static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
 static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
-    #define STORE256(a, b)          _mm256_store_si256((V256 *)&(a), b)
+    #define STORE256(a, b)          _mm256_storeu_si256((V256 *)&(a), b)
     #define STORE256u(a, b)         _mm256_storeu_si256((V256 *)&(a), b)
     #define STORE2_128(ah, al, v)   _mm256_storeu2_m128i(&(ah), &(al), v)
     #define XOR256(a, b)            _mm256_xor_si256(a, b)
diff -ruw 240402_AIMer/Additional_Implementation/avx2/aimer256f/field.c 240402_AIMer.patched/Additional_Implementation/avx2/aimer256f/field.c
--- 240402_AIMer/Additional_Implementation/avx2/aimer256f/field.c	2024-05-02 05:21:30.822316398 -0500
+++ 240402_AIMer.patched/Additional_Implementation/avx2/aimer256f/field.c	2024-05-02 05:21:46.370685161 -0500
@@ -59,10 +59,10 @@
   __m128i irr = _mm_set_epi64x(0x0, 0x425);
 
   // polynomial multiplication
-  x[0] = _mm_load_si128((const __m128i*)&a[0]); // a0 a1
-  x[1] = _mm_load_si128((const __m128i*)&a[2]); // a2 a3
-  y[0] = _mm_load_si128((const __m128i*)&b[0]); // b0 b1
-  y[1] = _mm_load_si128((const __m128i*)&b[2]); // b2 b3
+  x[0] = _mm_loadu_si128((const __m128i*)&a[0]); // a0 a1
+  x[1] = _mm_loadu_si128((const __m128i*)&a[2]); // a2 a3
+  y[0] = _mm_loadu_si128((const __m128i*)&b[0]); // b0 b1
+  y[1] = _mm_loadu_si128((const __m128i*)&b[2]); // b2 b3
 
   t[0] = _mm_clmulepi64_si128(x[0], y[0], 0x10);
   t[1] = _mm_clmulepi64_si128(x[0], y[0], 0x01);
@@ -125,8 +125,8 @@
   t[0] = _mm_clmulepi64_si128(z[2], irr, 0x00); // 2 ^ 0
   z[0] = _mm_xor_si128(z[0], t[0]);
 
-  _mm_store_si128((__m128i*)&c[0], z[0]);
-  _mm_store_si128((__m128i*)&c[2], z[1]);
+  _mm_storeu_si128((__m128i*)&c[0], z[0]);
+  _mm_storeu_si128((__m128i*)&c[2], z[1]);
 }
 
 void GF_mul_N(const GF a[AIMER_NUM_MPC_PARTIES], const GF b,
@@ -135,15 +135,15 @@
   __m128i x[2], y[3], z[4], t[4];
   __m128i irr = _mm_set_epi64x(0x0, 0x425);
 
-  y[0] = _mm_load_si128((const __m128i*)&b[0]); // b0 b1
-  y[1] = _mm_load_si128((const __m128i*)&b[2]); // b2 b3
+  y[0] = _mm_loadu_si128((const __m128i*)&b[0]); // b0 b1
+  y[1] = _mm_loadu_si128((const __m128i*)&b[2]); // b2 b3
   y[2] = _mm_xor_si128(y[0], y[1]);
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party++)
   {
     // polynomial multiplication
-    x[0] = _mm_load_si128((const __m128i*)&a[party][0]); // a0 a1
-    x[1] = _mm_load_si128((const __m128i*)&a[party][2]); // a2 a3
+    x[0] = _mm_loadu_si128((const __m128i*)&a[party][0]); // a0 a1
+    x[1] = _mm_loadu_si128((const __m128i*)&a[party][2]); // a2 a3
 
     t[0] = _mm_clmulepi64_si128(x[0], y[0], 0x10);
     t[1] = _mm_clmulepi64_si128(x[0], y[0], 0x01);
@@ -205,8 +205,8 @@
     t[0] = _mm_clmulepi64_si128(z[2], irr, 0x00); // 2 ^ 0
     z[0] = _mm_xor_si128(z[0], t[0]);
 
-    _mm_store_si128((__m128i*)&c[party][0], z[0]);
-    _mm_store_si128((__m128i*)&c[party][2], z[1]);
+    _mm_storeu_si128((__m128i*)&c[party][0], z[0]);
+    _mm_storeu_si128((__m128i*)&c[party][2], z[1]);
   }
 }
 
@@ -216,12 +216,12 @@
   __m128i irr = _mm_set_epi64x(0x0, 0x425);
 
   // polynomial multiplication
-  x[0] = _mm_load_si128((const __m128i*)&a[0]); // a0 a1
-  x[1] = _mm_load_si128((const __m128i*)&a[2]); // a2 a3
-  y[0] = _mm_load_si128((const __m128i*)&b[0]); // b0 b1
-  y[1] = _mm_load_si128((const __m128i*)&b[2]); // b2 b3
-  z[0] = _mm_load_si128((const __m128i*)&c[0]);
-  z[1] = _mm_load_si128((const __m128i*)&c[2]);
+  x[0] = _mm_loadu_si128((const __m128i*)&a[0]); // a0 a1
+  x[1] = _mm_loadu_si128((const __m128i*)&a[2]); // a2 a3
+  y[0] = _mm_loadu_si128((const __m128i*)&b[0]); // b0 b1
+  y[1] = _mm_loadu_si128((const __m128i*)&b[2]); // b2 b3
+  z[0] = _mm_loadu_si128((const __m128i*)&c[0]);
+  z[1] = _mm_loadu_si128((const __m128i*)&c[2]);
   z[2] = _mm_setzero_si128();
   z[3] = _mm_setzero_si128();
 
@@ -294,8 +294,8 @@
   t[0] = _mm_clmulepi64_si128(z[2], irr, 0x00); // 2 ^ 0
   z[0] = _mm_xor_si128(z[0], t[0]);
 
-  _mm_store_si128((__m128i*)&c[0], z[0]);
-  _mm_store_si128((__m128i*)&c[2], z[1]);
+  _mm_storeu_si128((__m128i*)&c[0], z[0]);
+  _mm_storeu_si128((__m128i*)&c[2], z[1]);
 }
 
 void GF_mul_add_N(const GF a[AIMER_NUM_MPC_PARTIES], const GF b,
@@ -304,17 +304,17 @@
   __m128i x[2], y[3], z[4], t[4];
   __m128i irr = _mm_set_epi64x(0x0, 0x425);
 
-  y[0] = _mm_load_si128((const __m128i*)&b[0]); // b0 b1
-  y[1] = _mm_load_si128((const __m128i*)&b[2]); // b2 b3
+  y[0] = _mm_loadu_si128((const __m128i*)&b[0]); // b0 b1
+  y[1] = _mm_loadu_si128((const __m128i*)&b[2]); // b2 b3
   y[2] = _mm_xor_si128(y[0], y[1]);
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party++)
   {
     // polynomial multiplication
-    x[0] = _mm_load_si128((const __m128i*)&a[party][0]); // a0 a1
-    x[1] = _mm_load_si128((const __m128i*)&a[party][2]); // a2 a3
-    z[0] = _mm_load_si128((const __m128i*)&c[party][0]);
-    z[1] = _mm_load_si128((const __m128i*)&c[party][2]);
+    x[0] = _mm_loadu_si128((const __m128i*)&a[party][0]); // a0 a1
+    x[1] = _mm_loadu_si128((const __m128i*)&a[party][2]); // a2 a3
+    z[0] = _mm_loadu_si128((const __m128i*)&c[party][0]);
+    z[1] = _mm_loadu_si128((const __m128i*)&c[party][2]);
     z[2] = _mm_setzero_si128();
     z[3] = _mm_setzero_si128();
 
@@ -386,8 +386,8 @@
     t[0] = _mm_clmulepi64_si128(z[2], irr, 0x00); // 2 ^ 0
     z[0] = _mm_xor_si128(z[0], t[0]);
 
-    _mm_store_si128((__m128i*)&c[party][0], z[0]);
-    _mm_store_si128((__m128i*)&c[party][2], z[1]);
+    _mm_storeu_si128((__m128i*)&c[party][0], z[0]);
+    _mm_storeu_si128((__m128i*)&c[party][2], z[1]);
   }
 }
 
@@ -397,8 +397,8 @@
   __m128i irr = _mm_set_epi64x(0x0, 0x425);
 
   // polynomial multiplication
-  x[0] = _mm_load_si128((const __m128i*)&a[0]); // a0 a1
-  x[1] = _mm_load_si128((const __m128i*)&a[2]); // a2 a3
+  x[0] = _mm_loadu_si128((const __m128i*)&a[0]); // a0 a1
+  x[1] = _mm_loadu_si128((const __m128i*)&a[2]); // a2 a3
 
   z[0] = _mm_clmulepi64_si128(x[0], x[0], 0x00);
   z[1] = _mm_clmulepi64_si128(x[0], x[0], 0x11);
@@ -419,8 +419,8 @@
   x[0] = _mm_clmulepi64_si128(z[2], irr, 0x00); // 2 ^ 0
   z[0] = _mm_xor_si128(z[0], x[0]);
 
-  _mm_store_si128((__m128i*)&c[0], z[0]);
-  _mm_store_si128((__m128i*)&c[2], z[1]);
+  _mm_storeu_si128((__m128i*)&c[0], z[0]);
+  _mm_storeu_si128((__m128i*)&c[2], z[1]);
 }
 
 void GF_sqr_N(const GF a[AIMER_NUM_MPC_PARTIES],
@@ -431,10 +431,10 @@
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 2)
   {
     // polynomial multiplication
-    x[0] = _mm_load_si128((const __m128i*)&a[party][0]);     // a0 a1
-    x[1] = _mm_load_si128((const __m128i*)&a[party][2]);     // a2 a3
-    x[2] = _mm_load_si128((const __m128i*)&a[party + 1][0]); // a0 a1
-    x[3] = _mm_load_si128((const __m128i*)&a[party + 1][2]); // a2 a3
+    x[0] = _mm_loadu_si128((const __m128i*)&a[party][0]);     // a0 a1
+    x[1] = _mm_loadu_si128((const __m128i*)&a[party][2]);     // a2 a3
+    x[2] = _mm_loadu_si128((const __m128i*)&a[party + 1][0]); // a0 a1
+    x[3] = _mm_loadu_si128((const __m128i*)&a[party + 1][2]); // a2 a3
 
     z[0] = _mm_clmulepi64_si128(x[0], x[0], 0x00);
     z[1] = _mm_clmulepi64_si128(x[0], x[0], 0x11);
@@ -472,10 +472,10 @@
     z[0] = _mm_xor_si128(z[0], x[0]);
     z[4] = _mm_xor_si128(z[4], x[1]);
 
-    _mm_store_si128((__m128i*)&c[party][0], z[0]);
-    _mm_store_si128((__m128i*)&c[party][2], z[1]);
-    _mm_store_si128((__m128i*)&c[party + 1][0], z[4]);
-    _mm_store_si128((__m128i*)&c[party + 1][2], z[5]);
+    _mm_storeu_si128((__m128i*)&c[party][0], z[0]);
+    _mm_storeu_si128((__m128i*)&c[party][2], z[1]);
+    _mm_storeu_si128((__m128i*)&c[party + 1][0], z[4]);
+    _mm_storeu_si128((__m128i*)&c[party + 1][2], z[5]);
   }
 }
 
@@ -497,10 +497,10 @@
     mask = _mm256_sllv_epi64(mask, shift);
     for (int row = 64 * (i + 1); row > 64 * i; row -= 4)
     {
-      m0 = _mm256_load_si256((const __m256i*)b[row - 4]);
-      m1 = _mm256_load_si256((const __m256i*)b[row - 3]);
-      m2 = _mm256_load_si256((const __m256i*)b[row - 2]);
-      m3 = _mm256_load_si256((const __m256i*)b[row - 1]);
+      m0 = _mm256_loadu_si256((const __m256i*)b[row - 4]);
+      m1 = _mm256_loadu_si256((const __m256i*)b[row - 3]);
+      m2 = _mm256_loadu_si256((const __m256i*)b[row - 2]);
+      m3 = _mm256_loadu_si256((const __m256i*)b[row - 1]);
 
       a0 = _mm256_permute4x64_epi64(mask, 0b00000000);
       a1 = _mm256_permute4x64_epi64(mask, 0b01010101);
@@ -523,7 +523,7 @@
   c0 = _mm256_xor_si256(c0, c1);
   c2 = _mm256_xor_si256(c2, c3);
   c0 = _mm256_xor_si256(c0, c2);
-  _mm256_store_si256((__m256i*)c, c0);
+  _mm256_storeu_si256((__m256i*)c, c0);
 }
 
 void GF_transposed_matmul_add_N(const GF a[AIMER_NUM_MPC_PARTIES],
@@ -538,9 +538,9 @@
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 2)
   {
-    c0 = _mm256_load_si256((const __m256i*)c[party]);
+    c0 = _mm256_loadu_si256((const __m256i*)c[party]);
     c1 = _mm256_setzero_si256();
-    c2 = _mm256_load_si256((const __m256i*)c[party + 1]);
+    c2 = _mm256_loadu_si256((const __m256i*)c[party + 1]);
     c3 = _mm256_setzero_si256();
 
     for (int i = 3; i >= 0; i--)
@@ -551,10 +551,10 @@
       mask2 = _mm256_sllv_epi64(mask2, shift);
       for (int row = 64 * (i + 1); row > 64 * i; row -= 4)
       {
-        m0 = _mm256_load_si256((const __m256i*)b[row - 4]);
-        m1 = _mm256_load_si256((const __m256i*)b[row - 3]);
-        m2 = _mm256_load_si256((const __m256i*)b[row - 2]);
-        m3 = _mm256_load_si256((const __m256i*)b[row - 1]);
+        m0 = _mm256_loadu_si256((const __m256i*)b[row - 4]);
+        m1 = _mm256_loadu_si256((const __m256i*)b[row - 3]);
+        m2 = _mm256_loadu_si256((const __m256i*)b[row - 2]);
+        m3 = _mm256_loadu_si256((const __m256i*)b[row - 1]);
 
         a0 = _mm256_permute4x64_epi64(mask1, 0b00000000);
         a1 = _mm256_permute4x64_epi64(mask1, 0b01010101);
@@ -602,8 +602,8 @@
     }
     c0 = _mm256_xor_si256(c0, c1);
     c2 = _mm256_xor_si256(c2, c3);
-    _mm256_store_si256((__m256i*)c[party], c0);
-    _mm256_store_si256((__m256i*)c[party + 1], c2);
+    _mm256_storeu_si256((__m256i*)c[party], c0);
+    _mm256_storeu_si256((__m256i*)c[party + 1], c2);
   }
 }
 
@@ -613,19 +613,19 @@
 {
   __m128i x[2], y[3], z[4], t[4];
 
-  y[0] = _mm_load_si128((const __m128i*)&b[0]); // b0 b1
-  y[1] = _mm_load_si128((const __m128i*)&b[2]); // b2 b3
+  y[0] = _mm_loadu_si128((const __m128i*)&b[0]); // b0 b1
+  y[1] = _mm_loadu_si128((const __m128i*)&b[2]); // b2 b3
   y[2] = _mm_xor_si128(y[0], y[1]);
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party++)
   {
     // polynomial multiplication
-    x[0] = _mm_load_si128((const __m128i*)&a[party][0]); // a0 a1
-    x[1] = _mm_load_si128((const __m128i*)&a[party][2]); // a2 a3
-    z[0] = _mm_load_si128((const __m128i*)&lo[party][0]);
-    z[1] = _mm_load_si128((const __m128i*)&lo[party][2]);
-    z[2] = _mm_load_si128((const __m128i*)&hi[party][0]);
-    z[3] = _mm_load_si128((const __m128i*)&hi[party][2]);
+    x[0] = _mm_loadu_si128((const __m128i*)&a[party][0]); // a0 a1
+    x[1] = _mm_loadu_si128((const __m128i*)&a[party][2]); // a2 a3
+    z[0] = _mm_loadu_si128((const __m128i*)&lo[party][0]);
+    z[1] = _mm_loadu_si128((const __m128i*)&lo[party][2]);
+    z[2] = _mm_loadu_si128((const __m128i*)&hi[party][0]);
+    z[3] = _mm_loadu_si128((const __m128i*)&hi[party][2]);
 
     // [t2 t3] = x[0] * y[0]
     t[0] = _mm_clmulepi64_si128(x[0], y[0], 0x10);
@@ -681,10 +681,10 @@
     z[1] = _mm_xor_si128(z[1], t[2]);
     z[2] = _mm_xor_si128(z[2], t[3]);
 
-    _mm_store_si128((__m128i*)&lo[party][0], z[0]);
-    _mm_store_si128((__m128i*)&lo[party][2], z[1]);
-    _mm_store_si128((__m128i*)&hi[party][0], z[2]);
-    _mm_store_si128((__m128i*)&hi[party][2], z[3]);
+    _mm_storeu_si128((__m128i*)&lo[party][0], z[0]);
+    _mm_storeu_si128((__m128i*)&lo[party][2], z[1]);
+    _mm_storeu_si128((__m128i*)&hi[party][0], z[2]);
+    _mm_storeu_si128((__m128i*)&hi[party][2], z[3]);
   }
 }
 
@@ -696,15 +696,15 @@
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 2)
   {
     // load
-    z[0] = _mm_load_si128((const __m128i*)&lo[party][0]);
-    z[1] = _mm_load_si128((const __m128i*)&lo[party][2]);
-    z[2] = _mm_load_si128((const __m128i*)&hi[party][0]);
-    z[3] = _mm_load_si128((const __m128i*)&hi[party][2]);
-
-    z[4] = _mm_load_si128((const __m128i*)&lo[party + 1][0]);
-    z[5] = _mm_load_si128((const __m128i*)&lo[party + 1][2]);
-    z[6] = _mm_load_si128((const __m128i*)&hi[party + 1][0]);
-    z[7] = _mm_load_si128((const __m128i*)&hi[party + 1][2]);
+    z[0] = _mm_loadu_si128((const __m128i*)&lo[party][0]);
+    z[1] = _mm_loadu_si128((const __m128i*)&lo[party][2]);
+    z[2] = _mm_loadu_si128((const __m128i*)&hi[party][0]);
+    z[3] = _mm_loadu_si128((const __m128i*)&hi[party][2]);
+
+    z[4] = _mm_loadu_si128((const __m128i*)&lo[party + 1][0]);
+    z[5] = _mm_loadu_si128((const __m128i*)&lo[party + 1][2]);
+    z[6] = _mm_loadu_si128((const __m128i*)&hi[party + 1][0]);
+    z[7] = _mm_loadu_si128((const __m128i*)&hi[party + 1][2]);
 
     // modular reduction
     x[0] = _mm_clmulepi64_si128(z[2], irr, 0x01); // 2 ^ 64
@@ -732,9 +732,9 @@
     z[0] = _mm_xor_si128(z[0], x[0]);
     z[4] = _mm_xor_si128(z[4], x[1]);
 
-    _mm_store_si128((__m128i*)&lo[party][0], z[0]);
-    _mm_store_si128((__m128i*)&lo[party][2], z[1]);
-    _mm_store_si128((__m128i*)&lo[party + 1][0], z[4]);
-    _mm_store_si128((__m128i*)&lo[party + 1][2], z[5]);
+    _mm_storeu_si128((__m128i*)&lo[party][0], z[0]);
+    _mm_storeu_si128((__m128i*)&lo[party][2], z[1]);
+    _mm_storeu_si128((__m128i*)&lo[party + 1][0], z[4]);
+    _mm_storeu_si128((__m128i*)&lo[party + 1][2], z[5]);
   }
 }
Only in 240402_AIMer.patched/Additional_Implementation/avx2/aimer256f: field.c.orig
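
For the 256-bit parameter sets (aimer256f above, aimer256s below) each GF element spans four 64-bit words, so the substitution is applied to both 128-bit halves of every element, mirroring the &a[0] / &a[2] access pattern used by GF_mul and GF_sqr in field.c. A minimal hedged sketch of that shape, again with an illustrative typedef and a demo helper that is not the actual field arithmetic:

#include <stdint.h>
#include <emmintrin.h>
#include <wmmintrin.h>   /* PCLMUL: _mm_clmulepi64_si128 */

typedef uint64_t GF[4];  /* assumption: 256-bit field element, as in aimer256f/s */

/* Load a 256-bit element as two unaligned 128-bit halves, carry-lessly
 * square the low 64-bit word, and store the halves back unreduced.
 * Purely a demonstration of the loadu/storeu access pattern. */
static inline void gf256_demo(GF out, const GF in)
{
  __m128i lo = _mm_loadu_si128((const __m128i *)&in[0]);  /* a0 a1 */
  __m128i hi = _mm_loadu_si128((const __m128i *)&in[2]);  /* a2 a3 */
  __m128i sq = _mm_clmulepi64_si128(lo, lo, 0x00);        /* a0 * a0, carry-less */
  _mm_storeu_si128((__m128i *)&out[0], sq);
  _mm_storeu_si128((__m128i *)&out[2], hi);
}
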
diff -ruw 240402_AIMer/Additional_Implementation/avx2/aimer256f/shake/KeccakP-1600-times4-SIMD256.c 240402_AIMer.patched/Additional_Implementation/avx2/aimer256f/shake/KeccakP-1600-times4-SIMD256.c
--- 240402_AIMer/Additional_Implementation/avx2/aimer256f/shake/KeccakP-1600-times4-SIMD256.c	2024-04-03 13:11:22.000000000 -0500
+++ 240402_AIMer.patched/Additional_Implementation/avx2/aimer256f/shake/KeccakP-1600-times4-SIMD256.c	2024-05-02 05:21:46.370685161 -0500
@@ -45,9 +45,9 @@
 
 #if defined(KeccakP1600times4_useAVX2)
     #define ANDnu256(a, b)          _mm256_andnot_si256(a, b)
-    #define CONST256(a)             _mm256_load_si256((const V256 *)&(a))
+    #define CONST256(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define CONST256_64(a)          _mm256_set1_epi64x(a)
-    #define LOAD256(a)              _mm256_load_si256((const V256 *)&(a))
+    #define LOAD256(a)              _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD256u(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD4_64(a, b, c, d)    _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d))
     #define ROL64in256(d, a, o)     d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
@@ -55,7 +55,7 @@
     #define ROL64in256_56(d, a)     d = _mm256_shuffle_epi8(a, CONST256(rho56))
 static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
 static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
-    #define STORE256(a, b)          _mm256_store_si256((V256 *)&(a), b)
+    #define STORE256(a, b)          _mm256_storeu_si256((V256 *)&(a), b)
     #define STORE256u(a, b)         _mm256_storeu_si256((V256 *)&(a), b)
     #define STORE2_128(ah, al, v)   _mm256_storeu2_m128i(&(ah), &(al), v)
     #define XOR256(a, b)            _mm256_xor_si256(a, b)
diff -ruw 240402_AIMer/Additional_Implementation/avx2/aimer256s/field.c 240402_AIMer.patched/Additional_Implementation/avx2/aimer256s/field.c
--- 240402_AIMer/Additional_Implementation/avx2/aimer256s/field.c	2024-05-02 05:21:30.822316398 -0500
+++ 240402_AIMer.patched/Additional_Implementation/avx2/aimer256s/field.c	2024-05-02 05:21:46.370685161 -0500
@@ -59,10 +59,10 @@
   __m128i irr = _mm_set_epi64x(0x0, 0x425);
 
   // polynomial multiplication
-  x[0] = _mm_load_si128((const __m128i*)&a[0]); // a0 a1
-  x[1] = _mm_load_si128((const __m128i*)&a[2]); // a2 a3
-  y[0] = _mm_load_si128((const __m128i*)&b[0]); // b0 b1
-  y[1] = _mm_load_si128((const __m128i*)&b[2]); // b2 b3
+  x[0] = _mm_loadu_si128((const __m128i*)&a[0]); // a0 a1
+  x[1] = _mm_loadu_si128((const __m128i*)&a[2]); // a2 a3
+  y[0] = _mm_loadu_si128((const __m128i*)&b[0]); // b0 b1
+  y[1] = _mm_loadu_si128((const __m128i*)&b[2]); // b2 b3
 
   t[0] = _mm_clmulepi64_si128(x[0], y[0], 0x10);
   t[1] = _mm_clmulepi64_si128(x[0], y[0], 0x01);
@@ -125,8 +125,8 @@
   t[0] = _mm_clmulepi64_si128(z[2], irr, 0x00); // 2 ^ 0
   z[0] = _mm_xor_si128(z[0], t[0]);
 
-  _mm_store_si128((__m128i*)&c[0], z[0]);
-  _mm_store_si128((__m128i*)&c[2], z[1]);
+  _mm_storeu_si128((__m128i*)&c[0], z[0]);
+  _mm_storeu_si128((__m128i*)&c[2], z[1]);
 }
 
 void GF_mul_N(const GF a[AIMER_NUM_MPC_PARTIES], const GF b,
@@ -135,15 +135,15 @@
   __m128i x[2], y[3], z[4], t[4];
   __m128i irr = _mm_set_epi64x(0x0, 0x425);
 
-  y[0] = _mm_load_si128((const __m128i*)&b[0]); // b0 b1
-  y[1] = _mm_load_si128((const __m128i*)&b[2]); // b2 b3
+  y[0] = _mm_loadu_si128((const __m128i*)&b[0]); // b0 b1
+  y[1] = _mm_loadu_si128((const __m128i*)&b[2]); // b2 b3
   y[2] = _mm_xor_si128(y[0], y[1]);
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party++)
   {
     // polynomial multiplication
-    x[0] = _mm_load_si128((const __m128i*)&a[party][0]); // a0 a1
-    x[1] = _mm_load_si128((const __m128i*)&a[party][2]); // a2 a3
+    x[0] = _mm_loadu_si128((const __m128i*)&a[party][0]); // a0 a1
+    x[1] = _mm_loadu_si128((const __m128i*)&a[party][2]); // a2 a3
 
     t[0] = _mm_clmulepi64_si128(x[0], y[0], 0x10);
     t[1] = _mm_clmulepi64_si128(x[0], y[0], 0x01);
@@ -205,8 +205,8 @@
     t[0] = _mm_clmulepi64_si128(z[2], irr, 0x00); // 2 ^ 0
     z[0] = _mm_xor_si128(z[0], t[0]);
 
-    _mm_store_si128((__m128i*)&c[party][0], z[0]);
-    _mm_store_si128((__m128i*)&c[party][2], z[1]);
+    _mm_storeu_si128((__m128i*)&c[party][0], z[0]);
+    _mm_storeu_si128((__m128i*)&c[party][2], z[1]);
   }
 }
 
@@ -216,12 +216,12 @@
   __m128i irr = _mm_set_epi64x(0x0, 0x425);
 
   // polynomial multiplication
-  x[0] = _mm_load_si128((const __m128i*)&a[0]); // a0 a1
-  x[1] = _mm_load_si128((const __m128i*)&a[2]); // a2 a3
-  y[0] = _mm_load_si128((const __m128i*)&b[0]); // b0 b1
-  y[1] = _mm_load_si128((const __m128i*)&b[2]); // b2 b3
-  z[0] = _mm_load_si128((const __m128i*)&c[0]);
-  z[1] = _mm_load_si128((const __m128i*)&c[2]);
+  x[0] = _mm_loadu_si128((const __m128i*)&a[0]); // a0 a1
+  x[1] = _mm_loadu_si128((const __m128i*)&a[2]); // a2 a3
+  y[0] = _mm_loadu_si128((const __m128i*)&b[0]); // b0 b1
+  y[1] = _mm_loadu_si128((const __m128i*)&b[2]); // b2 b3
+  z[0] = _mm_loadu_si128((const __m128i*)&c[0]);
+  z[1] = _mm_loadu_si128((const __m128i*)&c[2]);
   z[2] = _mm_setzero_si128();
   z[3] = _mm_setzero_si128();
 
@@ -294,8 +294,8 @@
   t[0] = _mm_clmulepi64_si128(z[2], irr, 0x00); // 2 ^ 0
   z[0] = _mm_xor_si128(z[0], t[0]);
 
-  _mm_store_si128((__m128i*)&c[0], z[0]);
-  _mm_store_si128((__m128i*)&c[2], z[1]);
+  _mm_storeu_si128((__m128i*)&c[0], z[0]);
+  _mm_storeu_si128((__m128i*)&c[2], z[1]);
 }
 
 void GF_mul_add_N(const GF a[AIMER_NUM_MPC_PARTIES], const GF b,
@@ -304,17 +304,17 @@
   __m128i x[2], y[3], z[4], t[4];
   __m128i irr = _mm_set_epi64x(0x0, 0x425);
 
-  y[0] = _mm_load_si128((const __m128i*)&b[0]); // b0 b1
-  y[1] = _mm_load_si128((const __m128i*)&b[2]); // b2 b3
+  y[0] = _mm_loadu_si128((const __m128i*)&b[0]); // b0 b1
+  y[1] = _mm_loadu_si128((const __m128i*)&b[2]); // b2 b3
   y[2] = _mm_xor_si128(y[0], y[1]);
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party++)
   {
     // polynomial multiplication
-    x[0] = _mm_load_si128((const __m128i*)&a[party][0]); // a0 a1
-    x[1] = _mm_load_si128((const __m128i*)&a[party][2]); // a2 a3
-    z[0] = _mm_load_si128((const __m128i*)&c[party][0]);
-    z[1] = _mm_load_si128((const __m128i*)&c[party][2]);
+    x[0] = _mm_loadu_si128((const __m128i*)&a[party][0]); // a0 a1
+    x[1] = _mm_loadu_si128((const __m128i*)&a[party][2]); // a2 a3
+    z[0] = _mm_loadu_si128((const __m128i*)&c[party][0]);
+    z[1] = _mm_loadu_si128((const __m128i*)&c[party][2]);
     z[2] = _mm_setzero_si128();
     z[3] = _mm_setzero_si128();
 
@@ -386,8 +386,8 @@
     t[0] = _mm_clmulepi64_si128(z[2], irr, 0x00); // 2 ^ 0
     z[0] = _mm_xor_si128(z[0], t[0]);
 
-    _mm_store_si128((__m128i*)&c[party][0], z[0]);
-    _mm_store_si128((__m128i*)&c[party][2], z[1]);
+    _mm_storeu_si128((__m128i*)&c[party][0], z[0]);
+    _mm_storeu_si128((__m128i*)&c[party][2], z[1]);
   }
 }
 
@@ -397,8 +397,8 @@
   __m128i irr = _mm_set_epi64x(0x0, 0x425);
 
   // polynomial multiplication
-  x[0] = _mm_load_si128((const __m128i*)&a[0]); // a0 a1
-  x[1] = _mm_load_si128((const __m128i*)&a[2]); // a2 a3
+  x[0] = _mm_loadu_si128((const __m128i*)&a[0]); // a0 a1
+  x[1] = _mm_loadu_si128((const __m128i*)&a[2]); // a2 a3
 
   z[0] = _mm_clmulepi64_si128(x[0], x[0], 0x00);
   z[1] = _mm_clmulepi64_si128(x[0], x[0], 0x11);
@@ -419,8 +419,8 @@
   x[0] = _mm_clmulepi64_si128(z[2], irr, 0x00); // 2 ^ 0
   z[0] = _mm_xor_si128(z[0], x[0]);
 
-  _mm_store_si128((__m128i*)&c[0], z[0]);
-  _mm_store_si128((__m128i*)&c[2], z[1]);
+  _mm_storeu_si128((__m128i*)&c[0], z[0]);
+  _mm_storeu_si128((__m128i*)&c[2], z[1]);
 }
 
 void GF_sqr_N(const GF a[AIMER_NUM_MPC_PARTIES],
@@ -431,10 +431,10 @@
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 2)
   {
     // polynomial multiplication
-    x[0] = _mm_load_si128((const __m128i*)&a[party][0]);     // a0 a1
-    x[1] = _mm_load_si128((const __m128i*)&a[party][2]);     // a2 a3
-    x[2] = _mm_load_si128((const __m128i*)&a[party + 1][0]); // a0 a1
-    x[3] = _mm_load_si128((const __m128i*)&a[party + 1][2]); // a2 a3
+    x[0] = _mm_loadu_si128((const __m128i*)&a[party][0]);     // a0 a1
+    x[1] = _mm_loadu_si128((const __m128i*)&a[party][2]);     // a2 a3
+    x[2] = _mm_loadu_si128((const __m128i*)&a[party + 1][0]); // a0 a1
+    x[3] = _mm_loadu_si128((const __m128i*)&a[party + 1][2]); // a2 a3
 
     z[0] = _mm_clmulepi64_si128(x[0], x[0], 0x00);
     z[1] = _mm_clmulepi64_si128(x[0], x[0], 0x11);
@@ -472,10 +472,10 @@
     z[0] = _mm_xor_si128(z[0], x[0]);
     z[4] = _mm_xor_si128(z[4], x[1]);
 
-    _mm_store_si128((__m128i*)&c[party][0], z[0]);
-    _mm_store_si128((__m128i*)&c[party][2], z[1]);
-    _mm_store_si128((__m128i*)&c[party + 1][0], z[4]);
-    _mm_store_si128((__m128i*)&c[party + 1][2], z[5]);
+    _mm_storeu_si128((__m128i*)&c[party][0], z[0]);
+    _mm_storeu_si128((__m128i*)&c[party][2], z[1]);
+    _mm_storeu_si128((__m128i*)&c[party + 1][0], z[4]);
+    _mm_storeu_si128((__m128i*)&c[party + 1][2], z[5]);
   }
 }
 
@@ -497,10 +497,10 @@
     mask = _mm256_sllv_epi64(mask, shift);
     for (int row = 64 * (i + 1); row > 64 * i; row -= 4)
     {
-      m0 = _mm256_load_si256((const __m256i*)b[row - 4]);
-      m1 = _mm256_load_si256((const __m256i*)b[row - 3]);
-      m2 = _mm256_load_si256((const __m256i*)b[row - 2]);
-      m3 = _mm256_load_si256((const __m256i*)b[row - 1]);
+      m0 = _mm256_loadu_si256((const __m256i*)b[row - 4]);
+      m1 = _mm256_loadu_si256((const __m256i*)b[row - 3]);
+      m2 = _mm256_loadu_si256((const __m256i*)b[row - 2]);
+      m3 = _mm256_loadu_si256((const __m256i*)b[row - 1]);
 
       a0 = _mm256_permute4x64_epi64(mask, 0b00000000);
       a1 = _mm256_permute4x64_epi64(mask, 0b01010101);
@@ -523,7 +523,7 @@
   c0 = _mm256_xor_si256(c0, c1);
   c2 = _mm256_xor_si256(c2, c3);
   c0 = _mm256_xor_si256(c0, c2);
-  _mm256_store_si256((__m256i*)c, c0);
+  _mm256_storeu_si256((__m256i*)c, c0);
 }
 
 void GF_transposed_matmul_add_N(const GF a[AIMER_NUM_MPC_PARTIES],
@@ -538,9 +538,9 @@
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 2)
   {
-    c0 = _mm256_load_si256((const __m256i*)c[party]);
+    c0 = _mm256_loadu_si256((const __m256i*)c[party]);
     c1 = _mm256_setzero_si256();
-    c2 = _mm256_load_si256((const __m256i*)c[party + 1]);
+    c2 = _mm256_loadu_si256((const __m256i*)c[party + 1]);
     c3 = _mm256_setzero_si256();
 
     for (int i = 3; i >= 0; i--)
@@ -551,10 +551,10 @@
       mask2 = _mm256_sllv_epi64(mask2, shift);
       for (int row = 64 * (i + 1); row > 64 * i; row -= 4)
       {
-        m0 = _mm256_load_si256((const __m256i*)b[row - 4]);
-        m1 = _mm256_load_si256((const __m256i*)b[row - 3]);
-        m2 = _mm256_load_si256((const __m256i*)b[row - 2]);
-        m3 = _mm256_load_si256((const __m256i*)b[row - 1]);
+        m0 = _mm256_loadu_si256((const __m256i*)b[row - 4]);
+        m1 = _mm256_loadu_si256((const __m256i*)b[row - 3]);
+        m2 = _mm256_loadu_si256((const __m256i*)b[row - 2]);
+        m3 = _mm256_loadu_si256((const __m256i*)b[row - 1]);
 
         a0 = _mm256_permute4x64_epi64(mask1, 0b00000000);
         a1 = _mm256_permute4x64_epi64(mask1, 0b01010101);
@@ -602,8 +602,8 @@
     }
     c0 = _mm256_xor_si256(c0, c1);
     c2 = _mm256_xor_si256(c2, c3);
-    _mm256_store_si256((__m256i*)c[party], c0);
-    _mm256_store_si256((__m256i*)c[party + 1], c2);
+    _mm256_storeu_si256((__m256i*)c[party], c0);
+    _mm256_storeu_si256((__m256i*)c[party + 1], c2);
   }
 }
 
@@ -613,19 +613,19 @@
 {
   __m128i x[2], y[3], z[4], t[4];
 
-  y[0] = _mm_load_si128((const __m128i*)&b[0]); // b0 b1
-  y[1] = _mm_load_si128((const __m128i*)&b[2]); // b2 b3
+  y[0] = _mm_loadu_si128((const __m128i*)&b[0]); // b0 b1
+  y[1] = _mm_loadu_si128((const __m128i*)&b[2]); // b2 b3
   y[2] = _mm_xor_si128(y[0], y[1]);
 
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party++)
   {
     // polynomial multiplication
-    x[0] = _mm_load_si128((const __m128i*)&a[party][0]); // a0 a1
-    x[1] = _mm_load_si128((const __m128i*)&a[party][2]); // a2 a3
-    z[0] = _mm_load_si128((const __m128i*)&lo[party][0]);
-    z[1] = _mm_load_si128((const __m128i*)&lo[party][2]);
-    z[2] = _mm_load_si128((const __m128i*)&hi[party][0]);
-    z[3] = _mm_load_si128((const __m128i*)&hi[party][2]);
+    x[0] = _mm_loadu_si128((const __m128i*)&a[party][0]); // a0 a1
+    x[1] = _mm_loadu_si128((const __m128i*)&a[party][2]); // a2 a3
+    z[0] = _mm_loadu_si128((const __m128i*)&lo[party][0]);
+    z[1] = _mm_loadu_si128((const __m128i*)&lo[party][2]);
+    z[2] = _mm_loadu_si128((const __m128i*)&hi[party][0]);
+    z[3] = _mm_loadu_si128((const __m128i*)&hi[party][2]);
 
     // [t2 t3] = x[0] * y[0]
     t[0] = _mm_clmulepi64_si128(x[0], y[0], 0x10);
@@ -681,10 +681,10 @@
     z[1] = _mm_xor_si128(z[1], t[2]);
     z[2] = _mm_xor_si128(z[2], t[3]);
 
-    _mm_store_si128((__m128i*)&lo[party][0], z[0]);
-    _mm_store_si128((__m128i*)&lo[party][2], z[1]);
-    _mm_store_si128((__m128i*)&hi[party][0], z[2]);
-    _mm_store_si128((__m128i*)&hi[party][2], z[3]);
+    _mm_storeu_si128((__m128i*)&lo[party][0], z[0]);
+    _mm_storeu_si128((__m128i*)&lo[party][2], z[1]);
+    _mm_storeu_si128((__m128i*)&hi[party][0], z[2]);
+    _mm_storeu_si128((__m128i*)&hi[party][2], z[3]);
   }
 }
 
@@ -696,15 +696,15 @@
   for (size_t party = 0; party < AIMER_NUM_MPC_PARTIES; party += 2)
   {
     // load
-    z[0] = _mm_load_si128((const __m128i*)&lo[party][0]);
-    z[1] = _mm_load_si128((const __m128i*)&lo[party][2]);
-    z[2] = _mm_load_si128((const __m128i*)&hi[party][0]);
-    z[3] = _mm_load_si128((const __m128i*)&hi[party][2]);
-
-    z[4] = _mm_load_si128((const __m128i*)&lo[party + 1][0]);
-    z[5] = _mm_load_si128((const __m128i*)&lo[party + 1][2]);
-    z[6] = _mm_load_si128((const __m128i*)&hi[party + 1][0]);
-    z[7] = _mm_load_si128((const __m128i*)&hi[party + 1][2]);
+    z[0] = _mm_loadu_si128((const __m128i*)&lo[party][0]);
+    z[1] = _mm_loadu_si128((const __m128i*)&lo[party][2]);
+    z[2] = _mm_loadu_si128((const __m128i*)&hi[party][0]);
+    z[3] = _mm_loadu_si128((const __m128i*)&hi[party][2]);
+
+    z[4] = _mm_loadu_si128((const __m128i*)&lo[party + 1][0]);
+    z[5] = _mm_loadu_si128((const __m128i*)&lo[party + 1][2]);
+    z[6] = _mm_loadu_si128((const __m128i*)&hi[party + 1][0]);
+    z[7] = _mm_loadu_si128((const __m128i*)&hi[party + 1][2]);
 
     // modular reduction
     x[0] = _mm_clmulepi64_si128(z[2], irr, 0x01); // 2 ^ 64
@@ -732,9 +732,9 @@
     z[0] = _mm_xor_si128(z[0], x[0]);
     z[4] = _mm_xor_si128(z[4], x[1]);
 
-    _mm_store_si128((__m128i*)&lo[party][0], z[0]);
-    _mm_store_si128((__m128i*)&lo[party][2], z[1]);
-    _mm_store_si128((__m128i*)&lo[party + 1][0], z[4]);
-    _mm_store_si128((__m128i*)&lo[party + 1][2], z[5]);
+    _mm_storeu_si128((__m128i*)&lo[party][0], z[0]);
+    _mm_storeu_si128((__m128i*)&lo[party][2], z[1]);
+    _mm_storeu_si128((__m128i*)&lo[party + 1][0], z[4]);
+    _mm_storeu_si128((__m128i*)&lo[party + 1][2], z[5]);
   }
 }
Only in 240402_AIMer.patched/Additional_Implementation/avx2/aimer256s: field.c.orig
diff -ruw 240402_AIMer/Additional_Implementation/avx2/aimer256s/shake/KeccakP-1600-times4-SIMD256.c 240402_AIMer.patched/Additional_Implementation/avx2/aimer256s/shake/KeccakP-1600-times4-SIMD256.c
--- 240402_AIMer/Additional_Implementation/avx2/aimer256s/shake/KeccakP-1600-times4-SIMD256.c	2024-04-03 13:11:22.000000000 -0500
+++ 240402_AIMer.patched/Additional_Implementation/avx2/aimer256s/shake/KeccakP-1600-times4-SIMD256.c	2024-05-02 05:21:46.370685161 -0500
@@ -45,9 +45,9 @@
 
 #if defined(KeccakP1600times4_useAVX2)
     #define ANDnu256(a, b)          _mm256_andnot_si256(a, b)
-    #define CONST256(a)             _mm256_load_si256((const V256 *)&(a))
+    #define CONST256(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define CONST256_64(a)          _mm256_set1_epi64x(a)
-    #define LOAD256(a)              _mm256_load_si256((const V256 *)&(a))
+    #define LOAD256(a)              _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD256u(a)             _mm256_loadu_si256((const V256 *)&(a))
     #define LOAD4_64(a, b, c, d)    _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d))
     #define ROL64in256(d, a, o)     d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
@@ -55,7 +55,7 @@
     #define ROL64in256_56(d, a)     d = _mm256_shuffle_epi8(a, CONST256(rho56))
 static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
 static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
-    #define STORE256(a, b)          _mm256_store_si256((V256 *)&(a), b)
+    #define STORE256(a, b)          _mm256_storeu_si256((V256 *)&(a), b)
     #define STORE256u(a, b)         _mm256_storeu_si256((V256 *)&(a), b)
     #define STORE2_128(ah, al, v)   _mm256_storeu2_m128i(&(ah), &(al), v)
     #define XOR256(a, b)            _mm256_xor_si256(a, b)
