diff -ruw 240402_AIMer/Optimized_Implementation/aimer128f/field.c 240402_AIMer.patched/Optimized_Implementation/aimer128f/field.c
--- 240402_AIMer/Optimized_Implementation/aimer128f/field.c	2024-05-02 05:21:30.822316398 -0500
+++ 240402_AIMer.patched/Optimized_Implementation/aimer128f/field.c	2024-05-02 05:22:48.968165936 -0500
@@ -3,23 +3,6 @@
 #include "field.h"
 #include "portable_endian.h"
 
-// square the lower 32-bit of the input
-#define SQR_LOW(x) \
-  sqr_table[((x) >> 28) & 0xf] << 56 | sqr_table[((x) >> 24) & 0xf] << 48 | \
-  sqr_table[((x) >> 20) & 0xf] << 40 | sqr_table[((x) >> 16) & 0xf] << 32 | \
-  sqr_table[((x) >> 12) & 0xf] << 24 | sqr_table[((x) >>  8) & 0xf] << 16 | \
-  sqr_table[((x) >>  4) & 0xf] <<  8 | sqr_table[((x)      ) & 0xf]
-
-// square the upper 32-bit of the input
-#define SQR_HIGH(x) \
-  sqr_table[((x) >> 60)      ] << 56 | sqr_table[((x) >> 56) & 0xf] << 48 | \
-  sqr_table[((x) >> 52) & 0xf] << 40 | sqr_table[((x) >> 48) & 0xf] << 32 | \
-  sqr_table[((x) >> 44) & 0xf] << 24 | sqr_table[((x) >> 40) & 0xf] << 16 | \
-  sqr_table[((x) >> 36) & 0xf] <<  8 | sqr_table[((x) >> 32) & 0xf]
-
-const uint64_t sqr_table[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
-                                0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55};
-
 void poly64_mul(const uint64_t a, const uint64_t b, uint64_t *c1, uint64_t *c0);
 
 void GF_to_bytes(const GF in, uint8_t* out)
@@ -211,15 +194,36 @@
   c[0] ^= (t[0] << 1);
 }
 
+static void square64(uint64_t *z0,uint64_t *z1,uint64_t x) // spreads the 64 bits of x over 128 bits: input bit i ends up at bit 2*i of the pair (*z1:*z0), odd bits are zero
+{ // *z0 gets the spread low 32 bits of x, *z1 the spread high 32 bits; only shifts, ORs and ANDs with fixed masks are used (no table lookup, no branch)
+  const uint64_t C0 = 0x5555555555555555; // keeps every even bit position
+  const uint64_t C1 = 0x3333333333333333; // keeps the low 2 bits of every 4-bit group
+  const uint64_t C2 = 0x0f0f0f0f0f0f0f0f; // keeps the low 4 bits of every byte
+  const uint64_t C3 = 0x00ff00ff00ff00ff; // keeps the low byte of every 16-bit group
+  const uint64_t C4 = 0x0000ffff0000ffff; // keeps the low 16 bits of every 32-bit group
+  const uint64_t C5 = 0x00000000ffffffff; // keeps the low 32 bits
+  uint64_t y = x>>32; x &= C5; // y = high 32 bits of the input, x = low 32 bits; both halves go through the same cascade
+  x = (x | (x << 16)) & C4; // each 16-bit chunk now sits in the low half of a 32-bit group
+  y = (y | (y << 16)) & C4;
+  x = (x | (x << 8)) & C3; // each byte now sits in the low half of a 16-bit group
+  y = (y | (y << 8)) & C3;
+  x = (x | (x << 4)) & C2; // each nibble now sits in the low half of a byte
+  y = (y | (y << 4)) & C2;
+  x = (x | (x << 2)) & C1; // each bit pair now sits in the low half of a nibble
+  y = (y | (y << 2)) & C1;
+  x = (x | (x << 1)) & C0; // each input bit now sits alone at an even bit position
+  y = (y | (y << 1)) & C0;
+  *z0 = x; // low 64 bits of the result
+  *z1 = y; // high 64 bits of the result
+}
+
 void GF_sqr(const GF a, GF c)
 {
   uint64_t t = 0;
   uint64_t temp[4] = {0,};
 
-  temp[0] = SQR_LOW(a[0]);
-  temp[1] = SQR_HIGH(a[0]);
-  temp[2] = SQR_LOW(a[1]);
-  temp[3] = SQR_HIGH(a[1]);
+  square64(&temp[0],&temp[1],a[0]);
+  square64(&temp[2],&temp[3],a[1]);
 
   t = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
 
Only in 240402_AIMer.patched/Optimized_Implementation/aimer128f: field.c.orig
diff -ruw 240402_AIMer/Optimized_Implementation/aimer128s/field.c 240402_AIMer.patched/Optimized_Implementation/aimer128s/field.c
--- 240402_AIMer/Optimized_Implementation/aimer128s/field.c	2024-05-02 05:21:30.822316398 -0500
+++ 240402_AIMer.patched/Optimized_Implementation/aimer128s/field.c	2024-05-02 05:22:48.968165936 -0500
@@ -3,23 +3,6 @@
 #include "field.h"
 #include "portable_endian.h"
 
-// square the lower 32-bit of the input
-#define SQR_LOW(x) \
-  sqr_table[((x) >> 28) & 0xf] << 56 | sqr_table[((x) >> 24) & 0xf] << 48 | \
-  sqr_table[((x) >> 20) & 0xf] << 40 | sqr_table[((x) >> 16) & 0xf] << 32 | \
-  sqr_table[((x) >> 12) & 0xf] << 24 | sqr_table[((x) >>  8) & 0xf] << 16 | \
-  sqr_table[((x) >>  4) & 0xf] <<  8 | sqr_table[((x)      ) & 0xf]
-
-// square the upper 32-bit of the input
-#define SQR_HIGH(x) \
-  sqr_table[((x) >> 60)      ] << 56 | sqr_table[((x) >> 56) & 0xf] << 48 | \
-  sqr_table[((x) >> 52) & 0xf] << 40 | sqr_table[((x) >> 48) & 0xf] << 32 | \
-  sqr_table[((x) >> 44) & 0xf] << 24 | sqr_table[((x) >> 40) & 0xf] << 16 | \
-  sqr_table[((x) >> 36) & 0xf] <<  8 | sqr_table[((x) >> 32) & 0xf]
-
-const uint64_t sqr_table[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
-                                0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55};
-
 void poly64_mul(const uint64_t a, const uint64_t b, uint64_t *c1, uint64_t *c0);
 
 void GF_to_bytes(const GF in, uint8_t* out)
@@ -211,15 +194,36 @@
   c[0] ^= (t[0] << 1);
 }
 
+static void square64(uint64_t *z0,uint64_t *z1,uint64_t x) // spreads the 64 bits of x over 128 bits: input bit i ends up at bit 2*i of the pair (*z1:*z0), odd bits are zero
+{ // *z0 gets the spread low 32 bits of x, *z1 the spread high 32 bits; only shifts, ORs and ANDs with fixed masks are used (no table lookup, no branch)
+  const uint64_t C0 = 0x5555555555555555; // keeps every even bit position
+  const uint64_t C1 = 0x3333333333333333; // keeps the low 2 bits of every 4-bit group
+  const uint64_t C2 = 0x0f0f0f0f0f0f0f0f; // keeps the low 4 bits of every byte
+  const uint64_t C3 = 0x00ff00ff00ff00ff; // keeps the low byte of every 16-bit group
+  const uint64_t C4 = 0x0000ffff0000ffff; // keeps the low 16 bits of every 32-bit group
+  const uint64_t C5 = 0x00000000ffffffff; // keeps the low 32 bits
+  uint64_t y = x>>32; x &= C5; // y = high 32 bits of the input, x = low 32 bits; both halves go through the same cascade
+  x = (x | (x << 16)) & C4; // each 16-bit chunk now sits in the low half of a 32-bit group
+  y = (y | (y << 16)) & C4;
+  x = (x | (x << 8)) & C3; // each byte now sits in the low half of a 16-bit group
+  y = (y | (y << 8)) & C3;
+  x = (x | (x << 4)) & C2; // each nibble now sits in the low half of a byte
+  y = (y | (y << 4)) & C2;
+  x = (x | (x << 2)) & C1; // each bit pair now sits in the low half of a nibble
+  y = (y | (y << 2)) & C1;
+  x = (x | (x << 1)) & C0; // each input bit now sits alone at an even bit position
+  y = (y | (y << 1)) & C0;
+  *z0 = x; // low 64 bits of the result
+  *z1 = y; // high 64 bits of the result
+}
+
 void GF_sqr(const GF a, GF c)
 {
   uint64_t t = 0;
   uint64_t temp[4] = {0,};
 
-  temp[0] = SQR_LOW(a[0]);
-  temp[1] = SQR_HIGH(a[0]);
-  temp[2] = SQR_LOW(a[1]);
-  temp[3] = SQR_HIGH(a[1]);
+  square64(&temp[0],&temp[1],a[0]);
+  square64(&temp[2],&temp[3],a[1]);
 
   t = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
 
Only in 240402_AIMer.patched/Optimized_Implementation/aimer128s: field.c.orig
diff -ruw 240402_AIMer/Optimized_Implementation/aimer192f/field.c 240402_AIMer.patched/Optimized_Implementation/aimer192f/field.c
--- 240402_AIMer/Optimized_Implementation/aimer192f/field.c	2024-05-02 05:21:30.826316493 -0500
+++ 240402_AIMer.patched/Optimized_Implementation/aimer192f/field.c	2024-05-02 05:22:48.968165936 -0500
@@ -3,23 +3,6 @@
 #include "field.h"
 #include "portable_endian.h"
 
-// square the lower 32-bit of the input
-#define SQR_LOW(x) \
-  sqr_table[((x) >> 28) & 0xf] << 56 | sqr_table[((x) >> 24) & 0xf] << 48 | \
-  sqr_table[((x) >> 20) & 0xf] << 40 | sqr_table[((x) >> 16) & 0xf] << 32 | \
-  sqr_table[((x) >> 12) & 0xf] << 24 | sqr_table[((x) >>  8) & 0xf] << 16 | \
-  sqr_table[((x) >>  4) & 0xf] <<  8 | sqr_table[((x)      ) & 0xf]
-
-// square the upper 32-bit of the input
-#define SQR_HIGH(x) \
-  sqr_table[((x) >> 60)      ] << 56 | sqr_table[((x) >> 56) & 0xf] << 48 | \
-  sqr_table[((x) >> 52) & 0xf] << 40 | sqr_table[((x) >> 48) & 0xf] << 32 | \
-  sqr_table[((x) >> 44) & 0xf] << 24 | sqr_table[((x) >> 40) & 0xf] << 16 | \
-  sqr_table[((x) >> 36) & 0xf] <<  8 | sqr_table[((x) >> 32) & 0xf]
-
-const uint64_t sqr_table[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
-                                0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55};
-
 void poly64_mul(const uint64_t a, const uint64_t b, uint64_t *c1, uint64_t *c0);
 
 void GF_to_bytes(const GF in, uint8_t* out)
@@ -258,17 +241,37 @@
   c[0] ^= (t[0] << 1);
 }
 
+static void square64(uint64_t *z0,uint64_t *z1,uint64_t x) // spreads the 64 bits of x over 128 bits: input bit i ends up at bit 2*i of the pair (*z1:*z0), odd bits are zero
+{ // *z0 gets the spread low 32 bits of x, *z1 the spread high 32 bits; only shifts, ORs and ANDs with fixed masks are used (no table lookup, no branch)
+  const uint64_t C0 = 0x5555555555555555; // keeps every even bit position
+  const uint64_t C1 = 0x3333333333333333; // keeps the low 2 bits of every 4-bit group
+  const uint64_t C2 = 0x0f0f0f0f0f0f0f0f; // keeps the low 4 bits of every byte
+  const uint64_t C3 = 0x00ff00ff00ff00ff; // keeps the low byte of every 16-bit group
+  const uint64_t C4 = 0x0000ffff0000ffff; // keeps the low 16 bits of every 32-bit group
+  const uint64_t C5 = 0x00000000ffffffff; // keeps the low 32 bits
+  uint64_t y = x>>32; x &= C5; // y = high 32 bits of the input, x = low 32 bits; both halves go through the same cascade
+  x = (x | (x << 16)) & C4; // each 16-bit chunk now sits in the low half of a 32-bit group
+  y = (y | (y << 16)) & C4;
+  x = (x | (x << 8)) & C3; // each byte now sits in the low half of a 16-bit group
+  y = (y | (y << 8)) & C3;
+  x = (x | (x << 4)) & C2; // each nibble now sits in the low half of a byte
+  y = (y | (y << 4)) & C2;
+  x = (x | (x << 2)) & C1; // each bit pair now sits in the low half of a nibble
+  y = (y | (y << 2)) & C1;
+  x = (x | (x << 1)) & C0; // each input bit now sits alone at an even bit position
+  y = (y | (y << 1)) & C0;
+  *z0 = x; // low 64 bits of the result
+  *z1 = y; // high 64 bits of the result
+}
+
 void GF_sqr(const GF a, GF c)
 {
   uint64_t t = 0;
   uint64_t temp[6] = {0,};
 
-  temp[0] = SQR_LOW(a[0]);
-  temp[1] = SQR_HIGH(a[0]);
-  temp[2] = SQR_LOW(a[1]);
-  temp[3] = SQR_HIGH(a[1]);
-  temp[4] = SQR_LOW(a[2]);
-  temp[5] = SQR_HIGH(a[2]);
+  square64(&temp[0],&temp[1],a[0]);
+  square64(&temp[2],&temp[3],a[1]);
+  square64(&temp[4],&temp[5],a[2]);
 
   t = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
 
Only in 240402_AIMer.patched/Optimized_Implementation/aimer192f: field.c.orig
diff -ruw 240402_AIMer/Optimized_Implementation/aimer192s/field.c 240402_AIMer.patched/Optimized_Implementation/aimer192s/field.c
--- 240402_AIMer/Optimized_Implementation/aimer192s/field.c	2024-05-02 05:21:30.826316493 -0500
+++ 240402_AIMer.patched/Optimized_Implementation/aimer192s/field.c	2024-05-02 05:22:48.968165936 -0500
@@ -3,23 +3,6 @@
 #include "field.h"
 #include "portable_endian.h"
 
-// square the lower 32-bit of the input
-#define SQR_LOW(x) \
-  sqr_table[((x) >> 28) & 0xf] << 56 | sqr_table[((x) >> 24) & 0xf] << 48 | \
-  sqr_table[((x) >> 20) & 0xf] << 40 | sqr_table[((x) >> 16) & 0xf] << 32 | \
-  sqr_table[((x) >> 12) & 0xf] << 24 | sqr_table[((x) >>  8) & 0xf] << 16 | \
-  sqr_table[((x) >>  4) & 0xf] <<  8 | sqr_table[((x)      ) & 0xf]
-
-// square the upper 32-bit of the input
-#define SQR_HIGH(x) \
-  sqr_table[((x) >> 60)      ] << 56 | sqr_table[((x) >> 56) & 0xf] << 48 | \
-  sqr_table[((x) >> 52) & 0xf] << 40 | sqr_table[((x) >> 48) & 0xf] << 32 | \
-  sqr_table[((x) >> 44) & 0xf] << 24 | sqr_table[((x) >> 40) & 0xf] << 16 | \
-  sqr_table[((x) >> 36) & 0xf] <<  8 | sqr_table[((x) >> 32) & 0xf]
-
-const uint64_t sqr_table[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
-                                0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55};
-
 void poly64_mul(const uint64_t a, const uint64_t b, uint64_t *c1, uint64_t *c0);
 
 void GF_to_bytes(const GF in, uint8_t* out)
@@ -258,17 +241,37 @@
   c[0] ^= (t[0] << 1);
 }
 
+static void square64(uint64_t *z0,uint64_t *z1,uint64_t x) // spreads the 64 bits of x over 128 bits: input bit i ends up at bit 2*i of the pair (*z1:*z0), odd bits are zero
+{ // *z0 gets the spread low 32 bits of x, *z1 the spread high 32 bits; only shifts, ORs and ANDs with fixed masks are used (no table lookup, no branch)
+  const uint64_t C0 = 0x5555555555555555; // keeps every even bit position
+  const uint64_t C1 = 0x3333333333333333; // keeps the low 2 bits of every 4-bit group
+  const uint64_t C2 = 0x0f0f0f0f0f0f0f0f; // keeps the low 4 bits of every byte
+  const uint64_t C3 = 0x00ff00ff00ff00ff; // keeps the low byte of every 16-bit group
+  const uint64_t C4 = 0x0000ffff0000ffff; // keeps the low 16 bits of every 32-bit group
+  const uint64_t C5 = 0x00000000ffffffff; // keeps the low 32 bits
+  uint64_t y = x>>32; x &= C5; // y = high 32 bits of the input, x = low 32 bits; both halves go through the same cascade
+  x = (x | (x << 16)) & C4; // each 16-bit chunk now sits in the low half of a 32-bit group
+  y = (y | (y << 16)) & C4;
+  x = (x | (x << 8)) & C3; // each byte now sits in the low half of a 16-bit group
+  y = (y | (y << 8)) & C3;
+  x = (x | (x << 4)) & C2; // each nibble now sits in the low half of a byte
+  y = (y | (y << 4)) & C2;
+  x = (x | (x << 2)) & C1; // each bit pair now sits in the low half of a nibble
+  y = (y | (y << 2)) & C1;
+  x = (x | (x << 1)) & C0; // each input bit now sits alone at an even bit position
+  y = (y | (y << 1)) & C0;
+  *z0 = x; // low 64 bits of the result
+  *z1 = y; // high 64 bits of the result
+}
+
 void GF_sqr(const GF a, GF c)
 {
   uint64_t t = 0;
   uint64_t temp[6] = {0,};
 
-  temp[0] = SQR_LOW(a[0]);
-  temp[1] = SQR_HIGH(a[0]);
-  temp[2] = SQR_LOW(a[1]);
-  temp[3] = SQR_HIGH(a[1]);
-  temp[4] = SQR_LOW(a[2]);
-  temp[5] = SQR_HIGH(a[2]);
+  square64(&temp[0],&temp[1],a[0]);
+  square64(&temp[2],&temp[3],a[1]);
+  square64(&temp[4],&temp[5],a[2]);
 
   t = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
 
Only in 240402_AIMer.patched/Optimized_Implementation/aimer192s: field.c.orig
diff -ruw 240402_AIMer/Optimized_Implementation/aimer256f/field.c 240402_AIMer.patched/Optimized_Implementation/aimer256f/field.c
--- 240402_AIMer/Optimized_Implementation/aimer256f/field.c	2024-05-02 05:21:30.826316493 -0500
+++ 240402_AIMer.patched/Optimized_Implementation/aimer256f/field.c	2024-05-02 05:22:48.968165936 -0500
@@ -3,23 +3,6 @@
 #include "field.h"
 #include "portable_endian.h"
 
-// square the lower 32-bit of the input
-#define SQR_LOW(x) \
-  sqr_table[((x) >> 28) & 0xf] << 56 | sqr_table[((x) >> 24) & 0xf] << 48 | \
-  sqr_table[((x) >> 20) & 0xf] << 40 | sqr_table[((x) >> 16) & 0xf] << 32 | \
-  sqr_table[((x) >> 12) & 0xf] << 24 | sqr_table[((x) >>  8) & 0xf] << 16 | \
-  sqr_table[((x) >>  4) & 0xf] <<  8 | sqr_table[((x)      ) & 0xf]
-
-// square the upper 32-bit of the input
-#define SQR_HIGH(x) \
-  sqr_table[((x) >> 60)      ] << 56 | sqr_table[((x) >> 56) & 0xf] << 48 | \
-  sqr_table[((x) >> 52) & 0xf] << 40 | sqr_table[((x) >> 48) & 0xf] << 32 | \
-  sqr_table[((x) >> 44) & 0xf] << 24 | sqr_table[((x) >> 40) & 0xf] << 16 | \
-  sqr_table[((x) >> 36) & 0xf] <<  8 | sqr_table[((x) >> 32) & 0xf]
-
-const uint64_t sqr_table[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
-                                0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55};
-
 void poly64_mul(const uint64_t a, const uint64_t b, uint64_t *c1, uint64_t *c0);
 
 void GF_to_bytes(const GF in, uint8_t* out)
@@ -319,20 +302,38 @@
   c[0] ^= (t[0] <<  2);
 }
 
+static void square64(uint64_t *z0,uint64_t *z1,uint64_t x) // spreads the 64 bits of x over 128 bits: input bit i ends up at bit 2*i of the pair (*z1:*z0), odd bits are zero
+{ // *z0 gets the spread low 32 bits of x, *z1 the spread high 32 bits; only shifts, ORs and ANDs with fixed masks are used (no table lookup, no branch)
+  const uint64_t C0 = 0x5555555555555555; // keeps every even bit position
+  const uint64_t C1 = 0x3333333333333333; // keeps the low 2 bits of every 4-bit group
+  const uint64_t C2 = 0x0f0f0f0f0f0f0f0f; // keeps the low 4 bits of every byte
+  const uint64_t C3 = 0x00ff00ff00ff00ff; // keeps the low byte of every 16-bit group
+  const uint64_t C4 = 0x0000ffff0000ffff; // keeps the low 16 bits of every 32-bit group
+  const uint64_t C5 = 0x00000000ffffffff; // keeps the low 32 bits
+  uint64_t y = x>>32; x &= C5; // y = high 32 bits of the input, x = low 32 bits; both halves go through the same cascade
+  x = (x | (x << 16)) & C4; // each 16-bit chunk now sits in the low half of a 32-bit group
+  y = (y | (y << 16)) & C4;
+  x = (x | (x << 8)) & C3; // each byte now sits in the low half of a 16-bit group
+  y = (y | (y << 8)) & C3;
+  x = (x | (x << 4)) & C2; // each nibble now sits in the low half of a byte
+  y = (y | (y << 4)) & C2;
+  x = (x | (x << 2)) & C1; // each bit pair now sits in the low half of a nibble
+  y = (y | (y << 2)) & C1;
+  x = (x | (x << 1)) & C0; // each input bit now sits alone at an even bit position
+  y = (y | (y << 1)) & C0;
+  *z0 = x; // low 64 bits of the result
+  *z1 = y; // high 64 bits of the result
+}
+
 void GF_sqr(const GF a, GF c)
 {
   uint64_t t = 0;
   uint64_t temp[8] = {0,};
 
-  temp[0] = SQR_LOW(a[0]);
-  temp[1] = SQR_HIGH(a[0]);
-  temp[2] = SQR_LOW(a[1]);
-  temp[3] = SQR_HIGH(a[1]);
-
-  temp[4] = SQR_LOW(a[2]);
-  temp[5] = SQR_HIGH(a[2]);
-  temp[6] = SQR_LOW(a[3]);
-  temp[7] = SQR_HIGH(a[3]);
+  square64(&temp[0],&temp[1],a[0]);
+  square64(&temp[2],&temp[3],a[1]);
+  square64(&temp[4],&temp[5],a[2]);
+  square64(&temp[6],&temp[7],a[3]);
 
   t = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
 
Only in 240402_AIMer.patched/Optimized_Implementation/aimer256f: field.c.orig
diff -ruw 240402_AIMer/Optimized_Implementation/aimer256s/field.c 240402_AIMer.patched/Optimized_Implementation/aimer256s/field.c
--- 240402_AIMer/Optimized_Implementation/aimer256s/field.c	2024-05-02 05:21:30.826316493 -0500
+++ 240402_AIMer.patched/Optimized_Implementation/aimer256s/field.c	2024-05-02 05:22:48.968165936 -0500
@@ -3,23 +3,6 @@
 #include "field.h"
 #include "portable_endian.h"
 
-// square the lower 32-bit of the input
-#define SQR_LOW(x) \
-  sqr_table[((x) >> 28) & 0xf] << 56 | sqr_table[((x) >> 24) & 0xf] << 48 | \
-  sqr_table[((x) >> 20) & 0xf] << 40 | sqr_table[((x) >> 16) & 0xf] << 32 | \
-  sqr_table[((x) >> 12) & 0xf] << 24 | sqr_table[((x) >>  8) & 0xf] << 16 | \
-  sqr_table[((x) >>  4) & 0xf] <<  8 | sqr_table[((x)      ) & 0xf]
-
-// square the upper 32-bit of the input
-#define SQR_HIGH(x) \
-  sqr_table[((x) >> 60)      ] << 56 | sqr_table[((x) >> 56) & 0xf] << 48 | \
-  sqr_table[((x) >> 52) & 0xf] << 40 | sqr_table[((x) >> 48) & 0xf] << 32 | \
-  sqr_table[((x) >> 44) & 0xf] << 24 | sqr_table[((x) >> 40) & 0xf] << 16 | \
-  sqr_table[((x) >> 36) & 0xf] <<  8 | sqr_table[((x) >> 32) & 0xf]
-
-const uint64_t sqr_table[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
-                                0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55};
-
 void poly64_mul(const uint64_t a, const uint64_t b, uint64_t *c1, uint64_t *c0);
 
 void GF_to_bytes(const GF in, uint8_t* out)
@@ -319,20 +302,38 @@
   c[0] ^= (t[0] <<  2);
 }
 
+static void square64(uint64_t *z0,uint64_t *z1,uint64_t x) // spreads the 64 bits of x over 128 bits: input bit i ends up at bit 2*i of the pair (*z1:*z0), odd bits are zero
+{ // *z0 gets the spread low 32 bits of x, *z1 the spread high 32 bits; only shifts, ORs and ANDs with fixed masks are used (no table lookup, no branch)
+  const uint64_t C0 = 0x5555555555555555; // keeps every even bit position
+  const uint64_t C1 = 0x3333333333333333; // keeps the low 2 bits of every 4-bit group
+  const uint64_t C2 = 0x0f0f0f0f0f0f0f0f; // keeps the low 4 bits of every byte
+  const uint64_t C3 = 0x00ff00ff00ff00ff; // keeps the low byte of every 16-bit group
+  const uint64_t C4 = 0x0000ffff0000ffff; // keeps the low 16 bits of every 32-bit group
+  const uint64_t C5 = 0x00000000ffffffff; // keeps the low 32 bits
+  uint64_t y = x>>32; x &= C5; // y = high 32 bits of the input, x = low 32 bits; both halves go through the same cascade
+  x = (x | (x << 16)) & C4; // each 16-bit chunk now sits in the low half of a 32-bit group
+  y = (y | (y << 16)) & C4;
+  x = (x | (x << 8)) & C3; // each byte now sits in the low half of a 16-bit group
+  y = (y | (y << 8)) & C3;
+  x = (x | (x << 4)) & C2; // each nibble now sits in the low half of a byte
+  y = (y | (y << 4)) & C2;
+  x = (x | (x << 2)) & C1; // each bit pair now sits in the low half of a nibble
+  y = (y | (y << 2)) & C1;
+  x = (x | (x << 1)) & C0; // each input bit now sits alone at an even bit position
+  y = (y | (y << 1)) & C0;
+  *z0 = x; // low 64 bits of the result
+  *z1 = y; // high 64 bits of the result
+}
+
 void GF_sqr(const GF a, GF c)
 {
   uint64_t t = 0;
   uint64_t temp[8] = {0,};
 
-  temp[0] = SQR_LOW(a[0]);
-  temp[1] = SQR_HIGH(a[0]);
-  temp[2] = SQR_LOW(a[1]);
-  temp[3] = SQR_HIGH(a[1]);
-
-  temp[4] = SQR_LOW(a[2]);
-  temp[5] = SQR_HIGH(a[2]);
-  temp[6] = SQR_LOW(a[3]);
-  temp[7] = SQR_HIGH(a[3]);
+  square64(&temp[0],&temp[1],a[0]);
+  square64(&temp[2],&temp[3],a[1]);
+  square64(&temp[4],&temp[5],a[2]);
+  square64(&temp[6],&temp[7],a[3]);
 
   t = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
 
Only in 240402_AIMer.patched/Optimized_Implementation/aimer256s: field.c.orig
diff -ruw 240402_AIMer/Reference_Implementation/aimer128f/field.c 240402_AIMer.patched/Reference_Implementation/aimer128f/field.c
--- 240402_AIMer/Reference_Implementation/aimer128f/field.c	2024-05-02 05:21:30.826316493 -0500
+++ 240402_AIMer.patched/Reference_Implementation/aimer128f/field.c	2024-05-02 05:22:48.968165936 -0500
@@ -3,23 +3,6 @@
 #include "field.h"
 #include "portable_endian.h"
 
-// square the lower 32-bit of the input
-#define SQR_LOW(x) \
-  sqr_table[((x) >> 28) & 0xf] << 56 | sqr_table[((x) >> 24) & 0xf] << 48 | \
-  sqr_table[((x) >> 20) & 0xf] << 40 | sqr_table[((x) >> 16) & 0xf] << 32 | \
-  sqr_table[((x) >> 12) & 0xf] << 24 | sqr_table[((x) >>  8) & 0xf] << 16 | \
-  sqr_table[((x) >>  4) & 0xf] <<  8 | sqr_table[((x)      ) & 0xf]
-
-// square the upper 32-bit of the input
-#define SQR_HIGH(x) \
-  sqr_table[((x) >> 60)      ] << 56 | sqr_table[((x) >> 56) & 0xf] << 48 | \
-  sqr_table[((x) >> 52) & 0xf] << 40 | sqr_table[((x) >> 48) & 0xf] << 32 | \
-  sqr_table[((x) >> 44) & 0xf] << 24 | sqr_table[((x) >> 40) & 0xf] << 16 | \
-  sqr_table[((x) >> 36) & 0xf] <<  8 | sqr_table[((x) >> 32) & 0xf]
-
-const uint64_t sqr_table[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
-                                0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55};
-
 void poly64_mul(const uint64_t a, const uint64_t b, uint64_t *c1, uint64_t *c0);
 
 unsigned GF_getbit(const GF a, unsigned i)
@@ -225,15 +208,36 @@
   c[0] ^= (t[0] << 1);
 }
 
+static void square64(uint64_t *z0,uint64_t *z1,uint64_t x) // spreads the 64 bits of x over 128 bits: input bit i ends up at bit 2*i of the pair (*z1:*z0), odd bits are zero
+{ // *z0 gets the spread low 32 bits of x, *z1 the spread high 32 bits; only shifts, ORs and ANDs with fixed masks are used (no table lookup, no branch)
+  const uint64_t C0 = 0x5555555555555555; // keeps every even bit position
+  const uint64_t C1 = 0x3333333333333333; // keeps the low 2 bits of every 4-bit group
+  const uint64_t C2 = 0x0f0f0f0f0f0f0f0f; // keeps the low 4 bits of every byte
+  const uint64_t C3 = 0x00ff00ff00ff00ff; // keeps the low byte of every 16-bit group
+  const uint64_t C4 = 0x0000ffff0000ffff; // keeps the low 16 bits of every 32-bit group
+  const uint64_t C5 = 0x00000000ffffffff; // keeps the low 32 bits
+  uint64_t y = x>>32; x &= C5; // y = high 32 bits of the input, x = low 32 bits; both halves go through the same cascade
+  x = (x | (x << 16)) & C4; // each 16-bit chunk now sits in the low half of a 32-bit group
+  y = (y | (y << 16)) & C4;
+  x = (x | (x << 8)) & C3; // each byte now sits in the low half of a 16-bit group
+  y = (y | (y << 8)) & C3;
+  x = (x | (x << 4)) & C2; // each nibble now sits in the low half of a byte
+  y = (y | (y << 4)) & C2;
+  x = (x | (x << 2)) & C1; // each bit pair now sits in the low half of a nibble
+  y = (y | (y << 2)) & C1;
+  x = (x | (x << 1)) & C0; // each input bit now sits alone at an even bit position
+  y = (y | (y << 1)) & C0;
+  *z0 = x; // low 64 bits of the result
+  *z1 = y; // high 64 bits of the result
+}
+
 void GF_sqr(const GF a, GF c)
 {
   uint64_t t = 0;
   uint64_t temp[4] = {0,};
 
-  temp[0] = SQR_LOW(a[0]);
-  temp[1] = SQR_HIGH(a[0]);
-  temp[2] = SQR_LOW(a[1]);
-  temp[3] = SQR_HIGH(a[1]);
+  square64(&temp[0],&temp[1],a[0]);
+  square64(&temp[2],&temp[3],a[1]);
 
   t = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
 
Only in 240402_AIMer.patched/Reference_Implementation/aimer128f: field.c.orig
diff -ruw 240402_AIMer/Reference_Implementation/aimer128s/field.c 240402_AIMer.patched/Reference_Implementation/aimer128s/field.c
--- 240402_AIMer/Reference_Implementation/aimer128s/field.c	2024-05-02 05:21:30.826316493 -0500
+++ 240402_AIMer.patched/Reference_Implementation/aimer128s/field.c	2024-05-02 05:22:48.968165936 -0500
@@ -3,23 +3,6 @@
 #include "field.h"
 #include "portable_endian.h"
 
-// square the lower 32-bit of the input
-#define SQR_LOW(x) \
-  sqr_table[((x) >> 28) & 0xf] << 56 | sqr_table[((x) >> 24) & 0xf] << 48 | \
-  sqr_table[((x) >> 20) & 0xf] << 40 | sqr_table[((x) >> 16) & 0xf] << 32 | \
-  sqr_table[((x) >> 12) & 0xf] << 24 | sqr_table[((x) >>  8) & 0xf] << 16 | \
-  sqr_table[((x) >>  4) & 0xf] <<  8 | sqr_table[((x)      ) & 0xf]
-
-// square the upper 32-bit of the input
-#define SQR_HIGH(x) \
-  sqr_table[((x) >> 60)      ] << 56 | sqr_table[((x) >> 56) & 0xf] << 48 | \
-  sqr_table[((x) >> 52) & 0xf] << 40 | sqr_table[((x) >> 48) & 0xf] << 32 | \
-  sqr_table[((x) >> 44) & 0xf] << 24 | sqr_table[((x) >> 40) & 0xf] << 16 | \
-  sqr_table[((x) >> 36) & 0xf] <<  8 | sqr_table[((x) >> 32) & 0xf]
-
-const uint64_t sqr_table[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
-                                0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55};
-
 void poly64_mul(const uint64_t a, const uint64_t b, uint64_t *c1, uint64_t *c0);
 
 unsigned GF_getbit(const GF a, unsigned i)
@@ -225,15 +208,36 @@
   c[0] ^= (t[0] << 1);
 }
 
+static void square64(uint64_t *z0,uint64_t *z1,uint64_t x) // spreads the 64 bits of x over 128 bits: input bit i ends up at bit 2*i of the pair (*z1:*z0), odd bits are zero
+{ // *z0 gets the spread low 32 bits of x, *z1 the spread high 32 bits; only shifts, ORs and ANDs with fixed masks are used (no table lookup, no branch)
+  const uint64_t C0 = 0x5555555555555555; // keeps every even bit position
+  const uint64_t C1 = 0x3333333333333333; // keeps the low 2 bits of every 4-bit group
+  const uint64_t C2 = 0x0f0f0f0f0f0f0f0f; // keeps the low 4 bits of every byte
+  const uint64_t C3 = 0x00ff00ff00ff00ff; // keeps the low byte of every 16-bit group
+  const uint64_t C4 = 0x0000ffff0000ffff; // keeps the low 16 bits of every 32-bit group
+  const uint64_t C5 = 0x00000000ffffffff; // keeps the low 32 bits
+  uint64_t y = x>>32; x &= C5; // y = high 32 bits of the input, x = low 32 bits; both halves go through the same cascade
+  x = (x | (x << 16)) & C4; // each 16-bit chunk now sits in the low half of a 32-bit group
+  y = (y | (y << 16)) & C4;
+  x = (x | (x << 8)) & C3; // each byte now sits in the low half of a 16-bit group
+  y = (y | (y << 8)) & C3;
+  x = (x | (x << 4)) & C2; // each nibble now sits in the low half of a byte
+  y = (y | (y << 4)) & C2;
+  x = (x | (x << 2)) & C1; // each bit pair now sits in the low half of a nibble
+  y = (y | (y << 2)) & C1;
+  x = (x | (x << 1)) & C0; // each input bit now sits alone at an even bit position
+  y = (y | (y << 1)) & C0;
+  *z0 = x; // low 64 bits of the result
+  *z1 = y; // high 64 bits of the result
+}
+
 void GF_sqr(const GF a, GF c)
 {
   uint64_t t = 0;
   uint64_t temp[4] = {0,};
 
-  temp[0] = SQR_LOW(a[0]);
-  temp[1] = SQR_HIGH(a[0]);
-  temp[2] = SQR_LOW(a[1]);
-  temp[3] = SQR_HIGH(a[1]);
+  square64(&temp[0],&temp[1],a[0]);
+  square64(&temp[2],&temp[3],a[1]);
 
   t = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
 
Only in 240402_AIMer.patched/Reference_Implementation/aimer128s: field.c.orig
diff -ruw 240402_AIMer/Reference_Implementation/aimer192f/field.c 240402_AIMer.patched/Reference_Implementation/aimer192f/field.c
--- 240402_AIMer/Reference_Implementation/aimer192f/field.c	2024-05-02 05:21:30.826316493 -0500
+++ 240402_AIMer.patched/Reference_Implementation/aimer192f/field.c	2024-05-02 05:22:48.968165936 -0500
@@ -3,23 +3,6 @@
 #include "field.h"
 #include "portable_endian.h"
 
-// square the lower 32-bit of the input
-#define SQR_LOW(x) \
-  sqr_table[((x) >> 28) & 0xf] << 56 | sqr_table[((x) >> 24) & 0xf] << 48 | \
-  sqr_table[((x) >> 20) & 0xf] << 40 | sqr_table[((x) >> 16) & 0xf] << 32 | \
-  sqr_table[((x) >> 12) & 0xf] << 24 | sqr_table[((x) >>  8) & 0xf] << 16 | \
-  sqr_table[((x) >>  4) & 0xf] <<  8 | sqr_table[((x)      ) & 0xf]
-
-// square the upper 32-bit of the input
-#define SQR_HIGH(x) \
-  sqr_table[((x) >> 60)      ] << 56 | sqr_table[((x) >> 56) & 0xf] << 48 | \
-  sqr_table[((x) >> 52) & 0xf] << 40 | sqr_table[((x) >> 48) & 0xf] << 32 | \
-  sqr_table[((x) >> 44) & 0xf] << 24 | sqr_table[((x) >> 40) & 0xf] << 16 | \
-  sqr_table[((x) >> 36) & 0xf] <<  8 | sqr_table[((x) >> 32) & 0xf]
-
-const uint64_t sqr_table[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
-                                0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55};
-
 void poly64_mul(const uint64_t a, const uint64_t b, uint64_t *c1, uint64_t *c0);
 
 unsigned GF_getbit(const GF a, unsigned i)
@@ -269,17 +252,37 @@
   c[0] ^= (t[0] << 1);
 }
 
+static void square64(uint64_t *z0,uint64_t *z1,uint64_t x) // spreads the 64 bits of x over 128 bits: input bit i ends up at bit 2*i of the pair (*z1:*z0), odd bits are zero
+{ // *z0 gets the spread low 32 bits of x, *z1 the spread high 32 bits; only shifts, ORs and ANDs with fixed masks are used (no table lookup, no branch)
+  const uint64_t C0 = 0x5555555555555555; // keeps every even bit position
+  const uint64_t C1 = 0x3333333333333333; // keeps the low 2 bits of every 4-bit group
+  const uint64_t C2 = 0x0f0f0f0f0f0f0f0f; // keeps the low 4 bits of every byte
+  const uint64_t C3 = 0x00ff00ff00ff00ff; // keeps the low byte of every 16-bit group
+  const uint64_t C4 = 0x0000ffff0000ffff; // keeps the low 16 bits of every 32-bit group
+  const uint64_t C5 = 0x00000000ffffffff; // keeps the low 32 bits
+  uint64_t y = x>>32; x &= C5; // y = high 32 bits of the input, x = low 32 bits; both halves go through the same cascade
+  x = (x | (x << 16)) & C4; // each 16-bit chunk now sits in the low half of a 32-bit group
+  y = (y | (y << 16)) & C4;
+  x = (x | (x << 8)) & C3; // each byte now sits in the low half of a 16-bit group
+  y = (y | (y << 8)) & C3;
+  x = (x | (x << 4)) & C2; // each nibble now sits in the low half of a byte
+  y = (y | (y << 4)) & C2;
+  x = (x | (x << 2)) & C1; // each bit pair now sits in the low half of a nibble
+  y = (y | (y << 2)) & C1;
+  x = (x | (x << 1)) & C0; // each input bit now sits alone at an even bit position
+  y = (y | (y << 1)) & C0;
+  *z0 = x; // low 64 bits of the result
+  *z1 = y; // high 64 bits of the result
+}
+
 void GF_sqr(const GF a, GF c)
 {
   uint64_t t = 0;
   uint64_t temp[6] = {0,};
 
-  temp[0] = SQR_LOW(a[0]);
-  temp[1] = SQR_HIGH(a[0]);
-  temp[2] = SQR_LOW(a[1]);
-  temp[3] = SQR_HIGH(a[1]);
-  temp[4] = SQR_LOW(a[2]);
-  temp[5] = SQR_HIGH(a[2]);
+  square64(&temp[0],&temp[1],a[0]);
+  square64(&temp[2],&temp[3],a[1]);
+  square64(&temp[4],&temp[5],a[2]);
 
   t = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
 
Only in 240402_AIMer.patched/Reference_Implementation/aimer192f: field.c.orig
diff -ruw 240402_AIMer/Reference_Implementation/aimer192s/field.c 240402_AIMer.patched/Reference_Implementation/aimer192s/field.c
--- 240402_AIMer/Reference_Implementation/aimer192s/field.c	2024-05-02 05:21:30.826316493 -0500
+++ 240402_AIMer.patched/Reference_Implementation/aimer192s/field.c	2024-05-02 05:22:48.968165936 -0500
@@ -3,23 +3,6 @@
 #include "field.h"
 #include "portable_endian.h"
 
-// square the lower 32-bit of the input
-#define SQR_LOW(x) \
-  sqr_table[((x) >> 28) & 0xf] << 56 | sqr_table[((x) >> 24) & 0xf] << 48 | \
-  sqr_table[((x) >> 20) & 0xf] << 40 | sqr_table[((x) >> 16) & 0xf] << 32 | \
-  sqr_table[((x) >> 12) & 0xf] << 24 | sqr_table[((x) >>  8) & 0xf] << 16 | \
-  sqr_table[((x) >>  4) & 0xf] <<  8 | sqr_table[((x)      ) & 0xf]
-
-// square the upper 32-bit of the input
-#define SQR_HIGH(x) \
-  sqr_table[((x) >> 60)      ] << 56 | sqr_table[((x) >> 56) & 0xf] << 48 | \
-  sqr_table[((x) >> 52) & 0xf] << 40 | sqr_table[((x) >> 48) & 0xf] << 32 | \
-  sqr_table[((x) >> 44) & 0xf] << 24 | sqr_table[((x) >> 40) & 0xf] << 16 | \
-  sqr_table[((x) >> 36) & 0xf] <<  8 | sqr_table[((x) >> 32) & 0xf]
-
-const uint64_t sqr_table[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
-                                0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55};
-
 void poly64_mul(const uint64_t a, const uint64_t b, uint64_t *c1, uint64_t *c0);
 
 unsigned GF_getbit(const GF a, unsigned i)
@@ -269,17 +252,37 @@
   c[0] ^= (t[0] << 1);
 }
 
+static void square64(uint64_t *z0,uint64_t *z1,uint64_t x) // spreads the 64 bits of x over 128 bits: input bit i ends up at bit 2*i of the pair (*z1:*z0), odd bits are zero
+{ // *z0 gets the spread low 32 bits of x, *z1 the spread high 32 bits; only shifts, ORs and ANDs with fixed masks are used (no table lookup, no branch)
+  const uint64_t C0 = 0x5555555555555555; // keeps every even bit position
+  const uint64_t C1 = 0x3333333333333333; // keeps the low 2 bits of every 4-bit group
+  const uint64_t C2 = 0x0f0f0f0f0f0f0f0f; // keeps the low 4 bits of every byte
+  const uint64_t C3 = 0x00ff00ff00ff00ff; // keeps the low byte of every 16-bit group
+  const uint64_t C4 = 0x0000ffff0000ffff; // keeps the low 16 bits of every 32-bit group
+  const uint64_t C5 = 0x00000000ffffffff; // keeps the low 32 bits
+  uint64_t y = x>>32; x &= C5; // y = high 32 bits of the input, x = low 32 bits; both halves go through the same cascade
+  x = (x | (x << 16)) & C4; // each 16-bit chunk now sits in the low half of a 32-bit group
+  y = (y | (y << 16)) & C4;
+  x = (x | (x << 8)) & C3; // each byte now sits in the low half of a 16-bit group
+  y = (y | (y << 8)) & C3;
+  x = (x | (x << 4)) & C2; // each nibble now sits in the low half of a byte
+  y = (y | (y << 4)) & C2;
+  x = (x | (x << 2)) & C1; // each bit pair now sits in the low half of a nibble
+  y = (y | (y << 2)) & C1;
+  x = (x | (x << 1)) & C0; // each input bit now sits alone at an even bit position
+  y = (y | (y << 1)) & C0;
+  *z0 = x; // low 64 bits of the result
+  *z1 = y; // high 64 bits of the result
+}
+
 void GF_sqr(const GF a, GF c)
 {
   uint64_t t = 0;
   uint64_t temp[6] = {0,};
 
-  temp[0] = SQR_LOW(a[0]);
-  temp[1] = SQR_HIGH(a[0]);
-  temp[2] = SQR_LOW(a[1]);
-  temp[3] = SQR_HIGH(a[1]);
-  temp[4] = SQR_LOW(a[2]);
-  temp[5] = SQR_HIGH(a[2]);
+  square64(&temp[0],&temp[1],a[0]);
+  square64(&temp[2],&temp[3],a[1]);
+  square64(&temp[4],&temp[5],a[2]);
 
   t = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
 
Only in 240402_AIMer.patched/Reference_Implementation/aimer192s: field.c.orig
diff -ruw 240402_AIMer/Reference_Implementation/aimer256f/field.c 240402_AIMer.patched/Reference_Implementation/aimer256f/field.c
--- 240402_AIMer/Reference_Implementation/aimer256f/field.c	2024-05-02 05:21:30.826316493 -0500
+++ 240402_AIMer.patched/Reference_Implementation/aimer256f/field.c	2024-05-02 05:22:48.968165936 -0500
@@ -3,23 +3,6 @@
 #include "field.h"
 #include "portable_endian.h"
 
-// square the lower 32-bit of the input
-#define SQR_LOW(x) \
-  sqr_table[((x) >> 28) & 0xf] << 56 | sqr_table[((x) >> 24) & 0xf] << 48 | \
-  sqr_table[((x) >> 20) & 0xf] << 40 | sqr_table[((x) >> 16) & 0xf] << 32 | \
-  sqr_table[((x) >> 12) & 0xf] << 24 | sqr_table[((x) >>  8) & 0xf] << 16 | \
-  sqr_table[((x) >>  4) & 0xf] <<  8 | sqr_table[((x)      ) & 0xf]
-
-// square the upper 32-bit of the input
-#define SQR_HIGH(x) \
-  sqr_table[((x) >> 60)      ] << 56 | sqr_table[((x) >> 56) & 0xf] << 48 | \
-  sqr_table[((x) >> 52) & 0xf] << 40 | sqr_table[((x) >> 48) & 0xf] << 32 | \
-  sqr_table[((x) >> 44) & 0xf] << 24 | sqr_table[((x) >> 40) & 0xf] << 16 | \
-  sqr_table[((x) >> 36) & 0xf] <<  8 | sqr_table[((x) >> 32) & 0xf]
-
-const uint64_t sqr_table[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
-                                0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55};
-
 void poly64_mul(const uint64_t a, const uint64_t b, uint64_t *c1, uint64_t *c0);
 
 unsigned GF_getbit(const GF a, unsigned i)
@@ -327,20 +310,38 @@
   c[0] ^= (t[0] <<  2);
 }
 
+static void square64(uint64_t *z0,uint64_t *z1,uint64_t x) // spreads the 64 bits of x over 128 bits: input bit i ends up at bit 2*i of the pair (*z1:*z0), odd bits are zero
+{ // *z0 gets the spread low 32 bits of x, *z1 the spread high 32 bits; only shifts, ORs and ANDs with fixed masks are used (no table lookup, no branch)
+  const uint64_t C0 = 0x5555555555555555; // keeps every even bit position
+  const uint64_t C1 = 0x3333333333333333; // keeps the low 2 bits of every 4-bit group
+  const uint64_t C2 = 0x0f0f0f0f0f0f0f0f; // keeps the low 4 bits of every byte
+  const uint64_t C3 = 0x00ff00ff00ff00ff; // keeps the low byte of every 16-bit group
+  const uint64_t C4 = 0x0000ffff0000ffff; // keeps the low 16 bits of every 32-bit group
+  const uint64_t C5 = 0x00000000ffffffff; // keeps the low 32 bits
+  uint64_t y = x>>32; x &= C5; // y = high 32 bits of the input, x = low 32 bits; both halves go through the same cascade
+  x = (x | (x << 16)) & C4; // each 16-bit chunk now sits in the low half of a 32-bit group
+  y = (y | (y << 16)) & C4;
+  x = (x | (x << 8)) & C3; // each byte now sits in the low half of a 16-bit group
+  y = (y | (y << 8)) & C3;
+  x = (x | (x << 4)) & C2; // each nibble now sits in the low half of a byte
+  y = (y | (y << 4)) & C2;
+  x = (x | (x << 2)) & C1; // each bit pair now sits in the low half of a nibble
+  y = (y | (y << 2)) & C1;
+  x = (x | (x << 1)) & C0; // each input bit now sits alone at an even bit position
+  y = (y | (y << 1)) & C0;
+  *z0 = x; // low 64 bits of the result
+  *z1 = y; // high 64 bits of the result
+}
+
 void GF_sqr(const GF a, GF c)
 {
   uint64_t t = 0;
   uint64_t temp[8] = {0,};
 
-  temp[0] = SQR_LOW(a[0]);
-  temp[1] = SQR_HIGH(a[0]);
-  temp[2] = SQR_LOW(a[1]);
-  temp[3] = SQR_HIGH(a[1]);
-
-  temp[4] = SQR_LOW(a[2]);
-  temp[5] = SQR_HIGH(a[2]);
-  temp[6] = SQR_LOW(a[3]);
-  temp[7] = SQR_HIGH(a[3]);
+  square64(&temp[0],&temp[1],a[0]);
+  square64(&temp[2],&temp[3],a[1]);
+  square64(&temp[4],&temp[5],a[2]);
+  square64(&temp[6],&temp[7],a[3]);
 
   t = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
 
Only in 240402_AIMer.patched/Reference_Implementation/aimer256f: field.c.orig
diff -ruw 240402_AIMer/Reference_Implementation/aimer256s/field.c 240402_AIMer.patched/Reference_Implementation/aimer256s/field.c
--- 240402_AIMer/Reference_Implementation/aimer256s/field.c	2024-05-02 05:21:30.826316493 -0500
+++ 240402_AIMer.patched/Reference_Implementation/aimer256s/field.c	2024-05-02 05:22:48.968165936 -0500
@@ -3,23 +3,6 @@
 #include "field.h"
 #include "portable_endian.h"
 
-// square the lower 32-bit of the input
-#define SQR_LOW(x) \
-  sqr_table[((x) >> 28) & 0xf] << 56 | sqr_table[((x) >> 24) & 0xf] << 48 | \
-  sqr_table[((x) >> 20) & 0xf] << 40 | sqr_table[((x) >> 16) & 0xf] << 32 | \
-  sqr_table[((x) >> 12) & 0xf] << 24 | sqr_table[((x) >>  8) & 0xf] << 16 | \
-  sqr_table[((x) >>  4) & 0xf] <<  8 | sqr_table[((x)      ) & 0xf]
-
-// square the upper 32-bit of the input
-#define SQR_HIGH(x) \
-  sqr_table[((x) >> 60)      ] << 56 | sqr_table[((x) >> 56) & 0xf] << 48 | \
-  sqr_table[((x) >> 52) & 0xf] << 40 | sqr_table[((x) >> 48) & 0xf] << 32 | \
-  sqr_table[((x) >> 44) & 0xf] << 24 | sqr_table[((x) >> 40) & 0xf] << 16 | \
-  sqr_table[((x) >> 36) & 0xf] <<  8 | sqr_table[((x) >> 32) & 0xf]
-
-const uint64_t sqr_table[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
-                                0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55};
-
 void poly64_mul(const uint64_t a, const uint64_t b, uint64_t *c1, uint64_t *c0);
 
 unsigned GF_getbit(const GF a, unsigned i)
@@ -327,20 +310,38 @@
   c[0] ^= (t[0] <<  2);
 }
 
+static void square64(uint64_t *z0,uint64_t *z1,uint64_t x) // spreads the 64 bits of x over 128 bits: input bit i ends up at bit 2*i of the pair (*z1:*z0), odd bits are zero
+{ // *z0 gets the spread low 32 bits of x, *z1 the spread high 32 bits; only shifts, ORs and ANDs with fixed masks are used (no table lookup, no branch)
+  const uint64_t C0 = 0x5555555555555555; // keeps every even bit position
+  const uint64_t C1 = 0x3333333333333333; // keeps the low 2 bits of every 4-bit group
+  const uint64_t C2 = 0x0f0f0f0f0f0f0f0f; // keeps the low 4 bits of every byte
+  const uint64_t C3 = 0x00ff00ff00ff00ff; // keeps the low byte of every 16-bit group
+  const uint64_t C4 = 0x0000ffff0000ffff; // keeps the low 16 bits of every 32-bit group
+  const uint64_t C5 = 0x00000000ffffffff; // keeps the low 32 bits
+  uint64_t y = x>>32; x &= C5; // y = high 32 bits of the input, x = low 32 bits; both halves go through the same cascade
+  x = (x | (x << 16)) & C4; // each 16-bit chunk now sits in the low half of a 32-bit group
+  y = (y | (y << 16)) & C4;
+  x = (x | (x << 8)) & C3; // each byte now sits in the low half of a 16-bit group
+  y = (y | (y << 8)) & C3;
+  x = (x | (x << 4)) & C2; // each nibble now sits in the low half of a byte
+  y = (y | (y << 4)) & C2;
+  x = (x | (x << 2)) & C1; // each bit pair now sits in the low half of a nibble
+  y = (y | (y << 2)) & C1;
+  x = (x | (x << 1)) & C0; // each input bit now sits alone at an even bit position
+  y = (y | (y << 1)) & C0;
+  *z0 = x; // low 64 bits of the result
+  *z1 = y; // high 64 bits of the result
+}
+
 void GF_sqr(const GF a, GF c)
 {
   uint64_t t = 0;
   uint64_t temp[8] = {0,};
 
-  temp[0] = SQR_LOW(a[0]);
-  temp[1] = SQR_HIGH(a[0]);
-  temp[2] = SQR_LOW(a[1]);
-  temp[3] = SQR_HIGH(a[1]);
-
-  temp[4] = SQR_LOW(a[2]);
-  temp[5] = SQR_HIGH(a[2]);
-  temp[6] = SQR_LOW(a[3]);
-  temp[7] = SQR_HIGH(a[3]);
+  square64(&temp[0],&temp[1],a[0]);
+  square64(&temp[2],&temp[3],a[1]);
+  square64(&temp[4],&temp[5],a[2]);
+  square64(&temp[6],&temp[7],a[3]);
 
   t = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
 
Only in 240402_AIMer.patched/Reference_Implementation/aimer256s: field.c.orig
