author    Xavier Hernandez <xhernandez@datalab.es>    2014-08-04 20:50:31 +0200
committer Vijay Bellur <vbellur@redhat.com>           2014-09-11 22:44:12 -0700
commit    26be0b3a9f334f33f1a6e53706045eb02983d713 (patch)
tree      c72f6018f60940d1da31ad50d4f2344555a959bf /xlators
parent    5157914bac0da41cba8fef10f9dfaef209e6e865 (diff)
ec: Removed SSE2 dependency
This patch implements the Galois Field multiplications in pure C code, without
any assembler support. This makes the ec xlator portable to other
architectures. In the future it will be possible to use an optimized
implementation of the multiplications based on architecture-dependent
facilities (it will be detected and configured automatically).

To allow bricks with different machine word sizes to work seamlessly in the
same volume, the minimum fragment length stored on any brick has been fixed at
512 bytes. Otherwise, the different implementations would corrupt the data
(SSE2 used 128 bytes, while the new implementation would have used 64).

This patch also removes the '-msse2' option added by patch
http://review.gluster.org/8395/

Change-Id: Iaf6e4ef3dcfda6c68f48f16ca46fc4fb61a215f4
BUG: 1125166
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/8413
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Dan Lambright <dlambrig@redhat.com>
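As background, here is a minimal sketch of a single GF(2^8) multiplication
modulo 0x11D, written in portable C with only shifts and XORs. This is not the
code added by the patch (which operates on whole machine words using
pre-generated XOR sequences per multiplier); the gf8_mul helper below is
purely illustrative:

#include <stdint.h>

/* Illustrative shift-and-XOR multiplication in GF(2^8) with the
 * reducing polynomial 0x11D (x^8 + x^4 + x^3 + x^2 + 1). Addition in
 * this field is XOR, so no assembler or SIMD support is needed. */
static uint8_t gf8_mul(uint8_t a, uint8_t b)
{
    uint8_t r = 0;

    while (b != 0) {
        if (b & 1)
            r ^= a;              /* add the current multiple of 'a' */
        b >>= 1;
        if (a & 0x80)
            a = (a << 1) ^ 0x1d; /* shift and reduce modulo 0x11D */
        else
            a <<= 1;
    }

    return r;
}

Because everything reduces to plain integer shifts and XORs, the same code
compiles on any architecture, which is the point of removing the SSE2
dependency.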
Diffstat (limited to 'xlators')
-rw-r--r--  xlators/cluster/ec/src/Makefile.am          2
-rw-r--r--  xlators/cluster/ec/src/ec-generic.c        50
-rw-r--r--  xlators/cluster/ec/src/ec-gf.c          21717
-rw-r--r--  xlators/cluster/ec/src/ec-gf.h             95
-rw-r--r--  xlators/cluster/ec/src/ec-inode-read.c     18
-rw-r--r--  xlators/cluster/ec/src/ec-method.c         64
-rw-r--r--  xlators/cluster/ec/src/ec-method.h          9
7 files changed, 11684 insertions(+), 10271 deletions(-)
diff --git a/xlators/cluster/ec/src/Makefile.am b/xlators/cluster/ec/src/Makefile.am
index cf88ae44d8e..e2a9330a944 100644
--- a/xlators/cluster/ec/src/Makefile.am
+++ b/xlators/cluster/ec/src/Makefile.am
@@ -38,7 +38,7 @@ AM_CPPFLAGS = $(GF_CPPFLAGS)
AM_CPPFLAGS += -I$(top_srcdir)/libglusterfs/src
AM_CPPFLAGS += -I$(top_srcdir)/xlators/lib/src
-AM_CFLAGS = -Wall -msse2 $(GF_CFLAGS)
+AM_CFLAGS = -Wall $(GF_CFLAGS)
CLEANFILES =
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index dabea16233a..4afec3524c5 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -666,7 +666,7 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
{
ec_cbk_data_t * ans = NULL;
data_t * data = NULL;
- uint8_t * ptr = NULL, * buff = NULL, * tmp = NULL;
+ uint8_t * buff = NULL;
size_t size = 0;
int32_t i = 0;
@@ -682,7 +682,6 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
if (cbk->iatt[0].ia_type == IA_IFREG)
{
uint8_t * blocks[cbk->count];
- uint8_t * ptrs[cbk->count];
uint32_t values[cbk->count];
cbk->size = cbk->iatt[0].ia_size;
@@ -696,38 +695,23 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
if (data != NULL)
{
values[i] = ans->idx;
- ptrs[i] = GF_MALLOC(data->len + EC_BUFFER_ALIGN_SIZE - 1,
- gf_common_mt_char);
- if (ptrs[i] == NULL)
- {
- continue;
- }
-
+ blocks[i] = (uint8_t *)data->data;
if (size > data->len)
{
size = data->len;
}
- blocks[i] = GF_ALIGN_BUF(ptrs[i], EC_BUFFER_ALIGN_SIZE);
- memcpy(blocks[i], data->data, size);
-
i++;
}
}
- dict_del(cbk->xdata, GF_CONTENT_KEY);
-
if (i >= ec->fragments)
{
size -= size % ec->fragment_size;
if (size > 0)
{
- ptr = GF_MALLOC(size * ec->fragments +
- EC_BUFFER_ALIGN_SIZE - 1,
- gf_common_mt_char);
- if (ptr != NULL)
+ buff = GF_MALLOC(size * ec->fragments, gf_common_mt_char);
+ if (buff != NULL)
{
- buff = GF_ALIGN_BUF(ptr, EC_BUFFER_ALIGN_SIZE);
-
size = ec_method_decode(size, ec->fragments, values,
blocks, buff);
if (size > fop->size)
@@ -739,22 +723,15 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
size = cbk->iatt[0].ia_size;
}
- tmp = GF_MALLOC(size, gf_common_mt_char);
- if (tmp != NULL)
+ if (dict_set_bin(cbk->xdata, GF_CONTENT_KEY, buff,
+ size) != 0)
{
- memcpy(tmp, buff, size);
- if (dict_set_bin(cbk->xdata, GF_CONTENT_KEY, tmp,
- size) != 0)
- {
- GF_FREE(tmp);
-
- gf_log(fop->xl->name, GF_LOG_WARNING, "Lookup "
- "read-ahead "
- "failed");
- }
+ GF_FREE(buff);
+ buff = NULL;
+ gf_log(fop->xl->name, GF_LOG_WARNING, "Lookup "
+ "read-ahead "
+ "failed");
}
-
- GF_FREE(ptr);
}
else
{
@@ -763,9 +740,10 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
}
}
}
- while (--i > 0)
+
+ if (buff == NULL)
{
- GF_FREE(ptrs[i]);
+ dict_del(cbk->xdata, GF_CONTENT_KEY);
}
}
}
diff --git a/xlators/cluster/ec/src/ec-gf.c b/xlators/cluster/ec/src/ec-gf.c
index 03c4818c0cc..0992d905cb6 100644
--- a/xlators/cluster/ec/src/ec-gf.c
+++ b/xlators/cluster/ec/src/ec-gf.c
@@ -18,10103 +18,11628 @@
<http://www.gnu.org/licenses/>.
*/
-/*
- * File automatically generated on Thu Jan 26 12:08:19 2012
- *
- * DO NOT MODIFY
- *
- * Multiplications in a GF(2^8) with modulus 0x11D using XOR's
- *
- * 7994 total xor's
- * 31.3 average xor's per number
- * 0 xor's for the best case (01)
- * 43 xor's for the worst case (F4)
- *
- * 0 xor's: 01
- * 10 xor's: 03
- * 12 xor's: F5
- * 16 xor's: 04 05
- * 17 xor's: 9C A6
- * 18 xor's: 02 73
- * 19 xor's: 10 39
- * 20 xor's: 0B
- * 21 xor's: 0D 59 D2 E9 EC
- * 22 xor's: 12 28 61
- * 23 xor's: 08 09 44
- * 24 xor's: 0A 1D 25 55 B4
- * 25 xor's: 07 11 21 51 63 C4
- * 26 xor's: 0C 0F 13 45 54 5E 64 BD F2
- * 27 xor's: 06 1F 22 41 6B B9 C7 D1 F7
- * 28 xor's: 19 31 8C 95 B5 C1 F3
- * 29 xor's: 26 30 42 4A 4B 50 6A 88 90 A3 D8 E0 E8 F0 FD
- * 30 xor's: 14 15 20 2E 34 5D 89 99 A2 A9 B0 E5 F9
- * 31 xor's: 16 17 18 1A 1B 24 29 2B 2D 3B 57 84 85 87 8F 97 A5 EB F1 FB
- * 32 xor's: 33 36 43 47 65 67 72 75 78 79 81 83 8D 9B A8 AF B8 BB C5 CB CC CE E6 ED
- * 33 xor's: 0E 35 3D 49 4C 4D 6E 70 94 98 A0 AB B1 B2 B6 C8 C9 CD D0 D6 DC DD E3 EA F8
- * 34 xor's: 1C 1E 23 27 2C 32 40 46 5C 60 68 6F 71 7F 8A 9A AA AC B3 C2 D3 FC FF
- * 35 xor's: 3A 53 58 6D 74 7C 7D 8B 91 93 96 A1 AE C0 CA D5 DB E4 F6
- * 36 xor's: 2A 2F 38 48 4F 5B 66 6C 82 86 92 9F AD BC CF D4 DA DE E2 FA FE
- * 37 xor's: 37 3E 52 69 7B 9D B7 BE C3 C6 EE
- * 38 xor's: 3C 5A 7E 80 9E A7 BA BF D7 E7 EF
- * 39 xor's: 3F 4E 77 8E A4 D9 E1
- * 40 xor's: 76 7A
- * 41 xor's: 62
- * 42 xor's: 56 5F DF
- * 43 xor's: F4
- *
- */
-
-#include <xmmintrin.h>
+#include <inttypes.h>
+#include <string.h>
#include "ec-gf.h"
-static void gf8mul_00000000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm0, %xmm0\n"
- "\tpxor %xmm1, %xmm1\n"
- "\tpxor %xmm2, %xmm2\n"
- "\tpxor %xmm3, %xmm3\n"
- "\tpxor %xmm4, %xmm4\n"
- "\tpxor %xmm5, %xmm5\n"
- "\tpxor %xmm6, %xmm6\n"
- "\tpxor %xmm7, %xmm7\n"
- );
-}
-
-static void gf8mul_00000001(void)
-{
-}
-
-static void gf8mul_00000010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00000011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00000100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_00000101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_00000110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00000111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00001000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_00001001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_00001010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00001011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00001100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00001101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_00001110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00001111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00010000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm4, %xmm0\n"
- );
-}
-
-static void gf8mul_00010001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- );
-}
-
-static void gf8mul_00010010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00010011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00010100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_00010101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_00010110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00010111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00011000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_00011001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_00011010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00011011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00011100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_00011101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_00011110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00011111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00100000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm5, %xmm0\n"
- );
-}
-
-static void gf8mul_00100001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- );
-}
-
-static void gf8mul_00100010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00100011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00100100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_00100101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_00100110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00100111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00101000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_00101001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00101010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00101011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00101100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_00101101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_00101110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00101111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00110000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00110001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- );
-}
-
-static void gf8mul_00110010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00110011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00110100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_00110101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_00110110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00110111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00111000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_00111001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_00111010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_00111011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_00111100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_00111101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_00111110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- );
-}
-
-static void gf8mul_00111111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01000000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm6, %xmm0\n"
- );
-}
-
-static void gf8mul_01000001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- );
-}
-
-static void gf8mul_01000010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01000011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01000100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_01000101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_01000110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01000111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01001000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_01001001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_01001010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01001011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01001100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_01001101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_01001110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01001111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01010000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm4, %xmm0\n"
- );
-}
-
-static void gf8mul_01010001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- );
-}
-
-static void gf8mul_01010010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01010011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01010100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_01010101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_01010110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01010111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01011000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_01011001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_01011010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01011011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01011100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_01011101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_01011110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01011111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_01100000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm5, %xmm0\n"
- );
-}
-
-static void gf8mul_01100001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- );
-}
-
-static void gf8mul_01100010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01100011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01100100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_01100101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_01100110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_01100111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01101000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_01101001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_01101010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01101011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01101100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_01101101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_01101110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_01101111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01110000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm4, %xmm0\n"
- );
-}
-
-static void gf8mul_01110001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- );
-}
-
-static void gf8mul_01110010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01110011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01110100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_01110101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_01110110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01110111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_01111000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_01111001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_01111010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01111011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_01111100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_01111101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_01111110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_01111111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10000000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm7, %xmm0\n"
- );
-}
-
-static void gf8mul_10000001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- );
-}
-
-static void gf8mul_10000010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10000011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10000100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_10000101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_10000110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10000111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10001000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_10001001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_10001010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10001011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10001100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_10001101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_10001110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10001111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10010000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm4, %xmm0\n"
- );
-}
-
-static void gf8mul_10010001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- );
-}
-
-static void gf8mul_10010010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10010011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10010100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_10010101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10010110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10010111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10011000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_10011001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_10011010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10011011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10011100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_10011101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_10011110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10011111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10100000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm5, %xmm0\n"
- );
-}
-
-static void gf8mul_10100001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- );
-}
-
-static void gf8mul_10100010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10100011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10100100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10100101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_10100110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10100111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10101000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_10101001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10101010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10101011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10101100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_10101101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_10101110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10101111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10110000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm4, %xmm0\n"
- );
-}
-
-static void gf8mul_10110001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- );
-}
-
-static void gf8mul_10110010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10110011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10110100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_10110101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_10110110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10110111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10111000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_10111001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10111010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10111011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_10111100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_10111101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_10111110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_10111111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_11000000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm6, %xmm0\n"
- );
-}
-
-static void gf8mul_11000001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- );
-}
-
-static void gf8mul_11000010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11000011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11000100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_11000101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_11000110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11000111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11001000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_11001001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_11001010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11001011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11001100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_11001101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_11001110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11001111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11010000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm4, %xmm0\n"
- );
-}
-
-static void gf8mul_11010001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- );
-}
-
-static void gf8mul_11010010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11010011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11010100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_11010101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_11010110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_11010111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11011000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_11011001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_11011010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11011011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11011100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_11011101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_11011110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11011111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11100000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm5, %xmm0\n"
- );
-}
-
-static void gf8mul_11100001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11100010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11100011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11100100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_11100101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_11100110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11100111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11101000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_11101001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_11101010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11101011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11101100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_11101101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_11101110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11101111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11110000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm4, %xmm0\n"
- );
-}
-
-static void gf8mul_11110001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- );
-}
-
-static void gf8mul_11110010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11110011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_11110100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_11110101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_11110110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11110111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11111000(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm3, %xmm0\n"
- );
-}
-
-static void gf8mul_11111001(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- );
-}
-
-static void gf8mul_11111010(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11111011(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm5, %xmm2\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-static void gf8mul_11111100(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm3, %xmm7\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm3, %xmm0\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm6\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm2, %xmm0\n"
- );
-}
-
-static void gf8mul_11111101(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm2\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm4\n"
- "\tpxor %xmm5, %xmm0\n"
- "\tpxor %xmm6, %xmm5\n"
- "\tpxor %xmm4, %xmm7\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm3\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm5\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- );
-}
-
-static void gf8mul_11111110(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm6\n"
- "\tpxor %xmm7, %xmm5\n"
- "\tpxor %xmm7, %xmm4\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm5, %xmm1\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm1\n"
- "\tpxor %xmm4, %xmm0\n"
- "\tpxor %xmm6, %xmm4\n"
- "\tpxor %xmm3, %xmm6\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm2\n"
- "\tpxor %xmm2, %xmm7\n"
- "\tpxor %xmm2, %xmm6\n"
- "\tpxor %xmm2, %xmm1\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm1, %xmm4\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm1, %xmm0\n"
- "\tpxor %xmm0, %xmm7\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm5\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm3\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- "\tpxor %xmm1, %xmm0\n"
- );
-}
-
-static void gf8mul_11111111(void)
-{
- __asm__ __volatile__
- (
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm6, %xmm3\n"
- "\tpxor %xmm6, %xmm2\n"
- "\tpxor %xmm6, %xmm1\n"
- "\tpxor %xmm6, %xmm0\n"
- "\tpxor %xmm5, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm5, %xmm3\n"
- "\tpxor %xmm4, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm4, %xmm2\n"
- "\tpxor %xmm3, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm3, %xmm1\n"
- "\tpxor %xmm2, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm2, %xmm0\n"
- "\tpxor %xmm1, %xmm7\n"
- "\tpxor %xmm1, %xmm5\n"
- "\tpxor %xmm1, %xmm3\n"
- "\tpxor %xmm0, %xmm6\n"
- "\tpxor %xmm0, %xmm4\n"
- "\tpxor %xmm0, %xmm2\n"
- "\tpxor %xmm7, %xmm3\n"
- "\tpxor %xmm7, %xmm1\n"
- "\tpxor %xmm7, %xmm0\n"
- "\tpxor %xmm6, %xmm7\n"
- "\tpxor %xmm5, %xmm6\n"
- "\tpxor %xmm4, %xmm5\n"
- "\tpxor %xmm3, %xmm4\n"
- "\tpxor %xmm2, %xmm3\n"
- "\tpxor %xmm1, %xmm2\n"
- "\tpxor %xmm0, %xmm1\n"
- );
-}
-
-void (* ec_gf_mul_table[256])(void) =
-{
- gf8mul_00000000,
- gf8mul_00000001,
- gf8mul_00000010,
- gf8mul_00000011,
- gf8mul_00000100,
- gf8mul_00000101,
- gf8mul_00000110,
- gf8mul_00000111,
- gf8mul_00001000,
- gf8mul_00001001,
- gf8mul_00001010,
- gf8mul_00001011,
- gf8mul_00001100,
- gf8mul_00001101,
- gf8mul_00001110,
- gf8mul_00001111,
- gf8mul_00010000,
- gf8mul_00010001,
- gf8mul_00010010,
- gf8mul_00010011,
- gf8mul_00010100,
- gf8mul_00010101,
- gf8mul_00010110,
- gf8mul_00010111,
- gf8mul_00011000,
- gf8mul_00011001,
- gf8mul_00011010,
- gf8mul_00011011,
- gf8mul_00011100,
- gf8mul_00011101,
- gf8mul_00011110,
- gf8mul_00011111,
- gf8mul_00100000,
- gf8mul_00100001,
- gf8mul_00100010,
- gf8mul_00100011,
- gf8mul_00100100,
- gf8mul_00100101,
- gf8mul_00100110,
- gf8mul_00100111,
- gf8mul_00101000,
- gf8mul_00101001,
- gf8mul_00101010,
- gf8mul_00101011,
- gf8mul_00101100,
- gf8mul_00101101,
- gf8mul_00101110,
- gf8mul_00101111,
- gf8mul_00110000,
- gf8mul_00110001,
- gf8mul_00110010,
- gf8mul_00110011,
- gf8mul_00110100,
- gf8mul_00110101,
- gf8mul_00110110,
- gf8mul_00110111,
- gf8mul_00111000,
- gf8mul_00111001,
- gf8mul_00111010,
- gf8mul_00111011,
- gf8mul_00111100,
- gf8mul_00111101,
- gf8mul_00111110,
- gf8mul_00111111,
- gf8mul_01000000,
- gf8mul_01000001,
- gf8mul_01000010,
- gf8mul_01000011,
- gf8mul_01000100,
- gf8mul_01000101,
- gf8mul_01000110,
- gf8mul_01000111,
- gf8mul_01001000,
- gf8mul_01001001,
- gf8mul_01001010,
- gf8mul_01001011,
- gf8mul_01001100,
- gf8mul_01001101,
- gf8mul_01001110,
- gf8mul_01001111,
- gf8mul_01010000,
- gf8mul_01010001,
- gf8mul_01010010,
- gf8mul_01010011,
- gf8mul_01010100,
- gf8mul_01010101,
- gf8mul_01010110,
- gf8mul_01010111,
- gf8mul_01011000,
- gf8mul_01011001,
- gf8mul_01011010,
- gf8mul_01011011,
- gf8mul_01011100,
- gf8mul_01011101,
- gf8mul_01011110,
- gf8mul_01011111,
- gf8mul_01100000,
- gf8mul_01100001,
- gf8mul_01100010,
- gf8mul_01100011,
- gf8mul_01100100,
- gf8mul_01100101,
- gf8mul_01100110,
- gf8mul_01100111,
- gf8mul_01101000,
- gf8mul_01101001,
- gf8mul_01101010,
- gf8mul_01101011,
- gf8mul_01101100,
- gf8mul_01101101,
- gf8mul_01101110,
- gf8mul_01101111,
- gf8mul_01110000,
- gf8mul_01110001,
- gf8mul_01110010,
- gf8mul_01110011,
- gf8mul_01110100,
- gf8mul_01110101,
- gf8mul_01110110,
- gf8mul_01110111,
- gf8mul_01111000,
- gf8mul_01111001,
- gf8mul_01111010,
- gf8mul_01111011,
- gf8mul_01111100,
- gf8mul_01111101,
- gf8mul_01111110,
- gf8mul_01111111,
- gf8mul_10000000,
- gf8mul_10000001,
- gf8mul_10000010,
- gf8mul_10000011,
- gf8mul_10000100,
- gf8mul_10000101,
- gf8mul_10000110,
- gf8mul_10000111,
- gf8mul_10001000,
- gf8mul_10001001,
- gf8mul_10001010,
- gf8mul_10001011,
- gf8mul_10001100,
- gf8mul_10001101,
- gf8mul_10001110,
- gf8mul_10001111,
- gf8mul_10010000,
- gf8mul_10010001,
- gf8mul_10010010,
- gf8mul_10010011,
- gf8mul_10010100,
- gf8mul_10010101,
- gf8mul_10010110,
- gf8mul_10010111,
- gf8mul_10011000,
- gf8mul_10011001,
- gf8mul_10011010,
- gf8mul_10011011,
- gf8mul_10011100,
- gf8mul_10011101,
- gf8mul_10011110,
- gf8mul_10011111,
- gf8mul_10100000,
- gf8mul_10100001,
- gf8mul_10100010,
- gf8mul_10100011,
- gf8mul_10100100,
- gf8mul_10100101,
- gf8mul_10100110,
- gf8mul_10100111,
- gf8mul_10101000,
- gf8mul_10101001,
- gf8mul_10101010,
- gf8mul_10101011,
- gf8mul_10101100,
- gf8mul_10101101,
- gf8mul_10101110,
- gf8mul_10101111,
- gf8mul_10110000,
- gf8mul_10110001,
- gf8mul_10110010,
- gf8mul_10110011,
- gf8mul_10110100,
- gf8mul_10110101,
- gf8mul_10110110,
- gf8mul_10110111,
- gf8mul_10111000,
- gf8mul_10111001,
- gf8mul_10111010,
- gf8mul_10111011,
- gf8mul_10111100,
- gf8mul_10111101,
- gf8mul_10111110,
- gf8mul_10111111,
- gf8mul_11000000,
- gf8mul_11000001,
- gf8mul_11000010,
- gf8mul_11000011,
- gf8mul_11000100,
- gf8mul_11000101,
- gf8mul_11000110,
- gf8mul_11000111,
- gf8mul_11001000,
- gf8mul_11001001,
- gf8mul_11001010,
- gf8mul_11001011,
- gf8mul_11001100,
- gf8mul_11001101,
- gf8mul_11001110,
- gf8mul_11001111,
- gf8mul_11010000,
- gf8mul_11010001,
- gf8mul_11010010,
- gf8mul_11010011,
- gf8mul_11010100,
- gf8mul_11010101,
- gf8mul_11010110,
- gf8mul_11010111,
- gf8mul_11011000,
- gf8mul_11011001,
- gf8mul_11011010,
- gf8mul_11011011,
- gf8mul_11011100,
- gf8mul_11011101,
- gf8mul_11011110,
- gf8mul_11011111,
- gf8mul_11100000,
- gf8mul_11100001,
- gf8mul_11100010,
- gf8mul_11100011,
- gf8mul_11100100,
- gf8mul_11100101,
- gf8mul_11100110,
- gf8mul_11100111,
- gf8mul_11101000,
- gf8mul_11101001,
- gf8mul_11101010,
- gf8mul_11101011,
- gf8mul_11101100,
- gf8mul_11101101,
- gf8mul_11101110,
- gf8mul_11101111,
- gf8mul_11110000,
- gf8mul_11110001,
- gf8mul_11110010,
- gf8mul_11110011,
- gf8mul_11110100,
- gf8mul_11110101,
- gf8mul_11110110,
- gf8mul_11110111,
- gf8mul_11111000,
- gf8mul_11111001,
- gf8mul_11111010,
- gf8mul_11111011,
- gf8mul_11111100,
- gf8mul_11111101,
- gf8mul_11111110,
- gf8mul_11111111
+static void gf8_muladd_00(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ memcpy(out, in, sizeof(uint64_t) * 8 * width);
+}
+
+static void gf8_muladd_01(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ out_ptr[0] ^= in_ptr[0];
+ out_ptr[width] ^= in_ptr[width];
+ out_ptr[width * 2] ^= in_ptr[width * 2];
+ out_ptr[width * 3] ^= in_ptr[width * 3];
+ out_ptr[width * 4] ^= in_ptr[width * 4];
+ out_ptr[width * 5] ^= in_ptr[width * 5];
+ out_ptr[width * 6] ^= in_ptr[width * 6];
+ out_ptr[width * 7] ^= in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_02(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in7;
+ out1 = in0;
+ out7 = in6;
+ out5 = in4;
+ out6 = in5;
+ out3 = in2 ^ in7;
+ out4 = in3 ^ in7;
+ out2 = in1 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_03(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in0 ^ in7;
+ tmp0 = in2 ^ in7;
+ out1 = in0 ^ in1;
+ out7 = in6 ^ in7;
+ out5 = in4 ^ in5;
+ out6 = in5 ^ in6;
+ out4 = in3 ^ in4 ^ in7;
+ out2 = tmp0 ^ in1;
+ out3 = tmp0 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_04(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in6;
+ out1 = in7;
+ out7 = in5;
+ out6 = in4;
+ tmp0 = in6 ^ in7;
+ out2 = in0 ^ in6;
+ out5 = in3 ^ in7;
+ out3 = tmp0 ^ in1;
+ out4 = tmp0 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_05(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in0 ^ in6;
+ out1 = in1 ^ in7;
+ out7 = in5 ^ in7;
+ out6 = in4 ^ in6;
+ out2 = out0 ^ in2;
+ out3 = out1 ^ in3 ^ in6;
+ out5 = out7 ^ in3;
+ out4 = out6 ^ in2 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_06(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in6 ^ in7;
+ tmp0 = in1 ^ in6;
+ out1 = in0 ^ in7;
+ out7 = in5 ^ in6;
+ out6 = in4 ^ in5;
+ out4 = in2 ^ in3 ^ in6;
+ out5 = in3 ^ in4 ^ in7;
+ out3 = tmp0 ^ in2;
+ out2 = tmp0 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_07(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in6;
+ tmp1 = in5 ^ in6;
+ tmp2 = in0 ^ in7;
+ tmp3 = tmp0 ^ in3;
+ out6 = tmp1 ^ in4;
+ out7 = tmp1 ^ in7;
+ out0 = tmp2 ^ in6;
+ out1 = tmp2 ^ in1;
+ out3 = tmp3 ^ in1;
+ out4 = tmp3 ^ in4;
+ out5 = out4 ^ out7 ^ in2;
+ out2 = tmp0 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_08(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in5;
+ out1 = in6;
+ out7 = in4;
+ out6 = in3 ^ in7;
+ out3 = in0 ^ in5 ^ in6;
+ out5 = in2 ^ in6 ^ in7;
+ out2 = in5 ^ in7;
+ out4 = out2 ^ in1 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_09(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in0 ^ in5;
+ tmp0 = in3 ^ in6;
+ out1 = in1 ^ in6;
+ out7 = in4 ^ in7;
+ out2 = in2 ^ in5 ^ in7;
+ out3 = tmp0 ^ out0;
+ out6 = tmp0 ^ in7;
+ out4 = out1 ^ out7 ^ in5;
+ out5 = out2 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_0A(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in5 ^ in7;
+ out1 = in0 ^ in6;
+ out7 = in4 ^ in6;
+ out2 = in1 ^ in5;
+ out6 = out0 ^ in3;
+ out3 = out0 ^ out1 ^ in2;
+ out5 = out7 ^ in2 ^ in7;
+ out4 = out2 ^ in3 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_0B(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in5;
+ tmp1 = in0 ^ in6;
+ tmp2 = in4 ^ in7;
+ out0 = in0 ^ in5 ^ in7;
+ out2 = tmp0 ^ in1;
+ out1 = tmp1 ^ in1;
+ out6 = tmp1 ^ out0 ^ in3;
+ out7 = tmp2 ^ in6;
+ out4 = tmp2 ^ out6 ^ in1;
+ out3 = out6 ^ in0 ^ in2;
+ out5 = tmp0 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_0C(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in5 ^ in6;
+ out1 = in6 ^ in7;
+ out7 = in4 ^ in5;
+ tmp0 = in1 ^ in5;
+ tmp1 = in0 ^ in7;
+ out5 = in2 ^ in3 ^ in6;
+ out6 = in3 ^ in4 ^ in7;
+ out2 = tmp1 ^ out0;
+ out4 = tmp0 ^ in2;
+ out3 = tmp0 ^ tmp1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_0D(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in4 ^ in5;
+ tmp1 = in5 ^ in6;
+ out1 = in1 ^ in6 ^ in7;
+ out7 = tmp0 ^ in7;
+ out4 = tmp0 ^ in1 ^ in2;
+ out0 = tmp1 ^ in0;
+ tmp2 = tmp1 ^ in3;
+ out6 = tmp2 ^ out7;
+ out2 = out0 ^ in2 ^ in7;
+ out3 = out0 ^ out1 ^ in3;
+ out5 = tmp2 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_0E(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in2 ^ in5;
+ tmp2 = in5 ^ in6;
+ out1 = in0 ^ in6 ^ in7;
+ out3 = tmp0 ^ tmp1;
+ out2 = tmp0 ^ tmp2;
+ tmp3 = tmp1 ^ in3;
+ out7 = tmp2 ^ in4;
+ out0 = tmp2 ^ in7;
+ out4 = tmp3 ^ in1 ^ in7;
+ out5 = tmp3 ^ out7;
+ out6 = out0 ^ out5 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_0F(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in6 ^ in7;
+ tmp1 = tmp0 ^ in1;
+ tmp2 = tmp0 ^ in5;
+ out1 = tmp1 ^ in0;
+ out7 = tmp2 ^ in4;
+ out0 = tmp2 ^ in0;
+ out6 = out7 ^ in3;
+ out5 = out6 ^ in2 ^ in7;
+ tmp3 = tmp1 ^ out0 ^ in2;
+ out4 = tmp1 ^ out5;
+ out2 = tmp3 ^ in6;
+ out3 = tmp3 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_10(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in4;
+ out1 = in5;
+ out7 = in3 ^ in7;
+ tmp0 = in6 ^ in7;
+ out2 = in4 ^ in6;
+ tmp1 = out2 ^ in5;
+ out6 = tmp0 ^ in2;
+ out3 = tmp0 ^ tmp1;
+ out5 = out2 ^ out3 ^ in1;
+ out4 = tmp1 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_11(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out7 = in3;
+ out0 = in0 ^ in4;
+ out1 = in1 ^ in5;
+ out6 = in2 ^ in7;
+ out4 = in0 ^ in5 ^ in6;
+ out5 = in1 ^ in6 ^ in7;
+ out2 = in2 ^ in4 ^ in6;
+ out3 = in3 ^ in4 ^ in5 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_12(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in4 ^ in7;
+ out1 = in0 ^ in5;
+ out3 = in2 ^ in4 ^ in5;
+ tmp0 = out0 ^ in6;
+ out2 = tmp0 ^ in1;
+ tmp1 = tmp0 ^ in3;
+ out6 = tmp0 ^ out3;
+ out5 = out2 ^ in5;
+ out7 = tmp1 ^ in4;
+ out4 = tmp1 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_13(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out7 = in3 ^ in6;
+ tmp0 = in0 ^ in5;
+ tmp1 = in4 ^ in7;
+ out6 = in2 ^ in5 ^ in7;
+ out4 = tmp0 ^ out7 ^ in7;
+ out1 = tmp0 ^ in1;
+ out0 = tmp1 ^ in0;
+ out5 = tmp1 ^ in1 ^ in6;
+ out3 = tmp1 ^ out6 ^ in3;
+ out2 = out5 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_14(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in4 ^ in6;
+ out1 = in5 ^ in7;
+ out2 = in0 ^ in4;
+ tmp0 = out0 ^ in5;
+ out7 = out1 ^ in3;
+ tmp1 = out1 ^ in2;
+ out3 = tmp0 ^ in1;
+ out6 = tmp0 ^ tmp1;
+ out4 = tmp1 ^ out2;
+ out5 = out3 ^ in3 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_15(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out7 = in3 ^ in5;
+ tmp0 = in0 ^ in4;
+ out1 = in1 ^ in5 ^ in7;
+ out5 = in1 ^ in3 ^ in6;
+ out0 = tmp0 ^ in6;
+ out2 = tmp0 ^ in2;
+ out3 = out5 ^ in4 ^ in5;
+ out6 = out2 ^ in0 ^ in7;
+ out4 = tmp0 ^ out6 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_16(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in5;
+ tmp1 = in4 ^ in7;
+ tmp2 = in2 ^ in3 ^ in4;
+ out1 = tmp0 ^ in7;
+ out4 = tmp0 ^ tmp2;
+ out0 = tmp1 ^ in6;
+ tmp3 = tmp1 ^ in1;
+ out6 = out0 ^ in2 ^ in5;
+ out2 = tmp3 ^ in0;
+ out3 = out6 ^ in1;
+ out7 = tmp2 ^ out6;
+ out5 = tmp3 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_17(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in5;
+ tmp1 = in3 ^ in6;
+ tmp2 = tmp0 ^ in4;
+ out4 = tmp0 ^ in0 ^ in3;
+ out7 = tmp1 ^ in5;
+ tmp3 = tmp1 ^ in1;
+ out6 = tmp2 ^ in7;
+ out5 = tmp3 ^ in4;
+ out3 = tmp3 ^ out6;
+ out0 = out3 ^ out4 ^ in1;
+ out2 = out3 ^ out7 ^ in0;
+ out1 = tmp2 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_18(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in4 ^ in5;
+ out1 = in5 ^ in6;
+ tmp0 = in4 ^ in7;
+ out5 = in1 ^ in2 ^ in5;
+ out6 = in2 ^ in3 ^ in6;
+ out2 = tmp0 ^ out1;
+ out7 = tmp0 ^ in3;
+ tmp1 = tmp0 ^ in0;
+ out3 = tmp1 ^ in6;
+ out4 = tmp1 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_19(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out5 = in1 ^ in2;
+ out7 = in3 ^ in4;
+ tmp0 = in0 ^ in7;
+ out6 = in2 ^ in3;
+ out1 = in1 ^ in5 ^ in6;
+ out0 = in0 ^ in4 ^ in5;
+ out4 = tmp0 ^ in1;
+ tmp1 = tmp0 ^ in6;
+ out2 = tmp1 ^ out0 ^ in2;
+ out3 = tmp1 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_1A(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in4 ^ in5;
+ tmp1 = in5 ^ in6;
+ tmp2 = tmp0 ^ in1;
+ out0 = tmp0 ^ in7;
+ out1 = tmp1 ^ in0;
+ tmp3 = tmp1 ^ in3;
+ out5 = tmp2 ^ in2;
+ out2 = tmp2 ^ in6;
+ out7 = tmp3 ^ out0;
+ out6 = tmp3 ^ in2;
+ out4 = tmp3 ^ out2 ^ in0;
+ out3 = tmp0 ^ out1 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_1B(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in4;
+ tmp1 = in2 ^ in5;
+ tmp2 = in3 ^ in6;
+ out5 = tmp0 ^ in1;
+ tmp3 = tmp0 ^ in0;
+ out6 = tmp1 ^ in3;
+ out0 = tmp1 ^ tmp3 ^ in7;
+ out7 = tmp2 ^ in4;
+ tmp4 = out5 ^ in6;
+ out3 = tmp2 ^ tmp3;
+ out2 = tmp4 ^ in5;
+ out4 = tmp4 ^ out3;
+ out1 = tmp3 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_1C(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in3;
+ tmp1 = in4 ^ in6;
+ tmp2 = in5 ^ in7;
+ out6 = tmp0 ^ tmp1;
+ out0 = tmp1 ^ in5;
+ out1 = tmp2 ^ in6;
+ tmp3 = tmp2 ^ in1;
+ tmp4 = tmp2 ^ in4;
+ out2 = tmp4 ^ in0;
+ out7 = tmp4 ^ in3;
+ out5 = tmp0 ^ tmp3;
+ out3 = tmp3 ^ out2;
+ out4 = out3 ^ in2 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_1D(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in3;
+ tmp1 = in0 ^ in4;
+ tmp2 = in3 ^ in4;
+ tmp3 = in2 ^ in7;
+ out3 = tmp0 ^ tmp1;
+ out5 = tmp0 ^ tmp3;
+ tmp4 = tmp1 ^ in5;
+ out6 = tmp2 ^ in2;
+ out7 = tmp2 ^ in5;
+ out2 = tmp3 ^ tmp4;
+ out4 = out3 ^ out6 ^ in6;
+ out0 = tmp4 ^ in6;
+ out1 = out2 ^ out4 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_1E(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in4;
+ tmp1 = in2 ^ in7;
+ tmp2 = tmp0 ^ in1;
+ out3 = tmp1 ^ tmp2;
+ out2 = tmp2 ^ in5;
+ out4 = out3 ^ in3 ^ in6;
+ tmp3 = out4 ^ in7;
+ out6 = tmp3 ^ out2 ^ in4;
+ out7 = tmp1 ^ out6;
+ out0 = out7 ^ in3;
+ out1 = tmp0 ^ out0;
+ out5 = tmp3 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_1F(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in4 ^ in6;
+ tmp1 = tmp0 ^ in5;
+ out7 = tmp1 ^ in3;
+ out0 = tmp1 ^ in0 ^ in7;
+ out6 = out7 ^ in2 ^ in6;
+ out1 = out0 ^ in1 ^ in4;
+ out4 = out0 ^ out6 ^ in1;
+ out3 = tmp0 ^ out4;
+ out2 = out4 ^ out7 ^ in7;
+ out5 = out3 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_20(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in4;
+ out0 = in3 ^ in7;
+ tmp0 = in3 ^ in4;
+ tmp1 = in6 ^ in7;
+ out2 = out0 ^ in5;
+ out4 = tmp0 ^ in5;
+ out3 = tmp0 ^ tmp1;
+ out7 = tmp1 ^ in2;
+ out6 = tmp1 ^ in1 ^ in5;
+ out5 = out2 ^ out3 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_21(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in1 ^ in4;
+ tmp0 = in4 ^ in6;
+ out4 = in3 ^ in5;
+ out7 = in2 ^ in6;
+ out0 = in0 ^ in3 ^ in7;
+ out6 = in1 ^ in5 ^ in7;
+ out3 = tmp0 ^ in7;
+ out5 = tmp0 ^ in0;
+ out2 = out4 ^ in2 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_22(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in3;
+ out1 = in0 ^ in4;
+ out7 = in2 ^ in7;
+ out4 = in4 ^ in5 ^ in7;
+ out5 = in0 ^ in5 ^ in6;
+ out6 = in1 ^ in6 ^ in7;
+ out3 = in2 ^ in3 ^ in4 ^ in6;
+ out2 = in1 ^ in3 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_23(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out7 = in2;
+ out0 = in0 ^ in3;
+ out4 = in5 ^ in7;
+ out5 = in0 ^ in6;
+ out6 = in1 ^ in7;
+ out3 = in2 ^ in4 ^ in6;
+ out1 = in0 ^ in1 ^ in4;
+ out2 = out4 ^ out6 ^ in2 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_24(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in4 ^ in7;
+ tmp0 = in3 ^ in4;
+ out0 = in3 ^ in6 ^ in7;
+ out3 = tmp0 ^ in1;
+ tmp1 = out0 ^ in5;
+ out6 = tmp1 ^ out3;
+ out2 = tmp1 ^ in0;
+ out7 = tmp1 ^ in2 ^ in3;
+ out5 = out2 ^ in4;
+ out4 = tmp0 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_25(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in1 ^ in4;
+ tmp0 = in2 ^ in5;
+ out1 = out3 ^ in7;
+ out7 = tmp0 ^ in6;
+ out6 = out1 ^ in5;
+ out4 = out7 ^ in3 ^ in7;
+ out2 = out4 ^ in0;
+ out0 = tmp0 ^ out2;
+ out5 = out0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_26(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in3 ^ in6;
+ tmp0 = in4 ^ in7;
+ out7 = in2 ^ in5 ^ in7;
+ tmp1 = out0 ^ in0 ^ in5;
+ out1 = tmp0 ^ in0;
+ tmp2 = tmp0 ^ in6;
+ out2 = tmp1 ^ in1;
+ out5 = tmp1 ^ in7;
+ out6 = tmp2 ^ in1;
+ out4 = tmp2 ^ out7;
+ out3 = out0 ^ out6 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_27(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out7 = in2 ^ in5;
+ out0 = in0 ^ in3 ^ in6;
+ out6 = in1 ^ in4 ^ in7;
+ out4 = out7 ^ in6;
+ out2 = out0 ^ out7 ^ in1;
+ out5 = out0 ^ in7;
+ out1 = out6 ^ in0;
+ out3 = out6 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_28(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in3;
+ out1 = in4 ^ in6;
+ out0 = in3 ^ in5 ^ in7;
+ tmp0 = out1 ^ in7;
+ tmp1 = out0 ^ in4;
+ out7 = tmp0 ^ in2;
+ tmp2 = tmp0 ^ in1;
+ out3 = tmp1 ^ in0;
+ out6 = tmp1 ^ tmp2;
+ out4 = tmp2 ^ in3;
+ out5 = out3 ^ in2 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_29(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in2 ^ in3;
+ tmp0 = in1 ^ in3;
+ tmp1 = in4 ^ in6;
+ tmp2 = in0 ^ in4 ^ in7;
+ out6 = tmp0 ^ in5;
+ out4 = tmp0 ^ in6 ^ in7;
+ out1 = tmp1 ^ in1;
+ out7 = tmp1 ^ in2;
+ out3 = tmp2 ^ in5;
+ out5 = tmp2 ^ in2;
+ out0 = out3 ^ in3 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_2A(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in3 ^ in5;
+ tmp0 = in1 ^ in3;
+ tmp1 = in0 ^ in4;
+ out7 = in2 ^ in4 ^ in7;
+ out3 = tmp1 ^ out0 ^ in2;
+ out2 = tmp0 ^ in7;
+ out6 = tmp0 ^ in6;
+ out1 = tmp1 ^ in6;
+ out5 = tmp1 ^ out7 ^ in5;
+ out4 = out1 ^ in0 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_2B(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in1 ^ in6;
+ out7 = in2 ^ in4;
+ tmp0 = in0 ^ in5;
+ tmp1 = in2 ^ in7;
+ out6 = in1 ^ in3;
+ out1 = out4 ^ in0 ^ in4;
+ out3 = tmp0 ^ out7;
+ out0 = tmp0 ^ in3;
+ out5 = tmp1 ^ in0;
+ out2 = tmp1 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_2C(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in5;
+ tmp1 = in2 ^ in3 ^ in4;
+ tmp2 = tmp0 ^ in6;
+ out4 = tmp1 ^ in1;
+ out5 = tmp1 ^ in0 ^ in5;
+ tmp3 = tmp2 ^ in4;
+ out6 = tmp2 ^ out4;
+ out7 = tmp3 ^ in7;
+ out2 = tmp3 ^ out5;
+ out3 = out6 ^ in0;
+ out0 = tmp1 ^ out7;
+ out1 = tmp0 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_2D(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in3;
+ out4 = tmp0 ^ in1;
+ tmp1 = tmp0 ^ in0;
+ out2 = tmp1 ^ in6;
+ out5 = tmp1 ^ in4;
+ tmp2 = out2 ^ in2;
+ tmp3 = tmp2 ^ in5;
+ out0 = tmp3 ^ in7;
+ out7 = tmp3 ^ out5;
+ out6 = out4 ^ out7 ^ in6;
+ out3 = tmp2 ^ out6;
+ out1 = out0 ^ out6 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_2E(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in4 ^ in7;
+ out0 = in3 ^ in5 ^ in6;
+ tmp1 = tmp0 ^ in0;
+ tmp2 = tmp0 ^ in2;
+ out1 = tmp1 ^ in6;
+ out4 = tmp2 ^ in1;
+ out7 = tmp2 ^ in5;
+ out3 = out0 ^ out4 ^ in0;
+ out2 = out3 ^ out7 ^ in7;
+ out6 = tmp1 ^ out2;
+ out5 = tmp1 ^ out7 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_2F(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = in2 ^ in5;
+ out4 = in1 ^ in2 ^ in7;
+ out6 = in1 ^ in3 ^ in4;
+ out5 = tmp0 ^ in2;
+ tmp2 = tmp0 ^ in6;
+ out7 = tmp1 ^ in4;
+ out0 = tmp2 ^ in5;
+ out2 = tmp2 ^ out4;
+ out1 = tmp2 ^ out6 ^ in7;
+ out3 = tmp1 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_30(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in4 ^ in5;
+ tmp0 = in3 ^ in6;
+ tmp1 = in4 ^ in7;
+ out6 = in1 ^ in2 ^ in5;
+ out3 = tmp0 ^ in5;
+ out4 = tmp0 ^ in0;
+ out7 = tmp0 ^ in2;
+ out0 = tmp1 ^ in3;
+ out2 = tmp1 ^ out3;
+ out5 = tmp1 ^ in0 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_31(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in5 ^ in6;
+ tmp0 = in4 ^ in5;
+ tmp1 = in0 ^ in3 ^ in4;
+ tmp2 = out3 ^ in2;
+ out1 = tmp0 ^ in1;
+ out0 = tmp1 ^ in7;
+ out4 = tmp1 ^ in6;
+ out6 = tmp2 ^ in1;
+ out2 = tmp2 ^ out0 ^ in0;
+ out5 = out1 ^ in0 ^ in7;
+ out7 = tmp0 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_32(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in3 ^ in4;
+ out7 = in2 ^ in3;
+ tmp0 = in5 ^ in6;
+ tmp1 = in0 ^ in7;
+ out6 = in1 ^ in2;
+ out1 = in0 ^ in4 ^ in5;
+ out2 = tmp0 ^ out0 ^ in1;
+ out3 = tmp0 ^ out7 ^ in7;
+ out4 = tmp1 ^ in6;
+ out5 = tmp1 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_33(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in3;
+ tmp1 = in0 ^ in4;
+ tmp2 = in1 ^ in5;
+ out6 = in1 ^ in2 ^ in6;
+ out7 = tmp0 ^ in7;
+ out0 = tmp1 ^ in3;
+ out1 = tmp1 ^ tmp2;
+ tmp3 = tmp2 ^ in7;
+ tmp4 = tmp2 ^ in4 ^ in6;
+ out5 = tmp3 ^ in0;
+ out3 = tmp3 ^ out6;
+ out4 = tmp4 ^ out5;
+ out2 = tmp0 ^ tmp4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_34(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in4;
+ tmp1 = in4 ^ in5;
+ tmp2 = tmp0 ^ in1;
+ tmp3 = tmp0 ^ in6;
+ out1 = tmp1 ^ in7;
+ tmp4 = tmp1 ^ in2;
+ out5 = tmp2 ^ in0;
+ out3 = tmp2 ^ out1;
+ out0 = tmp3 ^ in7;
+ out7 = tmp3 ^ tmp4;
+ out6 = tmp4 ^ in1;
+ out2 = out3 ^ out5 ^ in3;
+ out4 = tmp4 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_35(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in6;
+ tmp1 = in5 ^ in7;
+ out7 = tmp0 ^ tmp1 ^ in3;
+ out3 = tmp1 ^ in1;
+ out1 = out3 ^ in4;
+ tmp2 = out1 ^ in7;
+ out5 = tmp2 ^ in0 ^ in3;
+ out6 = tmp0 ^ tmp2;
+ out0 = out3 ^ out5 ^ in6;
+ out4 = tmp0 ^ out0;
+ out2 = out4 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_36(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in0 ^ in2;
+ tmp0 = in1 ^ in3;
+ out0 = in3 ^ in4 ^ in6;
+ out6 = in1 ^ in2 ^ in4;
+ out5 = tmp0 ^ in0;
+ tmp1 = out5 ^ in5;
+ out2 = tmp1 ^ in4;
+ out3 = tmp1 ^ out4;
+ out1 = tmp0 ^ out2 ^ in7;
+ out7 = out3 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_37(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = in2 ^ in4;
+ tmp2 = tmp0 ^ in6;
+ out3 = tmp0 ^ in5;
+ out4 = tmp1 ^ in0;
+ out6 = tmp2 ^ in4;
+ out1 = out3 ^ out4 ^ in7;
+ tmp3 = out4 ^ in1 ^ in3;
+ out7 = tmp3 ^ out1;
+ out2 = tmp3 ^ in5;
+ out5 = tmp1 ^ out2;
+ out0 = tmp2 ^ tmp3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_38(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in0 ^ in3;
+ tmp0 = in3 ^ in4;
+ tmp1 = in5 ^ in7;
+ tmp2 = out3 ^ in1;
+ out2 = tmp0 ^ in6;
+ out0 = tmp0 ^ tmp1;
+ out4 = tmp1 ^ tmp2;
+ out7 = out2 ^ in2;
+ out1 = out2 ^ in3 ^ in5;
+ out6 = out4 ^ in0 ^ in2;
+ out5 = tmp2 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_39(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in0;
+ tmp0 = in1 ^ in5;
+ tmp1 = tmp0 ^ in4;
+ out1 = tmp1 ^ in6;
+ out5 = out1 ^ in0 ^ in2;
+ tmp2 = tmp0 ^ out5;
+ out2 = tmp2 ^ in0 ^ in3;
+ out7 = out2 ^ in7;
+ out6 = tmp1 ^ out7;
+ out4 = tmp2 ^ out6;
+ out0 = out4 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_3A(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in0 ^ in2;
+ tmp2 = in3 ^ in4;
+ tmp3 = in1 ^ in6;
+ tmp4 = in3 ^ in7;
+ out4 = tmp0 ^ in5;
+ out5 = tmp1 ^ tmp3;
+ out3 = tmp1 ^ tmp4;
+ out0 = tmp2 ^ in5;
+ out7 = tmp2 ^ in2;
+ tmp5 = tmp3 ^ in4;
+ out2 = tmp4 ^ tmp5;
+ out1 = tmp5 ^ out4;
+ out6 = tmp0 ^ out3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_3B(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in6;
+ tmp1 = in2 ^ in7;
+ tmp2 = tmp0 ^ in3;
+ out3 = tmp1 ^ in0;
+ out6 = tmp1 ^ tmp2;
+ out2 = out6 ^ in4;
+ out7 = tmp0 ^ out2;
+ out0 = out3 ^ out7 ^ in5;
+ out5 = out0 ^ out2 ^ in7;
+ out1 = tmp2 ^ out0;
+ out4 = out1 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_3C(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = in2 ^ in7;
+ tmp2 = in1 ^ in6 ^ in7;
+ out2 = tmp0 ^ in4;
+ out3 = tmp0 ^ tmp2;
+ out4 = tmp1 ^ out3 ^ in5;
+ out5 = tmp2 ^ out2 ^ in2;
+ out1 = out4 ^ out5 ^ in6;
+ out0 = out1 ^ in3;
+ out7 = tmp1 ^ out0;
+ out6 = tmp2 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_3D(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in2;
+ tmp1 = tmp0 ^ in3;
+ out2 = tmp1 ^ in4;
+ tmp2 = out2 ^ in5;
+ out4 = tmp2 ^ in1 ^ in6;
+ out5 = out4 ^ in7;
+ out6 = out5 ^ in0;
+ out7 = out6 ^ in1;
+ out0 = tmp0 ^ out7;
+ out1 = tmp1 ^ out5;
+ out3 = tmp2 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_3E(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in5;
+ tmp1 = tmp0 ^ in4;
+ out0 = tmp1 ^ in6;
+ out7 = tmp1 ^ in2;
+ out6 = out7 ^ in1 ^ in5 ^ in7;
+ out2 = out6 ^ in0 ^ in2;
+ out4 = out0 ^ out6 ^ in0;
+ out5 = tmp0 ^ out4;
+ out3 = out5 ^ in7;
+ out1 = out3 ^ out6 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_3F(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in1;
+ out3 = tmp0 ^ in2 ^ in6;
+ tmp1 = out3 ^ in5 ^ in7;
+ out4 = tmp1 ^ in4;
+ out5 = tmp1 ^ in3;
+ out1 = out4 ^ in2;
+ out7 = out1 ^ out3 ^ in3;
+ out2 = tmp0 ^ out7 ^ in5;
+ tmp2 = out2 ^ in0;
+ out6 = tmp2 ^ in6;
+ out0 = tmp1 ^ tmp2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
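+/*
+ * Dispatch sketch (illustrative only; the table name and typedef below are
+ * assumptions, not definitions from this file): callers would normally pick
+ * one of these routines through a table indexed by the GF(2^8) multiplier
+ * rather than calling a specific gf8_muladd_<C> directly, e.g.
+ *
+ *     typedef void (*gf8_muladd_t)(uint8_t *out, uint8_t *in, unsigned int width);
+ *     // hypothetical table: gf8_muladd_table[c](out, in, width) => out = out * c + in
+ *     gf8_muladd_table[coeff](out, in, width);
+ */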
+static void gf8_muladd_40(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in3 ^ in7;
+ tmp0 = in3 ^ in4;
+ tmp1 = in6 ^ in7;
+ out4 = tmp0 ^ in2;
+ out5 = tmp0 ^ in5;
+ out0 = tmp1 ^ in2;
+ out7 = tmp1 ^ in1 ^ in5;
+ out2 = out0 ^ in4;
+ out3 = out2 ^ out5 ^ in7;
+ out6 = out3 ^ out4 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_41(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in2 ^ in3;
+ tmp0 = in5 ^ in6;
+ tmp1 = in6 ^ in7;
+ out5 = in3 ^ in4;
+ out1 = in1 ^ in3 ^ in7;
+ out6 = in0 ^ in4 ^ in5;
+ out3 = tmp0 ^ in2;
+ out7 = tmp0 ^ in1;
+ out2 = tmp1 ^ in4;
+ out0 = tmp1 ^ in0 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_42(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in2 ^ in6;
+ out5 = in3 ^ in5;
+ out1 = in0 ^ in3 ^ in7;
+ out7 = in1 ^ in5 ^ in7;
+ out4 = in2 ^ in4 ^ in7;
+ out6 = in0 ^ in4 ^ in6;
+ out2 = out0 ^ in1 ^ in4;
+ out3 = out5 ^ in6 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_43(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out5 = in3;
+ out7 = in1 ^ in5;
+ out4 = in2 ^ in7;
+ out6 = in0 ^ in4;
+ out0 = in0 ^ in2 ^ in6;
+ out3 = in5 ^ in6 ^ in7;
+ out2 = in1 ^ in4 ^ in6;
+ out1 = in0 ^ in1 ^ in3 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_44(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in3;
+ out0 = in2 ^ in7;
+ tmp0 = in4 ^ in7;
+ out7 = in1 ^ in6 ^ in7;
+ out6 = in0 ^ in5 ^ in6;
+ out4 = tmp0 ^ in3 ^ in6;
+ out3 = out0 ^ in1 ^ in3 ^ in5;
+ out2 = out0 ^ in0 ^ in4;
+ out5 = tmp0 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_45(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in1 ^ in3;
+ out7 = in1 ^ in6;
+ out5 = in4 ^ in7;
+ out6 = in0 ^ in5;
+ out0 = in0 ^ in2 ^ in7;
+ out4 = in3 ^ in6 ^ in7;
+ out2 = out5 ^ in0;
+ out3 = out0 ^ out6 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_46(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in2;
+ out1 = in0 ^ in3;
+ out7 = in1 ^ in7;
+ out4 = in4 ^ in6;
+ out5 = in5 ^ in7;
+ out6 = in0 ^ in6;
+ out3 = in1 ^ in3 ^ in5;
+ out2 = out4 ^ out6 ^ in1 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_47(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in6;
+ out7 = in1;
+ out5 = in7;
+ out6 = in0;
+ tmp0 = in0 ^ in1;
+ out3 = in1 ^ in5;
+ out0 = in0 ^ in2;
+ out1 = tmp0 ^ in3;
+ out2 = tmp0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_48(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in3;
+ out1 = in3 ^ in6 ^ in7;
+ out3 = tmp0 ^ in0;
+ out0 = tmp0 ^ out1 ^ in5;
+ tmp1 = out0 ^ in4;
+ out2 = tmp1 ^ in7;
+ out5 = tmp1 ^ in3;
+ out4 = out5 ^ in1;
+ out7 = tmp0 ^ out4;
+ out6 = tmp1 ^ out3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_49(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in0 ^ in2;
+ tmp0 = in2 ^ in5;
+ out2 = in4 ^ in5 ^ in6;
+ tmp1 = tmp0 ^ out2 ^ in3;
+ out7 = out2 ^ in1;
+ out5 = tmp1 ^ in7;
+ out4 = out5 ^ out7 ^ in6;
+ out1 = tmp0 ^ out4;
+ out6 = out1 ^ out7 ^ in0;
+ out0 = tmp1 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_4A(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in6;
+ tmp1 = in3 ^ in7;
+ out0 = tmp0 ^ in5;
+ out3 = tmp1 ^ in0;
+ out5 = tmp1 ^ out0;
+ out4 = out0 ^ in1 ^ in4;
+ out1 = out3 ^ in6;
+ out2 = out4 ^ in7;
+ out6 = out1 ^ in4;
+ out7 = tmp0 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_4B(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in0 ^ in7;
+ tmp0 = in1 ^ in5;
+ tmp1 = in2 ^ in6;
+ tmp2 = out3 ^ in3;
+ out7 = tmp0 ^ in4;
+ out4 = tmp0 ^ tmp1;
+ tmp3 = tmp1 ^ in0;
+ out6 = tmp2 ^ in4;
+ out5 = tmp2 ^ tmp3;
+ out1 = tmp2 ^ in1 ^ in6;
+ out2 = out7 ^ in6 ^ in7;
+ out0 = tmp3 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_4C(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in3 ^ in6;
+ tmp0 = in2 ^ in5;
+ tmp1 = out1 ^ in5 ^ in7;
+ out0 = tmp0 ^ in7;
+ tmp2 = tmp0 ^ in4;
+ out6 = tmp1 ^ in0;
+ out2 = tmp2 ^ in0;
+ out5 = tmp2 ^ in6;
+ out3 = tmp0 ^ out6 ^ in1;
+ out7 = out0 ^ out5 ^ in1;
+ out4 = tmp1 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_4D(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in5;
+ tmp1 = in1 ^ in6;
+ out4 = in1 ^ in3 ^ in5;
+ tmp2 = tmp0 ^ in7;
+ out2 = tmp0 ^ in4;
+ out1 = tmp1 ^ in3;
+ out7 = tmp1 ^ in4;
+ out0 = tmp2 ^ in2;
+ out6 = tmp2 ^ in3;
+ out5 = out7 ^ in1 ^ in2;
+ out3 = tmp1 ^ out0 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_4E(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in2 ^ in5;
+ out7 = in1 ^ in4 ^ in7;
+ out1 = in0 ^ in3 ^ in6;
+ out5 = out0 ^ in6;
+ out4 = out7 ^ in5;
+ out3 = out1 ^ in1;
+ out6 = out1 ^ in7;
+ out2 = out4 ^ in0 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_4F(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out5 = in2 ^ in6;
+ out7 = in1 ^ in4;
+ out3 = in0 ^ in1 ^ in6;
+ out4 = in1 ^ in5 ^ in7;
+ out0 = in0 ^ in2 ^ in5;
+ out6 = in0 ^ in3 ^ in7;
+ out1 = out3 ^ in3;
+ out2 = out4 ^ in0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_50(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in2 ^ in7;
+ tmp0 = in3 ^ in5;
+ out0 = out2 ^ in4 ^ in6;
+ out1 = tmp0 ^ in7;
+ tmp1 = tmp0 ^ in6;
+ out3 = out0 ^ in3;
+ out7 = tmp1 ^ in1;
+ tmp2 = tmp1 ^ in0;
+ out5 = out3 ^ in1 ^ in2;
+ out4 = tmp2 ^ in2;
+ out6 = tmp2 ^ out3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_51(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in7;
+ out3 = in2 ^ in4 ^ in6 ^ in7;
+ out0 = out3 ^ in0;
+ out6 = out0 ^ in5;
+ out4 = out6 ^ in3 ^ in7;
+ out1 = out0 ^ out4 ^ in1;
+ out7 = out1 ^ in6;
+ out5 = out7 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_52(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in1 ^ in2;
+ tmp0 = in2 ^ in4;
+ tmp1 = in3 ^ in5;
+ tmp2 = in3 ^ in6;
+ tmp3 = in0 ^ in7;
+ out0 = tmp0 ^ in6;
+ out6 = tmp0 ^ tmp3;
+ out7 = tmp1 ^ in1;
+ out1 = tmp1 ^ tmp3;
+ out3 = tmp2 ^ in4;
+ out5 = tmp2 ^ in1 ^ in7;
+ out4 = tmp2 ^ out1 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_53(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in1;
+ out3 = in4 ^ in6;
+ out0 = out3 ^ in0 ^ in2;
+ out6 = out0 ^ in7;
+ out4 = out6 ^ in5;
+ out7 = out0 ^ out4 ^ in1 ^ in3;
+ out1 = out7 ^ in0;
+ out5 = out7 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_54(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in3 ^ in5;
+ tmp0 = in1 ^ in3;
+ tmp1 = in2 ^ in4;
+ tmp2 = in0 ^ in7;
+ out5 = in1 ^ in4 ^ in6;
+ out4 = tmp2 ^ out1;
+ out7 = tmp0 ^ in6;
+ out3 = tmp0 ^ tmp1;
+ out0 = tmp1 ^ in7;
+ tmp3 = tmp2 ^ in2;
+ out2 = tmp3 ^ in6;
+ out6 = tmp3 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_55(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in3;
+ tmp1 = in1 ^ in4;
+ tmp2 = in6 ^ in7;
+ out7 = tmp0 ^ tmp2;
+ out1 = tmp0 ^ in5;
+ out3 = tmp1 ^ in2;
+ out5 = tmp1 ^ in5 ^ in6;
+ out2 = tmp2 ^ in0;
+ out4 = out5 ^ out7 ^ in0;
+ out6 = out2 ^ in2 ^ in5;
+ out0 = out5 ^ out6 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_56(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in2 ^ in4;
+ tmp0 = in0 ^ in2;
+ out4 = in0 ^ in5;
+ out7 = in1 ^ in3;
+ out5 = in1 ^ in6;
+ out6 = tmp0 ^ in7;
+ out2 = tmp0 ^ out5;
+ out1 = out4 ^ in3;
+ out3 = out7 ^ in4 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_57(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in5;
+ tmp1 = in1 ^ in7;
+ out0 = in0 ^ in2 ^ in4;
+ out5 = in1 ^ in5 ^ in6;
+ out4 = tmp0 ^ in4;
+ out1 = tmp0 ^ in1 ^ in3;
+ out2 = tmp0 ^ out5;
+ out3 = tmp1 ^ in4;
+ out7 = tmp1 ^ in3;
+ out6 = tmp1 ^ out2 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_58(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in2 ^ in5;
+ tmp0 = in2 ^ in3 ^ in4;
+ out5 = tmp0 ^ in1;
+ out6 = tmp0 ^ in0 ^ in5;
+ out3 = out6 ^ in7;
+ tmp1 = out2 ^ out5;
+ out7 = tmp1 ^ in6;
+ out4 = tmp1 ^ out3 ^ in3;
+ out0 = out4 ^ out7 ^ in0;
+ out1 = tmp0 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_59(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in5;
+ tmp0 = in0 ^ in5 ^ in7;
+ out3 = tmp0 ^ in2 ^ in4;
+ out0 = out3 ^ in6;
+ tmp1 = out0 ^ in7;
+ out6 = tmp1 ^ in3;
+ out5 = out6 ^ in0 ^ in1 ^ in6;
+ out4 = tmp0 ^ out5;
+ out1 = tmp1 ^ out4;
+ out7 = out1 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_5A(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = in2 ^ in5;
+ out5 = tmp0 ^ in3;
+ out4 = tmp0 ^ in0;
+ tmp2 = tmp1 ^ in4;
+ out2 = tmp1 ^ in1 ^ in7;
+ out7 = tmp2 ^ out5;
+ out6 = out4 ^ out7 ^ in5;
+ out0 = tmp2 ^ in6;
+ out1 = out0 ^ out6 ^ in7;
+ out3 = tmp1 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_5B(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in3;
+ tmp1 = in0 ^ in4;
+ tmp2 = in1 ^ in5;
+ out5 = tmp0 ^ tmp2;
+ tmp3 = tmp1 ^ in6;
+ out3 = tmp1 ^ in5;
+ out2 = tmp2 ^ in7;
+ tmp4 = out3 ^ in2;
+ out7 = out2 ^ in3 ^ in4;
+ out0 = tmp4 ^ in6;
+ out6 = tmp0 ^ tmp3;
+ out4 = tmp2 ^ tmp4;
+ out1 = tmp3 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_5C(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in6;
+ tmp1 = in0 ^ in2 ^ in5;
+ out1 = tmp0 ^ in5;
+ tmp2 = tmp0 ^ in1;
+ out2 = tmp1 ^ in6;
+ out6 = tmp1 ^ in3;
+ out4 = tmp2 ^ in0;
+ out7 = tmp2 ^ in4;
+ out3 = tmp1 ^ out7;
+ out0 = out3 ^ out4 ^ in7;
+ out5 = out0 ^ in1 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_5D(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in0 ^ in6;
+ out2 = tmp1 ^ in5;
+ tmp2 = out2 ^ in3;
+ out6 = tmp2 ^ in2;
+ out1 = tmp0 ^ tmp2;
+ tmp3 = out1 ^ in4 ^ in5;
+ out4 = tmp3 ^ in0;
+ out7 = tmp3 ^ in7;
+ tmp4 = out4 ^ out6;
+ out5 = tmp4 ^ in7;
+ out0 = tmp0 ^ out5;
+ out3 = tmp1 ^ tmp4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_5E(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in5;
+ tmp1 = in3 ^ in5;
+ tmp2 = in1 ^ in7;
+ out7 = in1 ^ in3 ^ in4;
+ out0 = tmp0 ^ in4;
+ tmp3 = tmp1 ^ in0;
+ out5 = tmp2 ^ in2;
+ out1 = tmp3 ^ in6;
+ out6 = tmp0 ^ tmp3;
+ tmp4 = tmp2 ^ out1;
+ out3 = tmp4 ^ in4;
+ out4 = tmp1 ^ tmp4;
+ out2 = tmp0 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_5F(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in5;
+ tmp1 = in0 ^ in6;
+ tmp2 = tmp0 ^ in7;
+ tmp3 = tmp1 ^ in3;
+ out2 = tmp1 ^ tmp2;
+ out5 = tmp2 ^ in2;
+ out6 = tmp3 ^ in2;
+ out3 = out2 ^ in4;
+ out4 = out3 ^ in5;
+ out1 = tmp0 ^ tmp3;
+ out7 = tmp3 ^ out4;
+ out0 = out4 ^ out5 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_60(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in2 ^ in5;
+ tmp0 = in3 ^ in6;
+ out1 = in3 ^ in4 ^ in7;
+ out7 = out4 ^ in1;
+ tmp1 = out4 ^ in4;
+ out0 = tmp0 ^ in2;
+ out5 = tmp0 ^ in0;
+ out2 = tmp0 ^ tmp1;
+ out3 = tmp1 ^ in7;
+ out6 = out3 ^ out7 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_61(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in5;
+ out4 = tmp0 ^ in4;
+ tmp1 = out4 ^ in3;
+ out3 = tmp1 ^ in7;
+ out2 = tmp1 ^ in2 ^ in6;
+ out1 = tmp0 ^ out3 ^ in1;
+ out0 = out2 ^ out4 ^ in0;
+ out7 = tmp1 ^ out1;
+ out6 = out0 ^ out1 ^ in2;
+ out5 = tmp0 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_62(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in4 ^ in5;
+ tmp0 = in0 ^ in3 ^ in4;
+ out1 = tmp0 ^ in7;
+ out5 = tmp0 ^ in6;
+ tmp1 = out1 ^ in0;
+ tmp2 = tmp1 ^ out3;
+ out4 = tmp2 ^ in2;
+ tmp3 = tmp2 ^ in1;
+ out0 = out4 ^ in5 ^ in6;
+ out7 = tmp3 ^ out0;
+ out6 = tmp0 ^ tmp3;
+ out2 = tmp1 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_63(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in4;
+ tmp1 = in1 ^ in7;
+ out3 = tmp0 ^ in5;
+ tmp2 = out3 ^ in6;
+ out4 = out3 ^ in2 ^ in7;
+ out5 = tmp2 ^ in0;
+ tmp3 = out5 ^ in3;
+ out0 = tmp3 ^ out4;
+ out2 = tmp1 ^ tmp2;
+ out6 = tmp1 ^ tmp3;
+ tmp4 = tmp0 ^ out2;
+ out1 = tmp4 ^ out5;
+ out7 = tmp4 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_64(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in2 ^ in3;
+ out1 = in3 ^ in4;
+ out7 = in1 ^ in2;
+ tmp0 = in4 ^ in5;
+ tmp1 = in0 ^ in7;
+ out4 = in5 ^ in6 ^ in7;
+ out2 = tmp0 ^ out0 ^ in0;
+ out3 = tmp0 ^ out7 ^ in6;
+ out5 = tmp1 ^ in6;
+ out6 = tmp1 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_65(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = in4 ^ in5;
+ tmp2 = in6 ^ in7;
+ out7 = in1 ^ in2 ^ in7;
+ out1 = in1 ^ in3 ^ in4;
+ out0 = tmp0 ^ in2;
+ out2 = tmp0 ^ tmp1;
+ out4 = tmp1 ^ tmp2;
+ tmp3 = tmp2 ^ in0;
+ out3 = out4 ^ out7 ^ in3;
+ out5 = tmp3 ^ in5;
+ out6 = tmp3 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_66(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = in2 ^ in3;
+ tmp2 = in0 ^ in4;
+ out7 = tmp0 ^ in6;
+ out0 = tmp1 ^ in7;
+ out1 = tmp2 ^ in3;
+ tmp3 = tmp2 ^ in6;
+ tmp4 = out1 ^ in5;
+ out5 = tmp3 ^ in7;
+ out4 = tmp3 ^ tmp4;
+ out2 = tmp0 ^ tmp4 ^ in7;
+ out6 = tmp1 ^ out2 ^ in4;
+ out3 = tmp3 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_67(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = tmp0 ^ in1;
+ tmp2 = tmp0 ^ in7;
+ out1 = tmp1 ^ in4;
+ out0 = tmp2 ^ in2;
+ tmp3 = out1 ^ in7;
+ out2 = tmp3 ^ in5;
+ out3 = out2 ^ in0 ^ in6;
+ out7 = tmp1 ^ out0 ^ in6;
+ out5 = tmp1 ^ out3;
+ out4 = tmp2 ^ out5;
+ out6 = tmp3 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_68(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in4;
+ tmp1 = in2 ^ in3 ^ in5;
+ tmp2 = tmp0 ^ in1;
+ tmp3 = tmp0 ^ in6;
+ out0 = tmp1 ^ in6;
+ out6 = tmp2 ^ in0;
+ out7 = tmp1 ^ tmp2;
+ out1 = tmp3 ^ in7;
+ out2 = out1 ^ in2;
+ out4 = tmp2 ^ out2;
+ out3 = out4 ^ out6 ^ in3;
+ out5 = tmp3 ^ out3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_69(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in6 ^ in7;
+ out2 = tmp0 ^ in3 ^ in4;
+ out1 = out2 ^ in1;
+ out3 = out2 ^ in0 ^ in2;
+ out4 = out1 ^ in2 ^ in3;
+ out6 = out1 ^ in0 ^ in7;
+ out7 = out4 ^ in5 ^ in6;
+ out5 = out4 ^ out6 ^ in5;
+ out0 = tmp0 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_6A(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in6;
+ out3 = in0 ^ in4 ^ in6;
+ tmp1 = tmp0 ^ in3;
+ out4 = tmp1 ^ in1;
+ tmp2 = tmp1 ^ in7;
+ out2 = out4 ^ in4;
+ out0 = tmp2 ^ in5;
+ out5 = tmp2 ^ out3;
+ out7 = out2 ^ in3 ^ in5;
+ out1 = tmp0 ^ out5;
+ out6 = tmp1 ^ out7 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_6B(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in4 ^ in6;
+ out2 = tmp0 ^ in1 ^ in3;
+ out4 = out2 ^ in2;
+ tmp1 = out2 ^ in0;
+ out7 = out4 ^ in3 ^ in5 ^ in7;
+ out1 = tmp1 ^ in7;
+ out3 = tmp1 ^ in1;
+ out6 = tmp1 ^ in5;
+ out0 = tmp1 ^ out7 ^ in6;
+ out5 = tmp0 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_6C(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in1;
+ tmp0 = in2 ^ in3;
+ out5 = in0 ^ in2;
+ out1 = in3 ^ in4 ^ in6;
+ tmp1 = out5 ^ in1;
+ out0 = tmp0 ^ in5;
+ out6 = tmp0 ^ tmp1;
+ out3 = tmp1 ^ in4;
+ out7 = out3 ^ in0;
+ out2 = out6 ^ out7 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_6D(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in1 ^ in4;
+ tmp0 = in0 ^ in2;
+ tmp1 = out4 ^ in3;
+ out7 = out4 ^ in2 ^ in7;
+ out5 = tmp0 ^ in5;
+ out3 = tmp0 ^ tmp1;
+ out1 = tmp1 ^ in6;
+ out0 = out5 ^ in3;
+ out2 = out3 ^ out7 ^ in4;
+ out6 = out1 ^ in0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_6E(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in3;
+ tmp1 = in0 ^ in4;
+ out4 = tmp0 ^ in7;
+ out6 = tmp0 ^ in0 ^ in5;
+ out5 = tmp1 ^ in2;
+ tmp2 = tmp1 ^ in3;
+ out3 = tmp2 ^ out4;
+ out1 = tmp2 ^ in6;
+ out2 = tmp0 ^ out5;
+ out0 = out2 ^ out3 ^ in5;
+ out7 = out1 ^ out2 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_6F(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in7;
+ tmp1 = tmp0 ^ in4;
+ tmp2 = tmp0 ^ in0 ^ in2;
+ out4 = tmp1 ^ in1;
+ out0 = tmp2 ^ in5;
+ out3 = out4 ^ in0;
+ out2 = out3 ^ in7;
+ out1 = out2 ^ in6;
+ out6 = out1 ^ in4 ^ in5;
+ out7 = tmp2 ^ out1;
+ out5 = tmp1 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_70(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in2;
+ tmp0 = in2 ^ in4;
+ out2 = in2 ^ in3 ^ in5;
+ tmp1 = tmp0 ^ in6;
+ tmp2 = out2 ^ in7;
+ out0 = tmp1 ^ in3;
+ out4 = tmp1 ^ in0;
+ out7 = tmp2 ^ in1;
+ out6 = out4 ^ in1;
+ out5 = out7 ^ in0 ^ in2;
+ out1 = tmp0 ^ tmp2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_71(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in3 ^ in5;
+ out3 = in2 ^ in3;
+ tmp0 = in0 ^ in2;
+ tmp1 = out2 ^ in1;
+ out4 = tmp0 ^ in6;
+ tmp2 = tmp0 ^ in1;
+ out7 = tmp1 ^ in2;
+ out1 = tmp1 ^ in4 ^ in7;
+ out0 = out4 ^ in3 ^ in4;
+ out6 = tmp2 ^ in4;
+ out5 = tmp2 ^ out3 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_72(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in7;
+ tmp0 = in0 ^ in4;
+ tmp1 = tmp0 ^ in3 ^ in7;
+ out1 = tmp1 ^ in5;
+ out5 = out1 ^ in1;
+ tmp2 = tmp0 ^ out5;
+ out2 = tmp2 ^ in2;
+ out7 = out2 ^ in6;
+ out6 = tmp1 ^ out7;
+ out4 = tmp2 ^ out6;
+ out0 = out4 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_73(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in3 ^ in7;
+ out2 = out3 ^ in1 ^ in5;
+ out1 = out2 ^ in0 ^ in4;
+ out5 = out1 ^ in5;
+ out6 = out1 ^ out3 ^ in2;
+ out0 = out2 ^ out6 ^ in6;
+ out7 = out0 ^ out1 ^ in3;
+ out4 = out0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_74(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in4;
+ tmp1 = in1 ^ in2 ^ in6;
+ out4 = in0 ^ in4 ^ in7;
+ out5 = in0 ^ in1 ^ in5;
+ out0 = tmp0 ^ in2;
+ out1 = tmp0 ^ in5;
+ out3 = tmp1 ^ in7;
+ out6 = tmp1 ^ in0;
+ out2 = tmp1 ^ out5 ^ in3;
+ out7 = out3 ^ in3 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_75(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in0 ^ in7;
+ tmp0 = in1 ^ in3;
+ out5 = in0 ^ in1;
+ out7 = tmp0 ^ in2;
+ tmp1 = tmp0 ^ in4;
+ out6 = out5 ^ in2;
+ tmp2 = out7 ^ in6;
+ out1 = tmp1 ^ in5;
+ out0 = tmp1 ^ out6;
+ out3 = tmp2 ^ in7;
+ out2 = tmp2 ^ out6 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_76(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in1 ^ in6;
+ tmp0 = in0 ^ in5;
+ tmp1 = in3 ^ in7;
+ tmp2 = tmp0 ^ in4;
+ tmp3 = tmp1 ^ in2;
+ out5 = tmp2 ^ in1;
+ out1 = tmp2 ^ in3;
+ out0 = tmp3 ^ in4;
+ out4 = out1 ^ in5;
+ out7 = tmp3 ^ out3;
+ out2 = tmp0 ^ out7;
+ out6 = tmp1 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_77(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in0 ^ in3;
+ tmp0 = in1 ^ in4;
+ tmp1 = in1 ^ in6;
+ tmp2 = out4 ^ in5;
+ out5 = tmp0 ^ in0;
+ out1 = tmp0 ^ tmp2;
+ out3 = tmp1 ^ in3;
+ out2 = tmp1 ^ tmp2 ^ in7;
+ out7 = out3 ^ in2;
+ tmp3 = out7 ^ in6;
+ out6 = tmp2 ^ tmp3;
+ out0 = tmp3 ^ out5 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_78(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = in2 ^ in7;
+ tmp2 = in0 ^ in5 ^ in6;
+ out2 = tmp1 ^ in3;
+ out3 = tmp2 ^ in2;
+ out5 = out3 ^ in1 ^ in3;
+ out0 = tmp0 ^ out3 ^ in4;
+ out1 = tmp1 ^ out0;
+ out4 = out1 ^ out5 ^ in5;
+ out7 = tmp0 ^ out4;
+ out6 = tmp2 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_79(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in3 ^ in7;
+ tmp0 = in3 ^ in4;
+ tmp1 = in1 ^ in5;
+ tmp2 = tmp1 ^ in2;
+ out4 = tmp2 ^ in0 ^ in7;
+ tmp3 = out4 ^ in5;
+ out5 = tmp3 ^ out2 ^ in6;
+ out7 = tmp0 ^ tmp2;
+ out6 = tmp0 ^ tmp3;
+ out3 = tmp1 ^ out5;
+ out0 = out3 ^ in4;
+ out1 = tmp3 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_7A(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in2;
+ out2 = tmp0 ^ in3;
+ tmp1 = out2 ^ in4;
+ out4 = tmp1 ^ in0 ^ in5;
+ out5 = out4 ^ in6;
+ out6 = out5 ^ in7;
+ out7 = out6 ^ in0;
+ out0 = out7 ^ in1;
+ out1 = tmp0 ^ out6;
+ out3 = tmp1 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_7B(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in1 ^ in3;
+ tmp0 = in0 ^ in5;
+ out4 = tmp0 ^ out2 ^ in2;
+ tmp1 = out4 ^ in4;
+ out6 = tmp1 ^ in7;
+ out5 = tmp1 ^ in5 ^ in6;
+ out0 = out6 ^ in1 ^ in6;
+ tmp2 = out0 ^ in2;
+ out1 = tmp2 ^ in1;
+ out3 = tmp2 ^ in4;
+ out7 = tmp0 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_7C(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in5;
+ tmp1 = tmp0 ^ in4;
+ out0 = tmp1 ^ in2;
+ out1 = tmp1 ^ in6;
+ out7 = out0 ^ in1 ^ in5 ^ in7;
+ out5 = out1 ^ out7 ^ in0;
+ out3 = out5 ^ in6;
+ out6 = tmp0 ^ out5;
+ out2 = out6 ^ in1;
+ out4 = out2 ^ out7 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_7D(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = tmp0 ^ in3;
+ tmp2 = tmp0 ^ in6;
+ out7 = tmp1 ^ in4;
+ tmp3 = tmp2 ^ in0;
+ out5 = tmp3 ^ in7;
+ out4 = tmp3 ^ in2 ^ in5;
+ out2 = tmp1 ^ out5;
+ out6 = tmp2 ^ out2;
+ out0 = out4 ^ out7 ^ in6;
+ out1 = tmp3 ^ out0;
+ out3 = out6 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_7E(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in4;
+ tmp1 = in0 ^ in5;
+ out1 = tmp0 ^ tmp1 ^ in6;
+ out3 = tmp1 ^ in1;
+ out4 = out1 ^ in1 ^ in7;
+ tmp2 = out4 ^ in3;
+ out5 = tmp2 ^ in2;
+ out6 = tmp0 ^ out5;
+ out7 = tmp1 ^ out4 ^ in2;
+ out2 = out6 ^ in5 ^ in7;
+ out0 = tmp2 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_7F(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in7;
+ tmp1 = tmp0 ^ in3 ^ in5;
+ tmp2 = tmp1 ^ in0;
+ out0 = tmp2 ^ in4;
+ out6 = tmp2 ^ in1;
+ out3 = tmp0 ^ out6;
+ tmp3 = out3 ^ in6;
+ out1 = tmp3 ^ in4;
+ out2 = tmp3 ^ in5;
+ out4 = tmp3 ^ in7;
+ out5 = tmp1 ^ out1;
+ out7 = out0 ^ out4 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_80(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in3;
+ tmp1 = in4 ^ in5;
+ out1 = in2 ^ in6 ^ in7;
+ out5 = tmp0 ^ in4;
+ tmp2 = tmp0 ^ in1;
+ out6 = tmp1 ^ in3;
+ out7 = tmp1 ^ in0 ^ in6;
+ out4 = tmp2 ^ in7;
+ out3 = tmp2 ^ out6;
+ out2 = out3 ^ out5 ^ in6;
+ out0 = out2 ^ in3 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_81(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in4 ^ in6;
+ tmp1 = tmp0 ^ in3;
+ out6 = tmp1 ^ in5;
+ out5 = out6 ^ in2 ^ in6;
+ out3 = out5 ^ in1;
+ out2 = tmp0 ^ out3;
+ out1 = out3 ^ out6 ^ in7;
+ out4 = tmp1 ^ out1;
+ out7 = out2 ^ out4 ^ in0;
+ out0 = out7 ^ in1 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_82(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in1 ^ in2;
+ tmp0 = in6 ^ in7;
+ out5 = in2 ^ in3;
+ out6 = in3 ^ in4;
+ out7 = in0 ^ in4 ^ in5;
+ out0 = in1 ^ in5 ^ in6;
+ out1 = tmp0 ^ in0 ^ in2;
+ out2 = tmp0 ^ in3 ^ in5;
+ out3 = tmp0 ^ out0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_83(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in2 ^ in5;
+ tmp2 = in3 ^ in6;
+ out4 = in1 ^ in2 ^ in4;
+ out0 = tmp0 ^ in5 ^ in6;
+ out5 = tmp1 ^ in3;
+ tmp3 = tmp1 ^ in7;
+ out6 = tmp2 ^ in4;
+ out2 = tmp2 ^ tmp3;
+ tmp4 = tmp3 ^ out4;
+ out1 = tmp3 ^ out0;
+ out3 = tmp4 ^ in3;
+ out7 = tmp0 ^ tmp4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_84(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in2 ^ in6;
+ out6 = in3 ^ in5;
+ out0 = in1 ^ in5 ^ in7;
+ out7 = in0 ^ in4 ^ in6;
+ out4 = in1 ^ in3 ^ in6;
+ out5 = in2 ^ in4 ^ in7;
+ out2 = out6 ^ in0 ^ in1;
+ out3 = out5 ^ in5 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_85(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in6;
+ tmp1 = in3 ^ in6;
+ tmp2 = tmp0 ^ in4;
+ out1 = tmp0 ^ in2;
+ out6 = tmp1 ^ in5;
+ out4 = tmp2 ^ in3;
+ tmp3 = out1 ^ out6;
+ out2 = tmp3 ^ in0;
+ out3 = tmp2 ^ tmp3 ^ in7;
+ out7 = out2 ^ out3 ^ in1;
+ out5 = tmp1 ^ out3;
+ out0 = tmp2 ^ out7 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_86(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out6 = in3;
+ out7 = in0 ^ in4;
+ out0 = in1 ^ in5;
+ out5 = in2 ^ in7;
+ out3 = in4 ^ in5 ^ in6;
+ out1 = in0 ^ in2 ^ in6;
+ out4 = in1 ^ in6 ^ in7;
+ out2 = in0 ^ in3 ^ in5 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_87(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out6 = in3 ^ in6;
+ tmp0 = in0 ^ in1;
+ out7 = in0 ^ in4 ^ in7;
+ out5 = in2 ^ in5 ^ in7;
+ out3 = out6 ^ in4 ^ in5;
+ out0 = tmp0 ^ in5;
+ tmp1 = tmp0 ^ in6;
+ out2 = out5 ^ in0 ^ in3;
+ out1 = tmp1 ^ in2;
+ out4 = tmp1 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_88(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in2 ^ in7;
+ tmp0 = in5 ^ in6;
+ out0 = in1 ^ in6 ^ in7;
+ out6 = in4 ^ in5 ^ in7;
+ out3 = out0 ^ out1 ^ in0 ^ in4;
+ out7 = tmp0 ^ in0;
+ tmp1 = tmp0 ^ in3;
+ out2 = out0 ^ in3;
+ out4 = tmp1 ^ in2;
+ out5 = tmp1 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_89(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in7;
+ tmp1 = in2 ^ in7;
+ tmp2 = tmp0 ^ in6;
+ out1 = tmp1 ^ in1;
+ out7 = tmp2 ^ in5;
+ out0 = tmp2 ^ in1;
+ out2 = out1 ^ in3 ^ in6;
+ out6 = out7 ^ in0 ^ in4;
+ out5 = out6 ^ in3;
+ out3 = tmp0 ^ out2 ^ in4;
+ out4 = tmp1 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_8A(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in1 ^ in6;
+ out7 = in0 ^ in5;
+ out2 = in3 ^ in6;
+ out6 = in4 ^ in7;
+ out1 = in0 ^ in2 ^ in7;
+ out3 = out0 ^ out6 ^ in0;
+ out4 = out1 ^ out7 ^ in6;
+ out5 = out2 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_8B(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in3 ^ in6;
+ tmp2 = in5 ^ in7;
+ tmp3 = tmp0 ^ in7;
+ out0 = tmp0 ^ in6;
+ out2 = tmp1 ^ in2;
+ out5 = tmp1 ^ tmp2;
+ out7 = tmp2 ^ in0;
+ tmp4 = tmp3 ^ in4;
+ out1 = tmp3 ^ in2;
+ out6 = tmp4 ^ out0;
+ out4 = out6 ^ in2 ^ in5;
+ out3 = tmp1 ^ tmp4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_8C(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in2;
+ out0 = in1 ^ in7;
+ out7 = in0 ^ in6;
+ out5 = in4 ^ in6;
+ out6 = in5 ^ in7;
+ out2 = out0 ^ in0 ^ in3;
+ out3 = out5 ^ out7 ^ in2 ^ in7;
+ out4 = out6 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_8D(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in1 ^ in2;
+ tmp0 = in6 ^ in7;
+ out0 = in0 ^ in1 ^ in7;
+ out5 = in4 ^ in5 ^ in6;
+ out6 = tmp0 ^ in5;
+ out7 = tmp0 ^ in0;
+ out4 = tmp0 ^ out5 ^ in3;
+ out2 = out0 ^ in2 ^ in3;
+ out3 = out2 ^ in1 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_8E(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in1;
+ out4 = in5;
+ out7 = in0;
+ out5 = in6;
+ out6 = in7;
+ out3 = in0 ^ in4;
+ out1 = in0 ^ in2;
+ out2 = in0 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_8F(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in0 ^ in1;
+ tmp0 = in0 ^ in3;
+ out4 = in4 ^ in5;
+ out7 = in0 ^ in7;
+ out5 = in5 ^ in6;
+ out6 = in6 ^ in7;
+ out1 = out0 ^ in2;
+ out2 = tmp0 ^ in2;
+ out3 = tmp0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_90(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = in2 ^ in6 ^ in7;
+ out3 = tmp0 ^ in7;
+ out1 = tmp1 ^ in5;
+ tmp2 = out1 ^ in4;
+ out6 = tmp2 ^ in3;
+ out5 = out6 ^ in1;
+ out4 = out5 ^ in0;
+ out0 = tmp0 ^ tmp2;
+ out7 = tmp0 ^ out4;
+ out2 = tmp1 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_91(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in4;
+ tmp1 = tmp0 ^ in3 ^ in5;
+ out2 = tmp1 ^ in1;
+ out6 = tmp1 ^ in7;
+ tmp2 = out2 ^ in5 ^ in7;
+ out3 = tmp2 ^ in4;
+ out5 = tmp2 ^ in6;
+ out1 = tmp1 ^ out5 ^ in2;
+ tmp3 = out1 ^ in0;
+ out4 = tmp3 ^ in3;
+ out0 = tmp0 ^ tmp3;
+ out7 = tmp2 ^ tmp3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_92(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in1;
+ tmp0 = in4 ^ in5;
+ tmp1 = tmp0 ^ in1;
+ out2 = tmp0 ^ in3 ^ in7;
+ out0 = tmp1 ^ in6;
+ out7 = out2 ^ in0;
+ out4 = out0 ^ in0 ^ in2;
+ out5 = out4 ^ out7 ^ in5;
+ out6 = tmp1 ^ out5;
+ out1 = out6 ^ out7 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_93(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in1 ^ in3;
+ tmp0 = in2 ^ in7;
+ tmp1 = out3 ^ in6;
+ tmp2 = tmp0 ^ in4;
+ out5 = tmp0 ^ tmp1;
+ out6 = tmp2 ^ in3;
+ out2 = out6 ^ in5;
+ out0 = out2 ^ out5 ^ in0;
+ out7 = tmp1 ^ out0;
+ out1 = tmp2 ^ out0;
+ out4 = out1 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_94(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in2 ^ in6;
+ tmp0 = in1 ^ in4 ^ in5;
+ out1 = out3 ^ in5;
+ out5 = tmp0 ^ out3;
+ out0 = tmp0 ^ in7;
+ out4 = tmp0 ^ in0 ^ in3;
+ out6 = out1 ^ in3 ^ in7;
+ out2 = out4 ^ in6;
+ out7 = out0 ^ out2 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_95(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in3;
+ out3 = tmp0 ^ in6;
+ tmp1 = tmp0 ^ in7;
+ tmp2 = out3 ^ in0;
+ out6 = tmp1 ^ in5;
+ tmp3 = tmp2 ^ in4;
+ out7 = tmp3 ^ in2;
+ tmp4 = tmp3 ^ in5;
+ out2 = tmp4 ^ in1;
+ tmp5 = out2 ^ in6;
+ out0 = tmp1 ^ tmp5;
+ out1 = tmp5 ^ out7;
+ out4 = tmp2 ^ out1;
+ out5 = tmp4 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_96(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in6 ^ in7;
+ tmp0 = in1 ^ in5;
+ tmp1 = in5 ^ in6;
+ out6 = out3 ^ in2 ^ in3;
+ out0 = tmp0 ^ in4;
+ tmp2 = tmp1 ^ in2;
+ out4 = out0 ^ in0 ^ in7;
+ out1 = tmp2 ^ in0;
+ out5 = tmp2 ^ in1;
+ out7 = tmp0 ^ out4 ^ in3;
+ out2 = tmp1 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_97(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in4;
+ tmp1 = in2 ^ in6;
+ out3 = in3 ^ in6 ^ in7;
+ out7 = tmp0 ^ in3;
+ tmp2 = tmp0 ^ in5;
+ out5 = tmp1 ^ in1;
+ out6 = tmp1 ^ out3;
+ out0 = tmp2 ^ in1;
+ out2 = tmp2 ^ out3 ^ in2;
+ tmp3 = out0 ^ in4;
+ out4 = tmp3 ^ in7;
+ out1 = tmp1 ^ tmp3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_98(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in5 ^ in7;
+ tmp1 = in1 ^ in4 ^ in7;
+ out1 = tmp0 ^ in2;
+ out0 = tmp1 ^ in6;
+ out2 = tmp1 ^ in3;
+ out6 = out0 ^ out1 ^ in1;
+ out5 = tmp0 ^ out2;
+ out3 = tmp1 ^ out6 ^ in0;
+ out7 = out0 ^ out5 ^ in0;
+ out4 = out6 ^ out7 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_99(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in3;
+ out5 = in1 ^ in3 ^ in4;
+ out6 = in2 ^ in4 ^ in5;
+ out4 = tmp0 ^ in2;
+ tmp1 = tmp0 ^ in6;
+ tmp2 = out5 ^ in7;
+ out7 = tmp1 ^ in5;
+ out0 = tmp1 ^ tmp2;
+ out2 = tmp2 ^ in2;
+ out3 = out0 ^ out6 ^ in3;
+ out1 = tmp1 ^ out3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_9A(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in3 ^ in4;
+ tmp0 = in0 ^ in5;
+ tmp1 = in1 ^ in6;
+ out5 = in1 ^ in3 ^ in5;
+ tmp2 = tmp0 ^ in7;
+ out3 = tmp0 ^ tmp1;
+ out0 = tmp1 ^ in4;
+ out7 = tmp2 ^ in3;
+ out1 = tmp2 ^ in2;
+ out6 = out0 ^ in1 ^ in2;
+ out4 = out1 ^ in4 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_9B(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out5 = in1 ^ in3;
+ tmp0 = in3 ^ in5;
+ out6 = in2 ^ in4;
+ out4 = in0 ^ in2 ^ in7;
+ out7 = tmp0 ^ in0;
+ out2 = out6 ^ in3;
+ out1 = out4 ^ in1 ^ in5;
+ out3 = out7 ^ in1 ^ in6;
+ out0 = tmp0 ^ out3 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_9C(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out1 = in2 ^ in5;
+ tmp0 = in0 ^ in3 ^ in6;
+ out3 = out1 ^ in0;
+ out6 = out1 ^ in6;
+ out7 = tmp0 ^ in7;
+ out4 = out7 ^ in4;
+ out2 = out4 ^ in1;
+ out0 = tmp0 ^ out2;
+ out5 = out0 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_9D(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out6 = in2 ^ in5;
+ tmp0 = in0 ^ in3;
+ out5 = in1 ^ in4 ^ in7;
+ out1 = out6 ^ in1;
+ out3 = tmp0 ^ out6;
+ out7 = tmp0 ^ in6;
+ out0 = out5 ^ in0;
+ out4 = out7 ^ in7;
+ out2 = out5 ^ out7 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_9E(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in1 ^ in4;
+ tmp0 = in0 ^ in5;
+ out6 = in2 ^ in6;
+ out7 = in0 ^ in3 ^ in7;
+ out4 = in0 ^ in4 ^ in6;
+ out5 = in1 ^ in5 ^ in7;
+ out1 = tmp0 ^ in2;
+ out3 = tmp0 ^ in7;
+ out2 = out4 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_9F(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out6 = in2;
+ out7 = in0 ^ in3;
+ tmp0 = in0 ^ in1;
+ out4 = in0 ^ in6;
+ out5 = in1 ^ in7;
+ out1 = tmp0 ^ in2 ^ in5;
+ out2 = out7 ^ in2 ^ in4 ^ in6;
+ out3 = out7 ^ in5 ^ in7;
+ out0 = tmp0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_A0(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in6;
+ out2 = tmp0 ^ in7;
+ tmp1 = tmp0 ^ in5;
+ out6 = out2 ^ in3 ^ in4;
+ out0 = tmp1 ^ in3;
+ tmp2 = out0 ^ in2;
+ out3 = tmp2 ^ in7;
+ tmp3 = tmp2 ^ in1;
+ out5 = tmp3 ^ in0;
+ out4 = tmp3 ^ out6;
+ out7 = out5 ^ out6 ^ in1;
+ out1 = tmp1 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_A1(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in5;
+ tmp1 = tmp0 ^ in1;
+ tmp2 = tmp0 ^ in4;
+ out4 = tmp1 ^ in7;
+ out7 = tmp2 ^ in0;
+ out6 = tmp2 ^ out4 ^ in3;
+ out3 = out4 ^ in6;
+ out2 = out3 ^ in5;
+ out1 = out2 ^ in4;
+ out5 = out1 ^ out6 ^ in0;
+ out0 = tmp1 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_A2(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in6;
+ tmp0 = in1 ^ in3 ^ in5;
+ out3 = tmp0 ^ in6;
+ out4 = tmp0 ^ in2 ^ in4;
+ out0 = out3 ^ in7;
+ out6 = out0 ^ in4;
+ out1 = out0 ^ out4 ^ in0;
+ out7 = out1 ^ in5;
+ out5 = out7 ^ in3 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_A3(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in2 ^ in6;
+ out3 = in1 ^ in5 ^ in6;
+ tmp0 = out2 ^ in0;
+ out4 = out2 ^ out3 ^ in3;
+ tmp1 = tmp0 ^ in4;
+ out0 = tmp0 ^ out4 ^ in7;
+ out5 = tmp1 ^ in3;
+ out7 = tmp1 ^ in5;
+ out1 = tmp1 ^ in1 ^ in7;
+ out6 = tmp1 ^ out0 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_A4(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in3;
+ tmp1 = in2 ^ in4;
+ tmp2 = in2 ^ in5;
+ tmp3 = in0 ^ in7;
+ out0 = tmp0 ^ in5;
+ out6 = tmp0 ^ in6 ^ in7;
+ out1 = tmp1 ^ in6;
+ out7 = tmp1 ^ tmp3;
+ out3 = tmp2 ^ in3;
+ tmp4 = tmp2 ^ out1;
+ out2 = tmp3 ^ in1;
+ out5 = tmp4 ^ out7;
+ out4 = tmp4 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_A5(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in2 ^ in5;
+ tmp0 = in1 ^ in6;
+ tmp1 = in0 ^ in1;
+ tmp2 = in2 ^ in4;
+ out6 = in1 ^ in3 ^ in7;
+ out4 = tmp0 ^ in5;
+ out1 = tmp0 ^ tmp2;
+ out0 = tmp1 ^ in3 ^ in5;
+ out2 = tmp1 ^ in2 ^ in7;
+ out7 = tmp2 ^ in0;
+ out5 = tmp0 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_A6(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in0;
+ out3 = in3 ^ in5 ^ in7;
+ out1 = in0 ^ in2 ^ in4 ^ in6;
+ out0 = out3 ^ in1;
+ out7 = out1 ^ in7;
+ out6 = out0 ^ in6;
+ out5 = out7 ^ in5;
+ out4 = out6 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_A7(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in0 ^ in2;
+ out3 = in5 ^ in7;
+ out7 = out2 ^ in4 ^ in6;
+ out6 = out3 ^ in1 ^ in3;
+ out1 = out7 ^ in1;
+ out5 = out7 ^ in7;
+ out0 = out6 ^ in0;
+ out4 = out6 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_A8(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in4;
+ tmp1 = in1 ^ in6;
+ tmp2 = in0 ^ in2 ^ in7;
+ out1 = tmp0 ^ in7;
+ out4 = tmp0 ^ in6;
+ out0 = tmp1 ^ in3;
+ out2 = tmp1 ^ in5;
+ out6 = tmp1 ^ in4;
+ out7 = tmp2 ^ in5;
+ out3 = tmp2 ^ out0 ^ in6;
+ out5 = out7 ^ in2 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_A9(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in2 ^ in6;
+ out6 = in1 ^ in4;
+ out7 = in0 ^ in2 ^ in5;
+ out5 = in0 ^ in3 ^ in7;
+ out2 = out4 ^ in1 ^ in5;
+ out1 = out6 ^ in2 ^ in7;
+ out0 = out2 ^ out7 ^ in3;
+ out3 = out1 ^ in0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_AA(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in2;
+ tmp1 = in1 ^ in3;
+ tmp2 = in6 ^ in7;
+ out1 = tmp0 ^ in4 ^ in7;
+ out3 = tmp1 ^ in0;
+ out0 = tmp1 ^ tmp2;
+ out2 = tmp2 ^ in5;
+ out7 = tmp0 ^ out2;
+ out6 = out1 ^ out7 ^ in1;
+ out5 = out0 ^ out6 ^ in0;
+ out4 = out5 ^ out7 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_AB(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in0 ^ in1;
+ tmp0 = in1 ^ in4;
+ tmp1 = in0 ^ in7;
+ out6 = tmp0 ^ in5;
+ out1 = tmp0 ^ tmp1 ^ in2;
+ out5 = tmp1 ^ in3 ^ in4;
+ out0 = tmp0 ^ out5 ^ in6;
+ out4 = out0 ^ out3 ^ in2;
+ out2 = out4 ^ in3 ^ in5;
+ out7 = tmp1 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_AC(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in1 ^ in3;
+ out1 = in2 ^ in4;
+ tmp0 = in0 ^ in2;
+ out4 = in4 ^ in7;
+ out5 = in0 ^ in5;
+ out6 = in1 ^ in6;
+ out7 = tmp0 ^ in7;
+ out3 = tmp0 ^ in3 ^ in6;
+ out2 = out5 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_AD(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in7;
+ out5 = in0;
+ out6 = in1;
+ out7 = in0 ^ in2;
+ out0 = in0 ^ in1 ^ in3;
+ out2 = out7 ^ in1 ^ in5;
+ out1 = in1 ^ in2 ^ in4;
+ out3 = out7 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_AE(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in3 ^ in4;
+ tmp0 = in0 ^ in4;
+ tmp1 = in0 ^ in7;
+ out0 = in1 ^ in3 ^ in7;
+ out1 = tmp0 ^ in2;
+ out5 = tmp0 ^ in5;
+ tmp2 = tmp1 ^ in6;
+ out2 = tmp1 ^ in5;
+ out3 = tmp2 ^ in3;
+ out7 = tmp2 ^ in2;
+ out6 = tmp2 ^ out2 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_AF(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in3;
+ tmp0 = in0 ^ in7;
+ out5 = in0 ^ in4;
+ out6 = in1 ^ in5;
+ out7 = in0 ^ in2 ^ in6;
+ out0 = tmp0 ^ in1 ^ in3;
+ out3 = tmp0 ^ in6;
+ out2 = tmp0 ^ in2 ^ in5;
+ out1 = out5 ^ in1 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_B0(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in4;
+ tmp1 = in3 ^ in6;
+ out2 = tmp0 ^ in7;
+ tmp2 = tmp0 ^ tmp1;
+ out0 = tmp2 ^ in5;
+ out3 = tmp2 ^ in2;
+ out6 = out3 ^ in6;
+ tmp3 = out6 ^ in0 ^ in1;
+ out7 = tmp3 ^ in5;
+ out5 = tmp3 ^ out2;
+ out1 = out0 ^ out5 ^ in0;
+ out4 = tmp1 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_B1(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in4;
+ out2 = tmp0 ^ in2 ^ in7;
+ tmp1 = out2 ^ in6;
+ out1 = tmp1 ^ in5;
+ out3 = tmp1 ^ in7;
+ out4 = tmp1 ^ in0;
+ out6 = out3 ^ in3;
+ out0 = out6 ^ in0 ^ in2 ^ in5;
+ out5 = tmp1 ^ out0 ^ in1;
+ out7 = tmp0 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_B2(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in4;
+ tmp0 = in4 ^ in7;
+ tmp1 = in1 ^ in3 ^ in6;
+ out3 = tmp0 ^ tmp1;
+ tmp2 = tmp1 ^ in0;
+ out0 = out3 ^ in5;
+ out4 = tmp2 ^ in2;
+ tmp3 = out4 ^ in6;
+ out5 = tmp0 ^ tmp3;
+ out1 = tmp3 ^ out0;
+ tmp4 = out1 ^ in7;
+ out7 = tmp4 ^ in3;
+ out6 = tmp2 ^ tmp4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_B3(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in2 ^ in4;
+ tmp0 = in0 ^ in5;
+ tmp1 = in1 ^ in6;
+ out3 = tmp1 ^ in4 ^ in7;
+ tmp2 = tmp0 ^ out3;
+ out0 = tmp2 ^ in3;
+ out1 = tmp2 ^ in2;
+ out5 = out0 ^ in2 ^ in6;
+ out7 = tmp1 ^ out5;
+ out4 = out7 ^ in1 ^ in5 ^ in7;
+ out6 = tmp0 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_B4(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in0 ^ in1;
+ out5 = out4 ^ in2;
+ tmp0 = out4 ^ in4;
+ out6 = out5 ^ in0 ^ in3;
+ out7 = tmp0 ^ out6;
+ out2 = tmp0 ^ in6 ^ in7;
+ out3 = out7 ^ in0 ^ in7;
+ out0 = out5 ^ out7 ^ in5;
+ out1 = out0 ^ out6 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_B5(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in2 ^ in4;
+ out4 = tmp0 ^ in4;
+ out3 = tmp1 ^ in7;
+ tmp2 = out4 ^ in5;
+ out7 = out3 ^ in0 ^ in3;
+ out0 = tmp2 ^ in3;
+ out2 = tmp0 ^ out3 ^ in6;
+ out5 = tmp1 ^ tmp2;
+ out6 = out2 ^ out7 ^ in2;
+ out1 = tmp0 ^ out0 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_B6(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in3 ^ in4;
+ tmp0 = in1 ^ in2;
+ tmp1 = in0 ^ in4;
+ tmp2 = in3 ^ in5;
+ tmp3 = out3 ^ in1 ^ in7;
+ out5 = tmp0 ^ tmp1;
+ out6 = tmp0 ^ tmp2;
+ out2 = tmp1 ^ in6;
+ out4 = tmp1 ^ tmp3;
+ out0 = tmp3 ^ in5;
+ out1 = out2 ^ in2 ^ in5;
+ out7 = tmp2 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_B7(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in4;
+ tmp0 = in0 ^ in4;
+ out2 = tmp0 ^ in2 ^ in6;
+ tmp1 = out2 ^ in7;
+ out1 = out2 ^ in1 ^ in5;
+ out7 = tmp1 ^ in3;
+ out5 = out1 ^ in6;
+ out6 = tmp0 ^ out1 ^ in3;
+ out0 = tmp1 ^ out6;
+ out4 = out0 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_B8(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in4;
+ tmp1 = in2 ^ in5;
+ out2 = tmp0 ^ in5;
+ out4 = tmp1 ^ in0;
+ tmp2 = tmp1 ^ in7;
+ out6 = tmp2 ^ out2;
+ out7 = out4 ^ in3;
+ out1 = tmp2 ^ in4;
+ out3 = tmp0 ^ out7;
+ out0 = out3 ^ out4 ^ in6;
+ out5 = out0 ^ in0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_B9(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in2;
+ tmp1 = in4 ^ in5;
+ out4 = tmp0 ^ tmp1;
+ tmp2 = tmp0 ^ in3 ^ in7;
+ out3 = out4 ^ in1;
+ out7 = tmp2 ^ in5;
+ out2 = out3 ^ in0;
+ out1 = out2 ^ in7;
+ out6 = out1 ^ in5 ^ in6;
+ out0 = tmp2 ^ out6;
+ out5 = tmp1 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_BA(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in5 ^ in7;
+ out2 = tmp0 ^ in4;
+ tmp1 = out2 ^ in2;
+ out1 = tmp1 ^ in0;
+ out6 = tmp1 ^ in1;
+ out4 = out1 ^ in3 ^ in4;
+ tmp2 = out4 ^ out6;
+ out7 = out4 ^ in6 ^ in7;
+ out5 = tmp2 ^ in6;
+ out3 = tmp0 ^ tmp2;
+ out0 = out6 ^ out7 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_BB(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in2 ^ in4 ^ in5 ^ in7;
+ tmp0 = out2 ^ in1;
+ out4 = out2 ^ in0 ^ in3;
+ out1 = tmp0 ^ in0;
+ out6 = tmp0 ^ in6;
+ out3 = out1 ^ in2;
+ tmp1 = out4 ^ out6 ^ in4;
+ out0 = tmp1 ^ in7;
+ out5 = tmp1 ^ in5;
+ out7 = tmp0 ^ tmp1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_BC(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in2;
+ tmp1 = in2 ^ in4;
+ out0 = in1 ^ in3 ^ in4;
+ out6 = in1 ^ in2 ^ in7;
+ out7 = tmp0 ^ in3;
+ out5 = tmp0 ^ out6 ^ in6;
+ out1 = tmp1 ^ in5;
+ tmp2 = out1 ^ out5 ^ in1;
+ out3 = tmp2 ^ in3;
+ out4 = tmp1 ^ tmp2;
+ out2 = tmp2 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_BD(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = in1 ^ in4;
+ out0 = tmp0 ^ tmp1;
+ out7 = tmp0 ^ in2 ^ in7;
+ out1 = tmp1 ^ in2 ^ in5;
+ tmp2 = out1 ^ in0;
+ out2 = tmp2 ^ in6;
+ out3 = out2 ^ in1 ^ in7;
+ out4 = out3 ^ in2;
+ out5 = tmp1 ^ out4;
+ out6 = tmp2 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_BE(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in3 ^ in6;
+ out4 = tmp0 ^ in5;
+ out7 = tmp0 ^ in2;
+ out3 = out4 ^ in4;
+ out1 = out3 ^ out7 ^ in0;
+ out2 = out3 ^ in3 ^ in7;
+ out0 = out2 ^ out4 ^ in1;
+ out5 = tmp0 ^ out0;
+ out6 = out1 ^ out5 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_BF(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in4;
+ out3 = tmp0 ^ in5 ^ in6;
+ out4 = out3 ^ in3;
+ tmp1 = out3 ^ in7;
+ out2 = tmp1 ^ in2;
+ out5 = tmp1 ^ in1;
+ tmp2 = out2 ^ in5;
+ out7 = tmp2 ^ in3 ^ in4;
+ tmp3 = tmp0 ^ out5;
+ out0 = tmp3 ^ out4;
+ out1 = tmp2 ^ tmp3;
+ out6 = tmp3 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_C0(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out5 = in2 ^ in5;
+ tmp0 = in1 ^ in4;
+ tmp1 = in3 ^ in6;
+ out0 = out5 ^ in1;
+ out4 = tmp0 ^ in7;
+ out3 = tmp0 ^ tmp1;
+ out1 = tmp1 ^ in2;
+ out6 = tmp1 ^ in0;
+ out7 = out4 ^ in0;
+ out2 = out4 ^ out5 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_C1(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out5 = in2;
+ tmp0 = in0 ^ in1;
+ out4 = in1 ^ in7;
+ out6 = in0 ^ in3;
+ out3 = in1 ^ in4 ^ in6;
+ tmp1 = tmp0 ^ in2;
+ out7 = tmp0 ^ in4;
+ out0 = tmp1 ^ in5;
+ out1 = tmp1 ^ out6 ^ in6;
+ out2 = out6 ^ out7 ^ in5 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_C2(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in1 ^ in3 ^ in4;
+ tmp0 = in0 ^ in3 ^ in6;
+ out5 = in2 ^ in4 ^ in5;
+ tmp1 = out4 ^ in7;
+ out1 = tmp0 ^ in2;
+ out6 = tmp0 ^ in5;
+ out2 = out5 ^ in3;
+ out7 = tmp0 ^ tmp1;
+ out3 = tmp1 ^ in2 ^ in6;
+ out0 = tmp1 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_C3(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in1 ^ in3;
+ tmp0 = in0 ^ in2;
+ tmp1 = in3 ^ in5;
+ out5 = in2 ^ in4;
+ tmp2 = tmp0 ^ out4;
+ out2 = tmp1 ^ in4;
+ out6 = tmp1 ^ in0;
+ out0 = tmp1 ^ tmp2 ^ in7;
+ out1 = tmp2 ^ in6;
+ out7 = out1 ^ out5 ^ in3;
+ out3 = tmp0 ^ out7 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_C4(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in7;
+ out3 = tmp0 ^ in4;
+ tmp1 = tmp0 ^ in2;
+ out1 = tmp1 ^ in6;
+ out5 = tmp1 ^ in5;
+ out4 = out1 ^ out3 ^ in1;
+ out0 = out4 ^ in4 ^ in5;
+ out2 = out0 ^ out3 ^ in0;
+ out7 = out1 ^ out2 ^ in7;
+ out6 = tmp1 ^ out0 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_C5(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in4 ^ in7;
+ tmp0 = in3 ^ in7;
+ out4 = in1 ^ in2 ^ in6;
+ out6 = in0 ^ in3 ^ in4;
+ out5 = tmp0 ^ in2;
+ out1 = tmp0 ^ out4;
+ out0 = out4 ^ in0 ^ in5;
+ out2 = out0 ^ out5 ^ in4;
+ out7 = tmp0 ^ out2 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_C6(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in5 ^ in6;
+ tmp1 = in1 ^ in7;
+ tmp2 = tmp0 ^ in0;
+ tmp3 = tmp0 ^ tmp1;
+ tmp4 = tmp2 ^ in4;
+ out0 = tmp3 ^ in2;
+ out6 = tmp4 ^ in3;
+ out2 = out6 ^ in2;
+ out7 = tmp1 ^ tmp4;
+ out3 = tmp2 ^ out2;
+ tmp5 = out3 ^ in5;
+ out5 = tmp5 ^ in7;
+ out4 = tmp3 ^ tmp5;
+ out1 = tmp4 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_C7(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in2 ^ in4;
+ tmp0 = in3 ^ in5;
+ tmp1 = out3 ^ in7;
+ out6 = tmp0 ^ in0 ^ in4;
+ out5 = tmp1 ^ in3;
+ out2 = out6 ^ in6;
+ out7 = out2 ^ in1 ^ in3;
+ out0 = tmp1 ^ out7;
+ out1 = tmp0 ^ out0;
+ out4 = out1 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_C8(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out0 = in1 ^ in2;
+ out1 = in2 ^ in3;
+ tmp0 = in5 ^ in6;
+ tmp1 = in0 ^ in7;
+ out2 = out1 ^ in1 ^ in4;
+ out4 = tmp0 ^ in4;
+ out5 = tmp0 ^ in7;
+ out6 = tmp1 ^ in6;
+ out7 = tmp1 ^ in1;
+ out3 = out2 ^ in0 ^ in2 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
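+/*
+ * Worked check for gf8_muladd_C8 above, reading plane k as bit k of the
+ * element: collecting, for every input plane, the output planes it feeds
+ * gives the columns of the multiplication matrix, and each column equals
+ * 0xC8 * x^j reduced modulo x^8 + x^4 + x^3 + x^2 + 1:
+ *
+ *   in0 -> out3,out6,out7       = 0xC8
+ *   in1 -> out0,out2,out3,out7  = 0x8D = (0xC8 << 1) ^ 0x11D
+ *   in2 -> out0,out1,out2       = 0x07 = (0x8D << 1) ^ 0x11D
+ *   in3 -> out1,out2,out3       = 0x0E
+ *   in4 -> out2,out3,out4       = 0x1C
+ *   in5 -> out3,out4,out5       = 0x38
+ *   in6 -> out4,out5,out6       = 0x70
+ *   in7 -> out5,out6,out7       = 0xE0
+ */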
+static void gf8_muladd_C9(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in5 ^ in6;
+ out7 = in0 ^ in1;
+ tmp0 = in1 ^ in3;
+ out5 = in6 ^ in7;
+ out6 = in0 ^ in7;
+ out0 = out7 ^ in2;
+ out3 = out7 ^ in4 ^ in5;
+ out1 = tmp0 ^ in2;
+ out2 = tmp0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_CA(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in7;
+ tmp1 = in2 ^ in7;
+ tmp2 = tmp0 ^ in6;
+ out0 = tmp1 ^ in1;
+ tmp3 = tmp1 ^ in3;
+ out6 = tmp2 ^ in5;
+ out7 = tmp2 ^ in1;
+ out2 = tmp3 ^ in4;
+ out5 = out6 ^ in0 ^ in4;
+ out4 = out5 ^ in3;
+ out1 = tmp0 ^ tmp3;
+ out3 = tmp3 ^ out5 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_CB(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in4 ^ in7;
+ tmp1 = in5 ^ in7;
+ out7 = in0 ^ in1 ^ in6;
+ out5 = tmp0 ^ in6;
+ out2 = tmp0 ^ in3;
+ out6 = tmp1 ^ in0;
+ out4 = tmp1 ^ in3 ^ in6;
+ tmp2 = out5 ^ out7 ^ in2;
+ out1 = tmp2 ^ out2;
+ out0 = tmp2 ^ in4;
+ out3 = tmp2 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_CC(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in5;
+ tmp1 = in1 ^ in6;
+ out1 = in2 ^ in3 ^ in7;
+ out5 = tmp0 ^ in6;
+ out0 = tmp1 ^ in2;
+ tmp2 = out5 ^ in0 ^ in7;
+ out3 = tmp2 ^ in4;
+ out6 = tmp0 ^ out3;
+ out7 = tmp1 ^ tmp2 ^ in3;
+ tmp3 = out1 ^ out6;
+ out4 = tmp2 ^ tmp3;
+ out2 = tmp3 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_CD(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out5 = in3 ^ in6;
+ tmp0 = in0 ^ in1;
+ tmp1 = in2 ^ in7;
+ out6 = in0 ^ in4 ^ in7;
+ out2 = tmp0 ^ out5 ^ in4;
+ out7 = tmp0 ^ in5;
+ out0 = tmp0 ^ in2 ^ in6;
+ out4 = tmp1 ^ in5;
+ out1 = tmp1 ^ in1 ^ in3;
+ out3 = out6 ^ in5 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_CE(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in5;
+ tmp1 = tmp0 ^ in3;
+ out4 = tmp1 ^ in4;
+ tmp2 = out4 ^ in6;
+ out3 = tmp2 ^ in0;
+ out5 = tmp2 ^ in2;
+ out2 = out3 ^ in5 ^ in7;
+ out6 = tmp1 ^ out2;
+ out7 = out2 ^ out4 ^ in1;
+ out1 = tmp2 ^ out6;
+ out0 = tmp0 ^ out7 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_CF(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in6;
+ tmp1 = in0 ^ in1 ^ in5;
+ out4 = in2 ^ in3 ^ in5;
+ out5 = tmp0 ^ in4;
+ out7 = tmp1 ^ in6;
+ out1 = tmp1 ^ out4 ^ in7;
+ tmp2 = out5 ^ in0;
+ out2 = tmp2 ^ in7;
+ out3 = tmp2 ^ out4;
+ out6 = tmp0 ^ out2 ^ in5;
+ out0 = tmp0 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_D0(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = in1 ^ in4;
+ tmp2 = in2 ^ in5;
+ out7 = tmp0 ^ tmp1;
+ out0 = tmp1 ^ tmp2;
+ tmp3 = tmp2 ^ in3;
+ out1 = tmp3 ^ in6;
+ tmp4 = out1 ^ in1;
+ out2 = tmp4 ^ in7;
+ out3 = out2 ^ in2;
+ out4 = tmp0 ^ out3;
+ out5 = tmp3 ^ out3;
+ out6 = tmp4 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_D1(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in5 ^ in6;
+ tmp1 = tmp0 ^ in1;
+ out1 = tmp1 ^ in2;
+ out2 = tmp1 ^ in7;
+ out3 = out2 ^ in3;
+ out5 = out3 ^ in2;
+ tmp2 = out3 ^ in0;
+ out4 = tmp2 ^ in4;
+ out7 = tmp0 ^ out4;
+ out6 = tmp2 ^ out1 ^ in6;
+ out0 = out2 ^ out6 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_D2(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in5 ^ in6;
+ out2 = tmp0 ^ in2 ^ in3;
+ out1 = out2 ^ in0;
+ out3 = out2 ^ in1;
+ out4 = out1 ^ in1 ^ in2;
+ out6 = out1 ^ in6 ^ in7;
+ out7 = out4 ^ in4 ^ in5;
+ out5 = out4 ^ out6 ^ in4;
+ out0 = tmp0 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_D3(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in3 ^ in5 ^ in6;
+ tmp0 = out2 ^ in2;
+ tmp1 = tmp0 ^ in1;
+ out1 = tmp1 ^ in0;
+ out3 = tmp1 ^ in3;
+ out4 = out1 ^ in2 ^ in4;
+ tmp2 = out4 ^ in5;
+ out7 = tmp2 ^ in7;
+ out0 = tmp0 ^ out7;
+ tmp3 = out0 ^ in0;
+ out5 = tmp3 ^ in6;
+ out6 = tmp2 ^ tmp3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_D4(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in3 ^ in5;
+ tmp0 = in1 ^ in5;
+ tmp1 = tmp0 ^ in2;
+ out4 = tmp1 ^ in0;
+ tmp2 = tmp1 ^ in6;
+ out2 = out4 ^ in3 ^ in7;
+ out0 = tmp2 ^ in4;
+ out5 = tmp2 ^ out3;
+ out1 = tmp0 ^ out5 ^ in7;
+ out6 = tmp0 ^ out2 ^ in4;
+ out7 = tmp1 ^ out6 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_D5(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in5;
+ tmp0 = in0 ^ in4;
+ tmp1 = tmp0 ^ in1 ^ in5;
+ out4 = tmp1 ^ in2;
+ out0 = out4 ^ in6;
+ tmp2 = tmp0 ^ out0;
+ out5 = tmp2 ^ in3;
+ out1 = out5 ^ in7;
+ out6 = tmp1 ^ out1;
+ out7 = tmp2 ^ out6;
+ out2 = out7 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_D6(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in2 ^ in4 ^ in6;
+ out5 = tmp0 ^ in3;
+ out0 = tmp0 ^ in5 ^ in7;
+ out3 = out0 ^ out5 ^ in2;
+ tmp1 = out3 ^ in0;
+ out1 = tmp1 ^ in6;
+ out2 = tmp1 ^ in7;
+ out4 = tmp1 ^ in1;
+ out6 = tmp1 ^ in4;
+ out7 = tmp0 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_D7(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in3;
+ out3 = in2 ^ in5 ^ in7;
+ out2 = tmp0 ^ in5;
+ tmp1 = tmp0 ^ out3 ^ in1;
+ out1 = tmp1 ^ in6;
+ out4 = tmp1 ^ in4;
+ tmp2 = out1 ^ in4;
+ out6 = tmp2 ^ in1;
+ out7 = tmp2 ^ in2;
+ out0 = tmp2 ^ in3;
+ out5 = tmp2 ^ in0 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_D8(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in0;
+ out5 = in1;
+ tmp0 = in1 ^ in2;
+ out6 = in0 ^ in2;
+ out0 = tmp0 ^ in4;
+ tmp1 = tmp0 ^ in3;
+ out7 = tmp1 ^ out6;
+ out2 = tmp1 ^ in6;
+ out3 = out7 ^ in7;
+ out1 = tmp1 ^ in1 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_D9(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in0 ^ in4;
+ out5 = in1 ^ in5;
+ out2 = in1 ^ in3 ^ in6;
+ out3 = in0 ^ in1 ^ in7;
+ out6 = in0 ^ in2 ^ in6;
+ out0 = out4 ^ in1 ^ in2;
+ out1 = out5 ^ in2 ^ in3;
+ out7 = out3 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
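+/*
+ * Illustrative sketch only, not used by the translator: a generic
+ * bit-sliced multiply-accumulate that the unrolled networks can be checked
+ * against.  It assumes the layout described above (plane 0 = least
+ * significant bit, planes of 'width' 64-bit words) and the reduction
+ * polynomial x^8 + x^4 + x^3 + x^2 + 1 inferred from the networks; the
+ * name gf8_muladd_ref is hypothetical.
+ */
+static inline void gf8_muladd_ref(uint8_t constant, uint8_t * out,
+                                  uint8_t * in, unsigned int width)
+{
+    uint64_t * in_ptr = (uint64_t *)in;
+    uint64_t * out_ptr = (uint64_t *)out;
+    unsigned int i, bit, j;
+
+    for (i = 0; i < width; i++)
+    {
+        uint64_t acc[8] = { 0 };
+        uint64_t val[8];
+
+        /* Load the accumulator planes (same layout as the unrolled code). */
+        for (j = 0; j < 8; j++)
+        {
+            val[j] = out_ptr[width * j];
+        }
+
+        /* Horner's scheme from the most significant bit of the constant:
+         * acc = acc * x; if the bit is set, acc += val. */
+        for (bit = 8; bit-- > 0;)
+        {
+            uint64_t carry = acc[7];
+
+            /* Multiply acc by x modulo x^8 + x^4 + x^3 + x^2 + 1. */
+            acc[7] = acc[6];
+            acc[6] = acc[5];
+            acc[5] = acc[4];
+            acc[4] = acc[3] ^ carry;
+            acc[3] = acc[2] ^ carry;
+            acc[2] = acc[1] ^ carry;
+            acc[1] = acc[0];
+            acc[0] = carry;
+
+            if ((constant >> bit) & 1)
+            {
+                for (j = 0; j < 8; j++)
+                {
+                    acc[j] ^= val[j];
+                }
+            }
+        }
+
+        /* Accumulate the input fragment, exactly like the unrolled code:
+         * out = out * constant + in. */
+        for (j = 0; j < 8; j++)
+        {
+            out_ptr[width * j] = acc[j] ^ in_ptr[width * j];
+        }
+
+        in_ptr++;
+        out_ptr++;
+    }
+}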
+static void gf8_muladd_DA(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out5 = in1 ^ in4;
+ tmp0 = in2 ^ in7;
+ tmp1 = in0 ^ in2 ^ in3;
+ out0 = tmp0 ^ out5;
+ out4 = tmp0 ^ tmp1;
+ out2 = tmp0 ^ in3 ^ in6;
+ out1 = tmp1 ^ in5;
+ out3 = tmp1 ^ in1;
+ out6 = out1 ^ in3;
+ out7 = out3 ^ in2 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_DB(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in1 ^ in5;
+ tmp2 = in3 ^ in7;
+ out3 = tmp0 ^ in2;
+ out5 = tmp1 ^ in4;
+ out6 = tmp1 ^ out3 ^ in6;
+ out2 = tmp2 ^ in6;
+ tmp3 = tmp2 ^ in4;
+ tmp4 = out3 ^ in3;
+ out4 = tmp3 ^ in0;
+ out1 = tmp4 ^ in5;
+ out0 = tmp3 ^ tmp4;
+ out7 = tmp0 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_DC(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in2;
+ tmp1 = in0 ^ in3;
+ out6 = tmp0 ^ in4;
+ tmp2 = tmp0 ^ in7;
+ out3 = tmp1 ^ in6;
+ tmp3 = tmp1 ^ in1;
+ out1 = tmp1 ^ tmp2 ^ in5;
+ out4 = tmp2 ^ in6;
+ out2 = tmp3 ^ in2;
+ out7 = tmp3 ^ in5;
+ out5 = tmp2 ^ out2;
+ out0 = out2 ^ out3 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_DD(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in0 ^ in6;
+ out2 = in0 ^ in1 ^ in3;
+ out6 = out3 ^ in2 ^ in4;
+ out7 = out2 ^ in5 ^ in7;
+ out0 = out6 ^ in1;
+ out4 = out6 ^ in7;
+ out5 = out7 ^ in0;
+ out1 = out5 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_DE(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in3 ^ in6;
+ tmp1 = in3 ^ in4 ^ in7;
+ out4 = tmp0 ^ in0;
+ out5 = tmp1 ^ in1;
+ out3 = out4 ^ in7;
+ out2 = out3 ^ in6;
+ out1 = out2 ^ in5;
+ out6 = tmp1 ^ out1;
+ out0 = tmp0 ^ out5;
+ out7 = out0 ^ out1 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_DF(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in0 ^ in3 ^ in7;
+ tmp0 = out2 ^ in1 ^ in5;
+ out1 = tmp0 ^ in2;
+ out7 = tmp0 ^ in6;
+ out5 = tmp0 ^ in0 ^ in4;
+ tmp1 = out1 ^ out5 ^ in6;
+ out4 = tmp1 ^ in3;
+ out6 = tmp1 ^ in5;
+ tmp2 = tmp1 ^ in7;
+ out0 = tmp2 ^ in1;
+ out3 = tmp2 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_E0(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in1 ^ in7;
+ tmp0 = in2 ^ in4;
+ out4 = out3 ^ in3 ^ in5;
+ out2 = tmp0 ^ in1;
+ tmp1 = tmp0 ^ in6;
+ out0 = out4 ^ in2;
+ out6 = out4 ^ in0;
+ out1 = tmp1 ^ in3;
+ out5 = tmp1 ^ in0;
+ out7 = out5 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_E1(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in1 ^ in4;
+ tmp0 = in1 ^ in7;
+ out3 = tmp0 ^ in3;
+ tmp1 = out3 ^ in5;
+ out4 = tmp1 ^ in4;
+ tmp2 = tmp1 ^ in0;
+ out0 = tmp2 ^ in2;
+ out6 = tmp2 ^ in6;
+ tmp3 = out0 ^ out4 ^ in6;
+ out5 = tmp3 ^ in5;
+ out7 = tmp0 ^ tmp3;
+ out1 = tmp2 ^ out5 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_E2(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in1 ^ in2;
+ out4 = in1 ^ in5;
+ out2 = in2 ^ in4 ^ in7;
+ out5 = in0 ^ in2 ^ in6;
+ out0 = out3 ^ in3 ^ in5;
+ out7 = out3 ^ in0 ^ in4;
+ out6 = out2 ^ out7 ^ in3;
+ out1 = out5 ^ in3 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_E3(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in4 ^ in7;
+ tmp0 = in1 ^ in3;
+ out3 = tmp0 ^ in2;
+ tmp1 = out3 ^ in0;
+ out0 = tmp1 ^ in5;
+ tmp2 = tmp1 ^ in4;
+ out1 = tmp2 ^ in6;
+ tmp3 = tmp2 ^ in3;
+ out7 = tmp3 ^ in7;
+ out6 = out1 ^ out2 ^ in2;
+ tmp4 = tmp0 ^ out0;
+ out5 = tmp4 ^ in6;
+ out4 = tmp3 ^ tmp4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_E4(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in6;
+ tmp0 = in0 ^ in4;
+ tmp1 = tmp0 ^ in2 ^ in6;
+ out2 = tmp1 ^ in1;
+ out7 = out2 ^ in5;
+ tmp2 = tmp0 ^ out7;
+ out4 = tmp2 ^ in3;
+ out0 = out4 ^ in7;
+ out6 = tmp1 ^ out0;
+ out5 = tmp2 ^ out6;
+ out1 = out5 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_E5(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in3 ^ in6;
+ tmp0 = in0 ^ in1;
+ tmp1 = in5 ^ in7;
+ out2 = tmp0 ^ in4 ^ in6;
+ tmp2 = tmp1 ^ out2;
+ out6 = tmp2 ^ in3;
+ out7 = tmp2 ^ in2;
+ out0 = out6 ^ in2 ^ in4;
+ out5 = out6 ^ in1 ^ in2;
+ out1 = tmp0 ^ out5 ^ in5;
+ out4 = tmp1 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_E6(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in2 ^ in6 ^ in7;
+ out2 = out3 ^ in0 ^ in4;
+ out4 = out3 ^ in1 ^ in5;
+ out1 = out2 ^ in3;
+ out7 = out2 ^ out4 ^ in2;
+ out0 = out4 ^ in3 ^ in7;
+ out5 = out1 ^ in4;
+ out6 = out0 ^ out2 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_E7(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in3;
+ out3 = tmp0 ^ in6 ^ in7;
+ tmp1 = out3 ^ in0;
+ out5 = tmp1 ^ in5;
+ tmp2 = tmp1 ^ in4;
+ tmp3 = out5 ^ in7;
+ out1 = tmp2 ^ in1;
+ out0 = tmp3 ^ in1;
+ out6 = out1 ^ in2;
+ out2 = tmp0 ^ tmp2;
+ tmp4 = tmp3 ^ out6;
+ out4 = tmp4 ^ in6;
+ out7 = tmp4 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_E8(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in3 ^ in6;
+ tmp0 = in4 ^ in7;
+ out1 = in2 ^ in3 ^ in4;
+ out5 = tmp0 ^ in0;
+ tmp1 = tmp0 ^ in1;
+ tmp2 = tmp1 ^ in5;
+ out0 = tmp1 ^ out1;
+ out2 = tmp2 ^ in2;
+ out6 = tmp2 ^ out5;
+ tmp3 = out6 ^ in6;
+ out3 = tmp3 ^ in7;
+ out7 = tmp3 ^ in2 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_E9(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in3 ^ in6;
+ tmp2 = tmp0 ^ in6;
+ out4 = tmp1 ^ in4;
+ out6 = tmp2 ^ in5;
+ out7 = tmp2 ^ in2 ^ in7;
+ out3 = out6 ^ in3 ^ in7;
+ out0 = tmp1 ^ out7;
+ out2 = out3 ^ out4 ^ in0;
+ out5 = tmp0 ^ out2;
+ out1 = out0 ^ out5 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_EA(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in6 ^ in7;
+ out5 = in0 ^ in7;
+ out6 = in0 ^ in1;
+ out0 = in1 ^ in2 ^ in3;
+ out2 = in2 ^ in4 ^ in5;
+ out7 = out6 ^ in2;
+ out1 = out0 ^ out6 ^ in4;
+ out3 = out7 ^ in5 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_EB(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in4 ^ in5;
+ tmp0 = in0 ^ in1;
+ out4 = in4 ^ in6 ^ in7;
+ out5 = in0 ^ in5 ^ in7;
+ out6 = tmp0 ^ in6;
+ tmp1 = tmp0 ^ in2;
+ out0 = tmp1 ^ in3;
+ out7 = tmp1 ^ in7;
+ out1 = out0 ^ in4;
+ out3 = out0 ^ in5 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_EC(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out3 = in0 ^ in5;
+ out4 = in2 ^ in3 ^ in7;
+ out5 = in0 ^ in3 ^ in4;
+ out6 = out3 ^ in1 ^ in4;
+ out1 = out4 ^ in4;
+ out0 = out4 ^ in1 ^ in6;
+ out2 = out0 ^ out5 ^ in5;
+ out7 = out2 ^ in4 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_ED(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in2 ^ in4;
+ tmp1 = in3 ^ in5;
+ out4 = tmp0 ^ in3 ^ in7;
+ out3 = tmp1 ^ in0;
+ out1 = out4 ^ in1;
+ out5 = out3 ^ in4;
+ out7 = out1 ^ out5 ^ in6;
+ out2 = tmp0 ^ out7;
+ out0 = tmp1 ^ out7;
+ out6 = out2 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_EE(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in2;
+ tmp0 = in0 ^ in1;
+ out5 = in0 ^ in3;
+ tmp1 = tmp0 ^ in2;
+ out6 = tmp0 ^ in4;
+ tmp2 = tmp1 ^ out5;
+ out7 = tmp1 ^ in5;
+ out1 = tmp2 ^ out6 ^ in7;
+ out0 = tmp2 ^ in6;
+ tmp3 = out7 ^ in1;
+ out3 = tmp3 ^ in7;
+ out2 = tmp3 ^ in4 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_EF(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out4 = in2 ^ in4;
+ tmp0 = in0 ^ in5;
+ tmp1 = in4 ^ in6;
+ out5 = tmp0 ^ in3;
+ out2 = tmp0 ^ tmp1;
+ out6 = tmp1 ^ in0 ^ in1;
+ out3 = out5 ^ in2 ^ in7;
+ out7 = out3 ^ in1 ^ in3;
+ out0 = out4 ^ out6 ^ in3;
+ out1 = tmp1 ^ out0 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_F0(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = in4 ^ in5;
+ out2 = tmp0 ^ in6;
+ out3 = tmp1 ^ in1;
+ tmp2 = tmp1 ^ in7;
+ out1 = out2 ^ out3 ^ in3;
+ tmp3 = tmp0 ^ tmp2;
+ out0 = tmp3 ^ in3;
+ out5 = tmp3 ^ in0;
+ out4 = out1 ^ out5 ^ in4;
+ out7 = out4 ^ in2;
+ out6 = tmp2 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_F1(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in1 ^ in6;
+ tmp0 = in3 ^ in5;
+ out3 = tmp0 ^ in1 ^ in4;
+ tmp1 = out3 ^ in2;
+ out1 = tmp1 ^ in6;
+ tmp2 = tmp1 ^ in0;
+ tmp3 = out1 ^ in5;
+ out0 = tmp2 ^ in7;
+ out6 = tmp2 ^ in4;
+ out7 = tmp3 ^ in0;
+ out5 = tmp0 ^ out0;
+ out4 = tmp3 ^ out5 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_F2(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in4 ^ in5;
+ out2 = in2 ^ in6 ^ in7;
+ tmp1 = tmp0 ^ in1;
+ tmp2 = tmp1 ^ in2;
+ out0 = tmp2 ^ in3;
+ out3 = tmp2 ^ in7;
+ out5 = out3 ^ in0 ^ in4;
+ tmp3 = tmp0 ^ out5;
+ out7 = tmp3 ^ in3;
+ out4 = tmp3 ^ out2;
+ out1 = out0 ^ out4 ^ in4;
+ out6 = tmp1 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_F3(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in6 ^ in7;
+ tmp0 = in0 ^ in1;
+ out4 = tmp0 ^ in6;
+ tmp1 = tmp0 ^ in2;
+ out5 = tmp1 ^ in7;
+ out6 = tmp1 ^ in3;
+ out7 = out6 ^ in4;
+ out0 = out7 ^ in5;
+ out1 = out0 ^ in6;
+ out3 = out0 ^ in0 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_F4(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in0 ^ in1 ^ in2;
+ tmp0 = out2 ^ in3;
+ out4 = tmp0 ^ in4;
+ out5 = out4 ^ in5;
+ out6 = out5 ^ in6;
+ out7 = out6 ^ in7;
+ out0 = out7 ^ in0;
+ out1 = out0 ^ in1;
+ out3 = tmp0 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_F5(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in0 ^ in1;
+ tmp0 = out2 ^ in2;
+ out4 = tmp0 ^ in3;
+ out5 = out4 ^ in4;
+ out6 = out5 ^ in5;
+ out7 = out6 ^ in6;
+ out0 = out7 ^ in7;
+ out1 = out0 ^ in0;
+ out3 = tmp0 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_F6(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in7;
+ out2 = tmp0 ^ in2;
+ out4 = out2 ^ in1 ^ in4;
+ out7 = out4 ^ in3 ^ in5;
+ out5 = out7 ^ in4 ^ in7;
+ out0 = tmp0 ^ out7 ^ in6;
+ tmp1 = out0 ^ in1;
+ out6 = out0 ^ in0 ^ in5;
+ out3 = tmp1 ^ in3;
+ out1 = tmp0 ^ tmp1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_F7(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in0 ^ in7;
+ tmp0 = out2 ^ in1;
+ out4 = tmp0 ^ in2;
+ out5 = out4 ^ in3 ^ in7;
+ out6 = out5 ^ in4;
+ out7 = out6 ^ in5;
+ out0 = out7 ^ in6;
+ out1 = out0 ^ in7;
+ out3 = tmp0 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_F8(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in4;
+ tmp1 = in3 ^ in5;
+ tmp2 = tmp0 ^ in6;
+ out4 = tmp0 ^ tmp1;
+ out1 = tmp1 ^ in2 ^ in4;
+ out3 = tmp2 ^ in1;
+ out5 = out3 ^ in5;
+ out7 = out1 ^ out5 ^ in7;
+ out6 = tmp1 ^ out7;
+ out0 = tmp2 ^ out7;
+ out2 = out6 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_F9(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in3 ^ in5;
+ tmp1 = in0 ^ in6;
+ out4 = tmp0 ^ in0;
+ tmp2 = tmp1 ^ in4;
+ tmp3 = tmp1 ^ in2;
+ out5 = tmp2 ^ in1;
+ out3 = out5 ^ in3;
+ tmp4 = tmp3 ^ out3;
+ out1 = tmp4 ^ in5;
+ out0 = tmp4 ^ in0 ^ in7;
+ out6 = tmp0 ^ out0 ^ in4;
+ out7 = tmp2 ^ tmp4;
+ out2 = tmp3 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_FA(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = tmp0 ^ in2;
+ tmp2 = tmp0 ^ in5;
+ tmp3 = tmp1 ^ in7;
+ out5 = tmp2 ^ in6;
+ out6 = tmp3 ^ in6;
+ out7 = tmp3 ^ in3;
+ out3 = out6 ^ in4;
+ out2 = tmp1 ^ out5;
+ out4 = out2 ^ out3 ^ in1;
+ out0 = out4 ^ out7 ^ in5;
+ out1 = tmp2 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_FB(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in5 ^ in6;
+ tmp0 = in0 ^ in1;
+ out4 = in0 ^ in5 ^ in7;
+ out5 = tmp0 ^ in6;
+ tmp1 = tmp0 ^ in2;
+ out6 = tmp1 ^ in7;
+ out7 = tmp1 ^ in3;
+ out0 = out7 ^ in4;
+ out1 = out0 ^ in5;
+ out3 = out0 ^ in6 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_FC(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = in0 ^ in7;
+ out2 = tmp0 ^ tmp1 ^ in5;
+ out3 = tmp1 ^ in4;
+ tmp2 = out2 ^ in6;
+ out6 = tmp2 ^ in4;
+ out7 = tmp2 ^ in3;
+ out4 = out6 ^ in1 ^ in3;
+ tmp3 = out4 ^ in0;
+ out1 = tmp3 ^ in6;
+ out0 = tmp3 ^ in1 ^ in5;
+ out5 = tmp0 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_FD(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in5;
+ tmp1 = in1 ^ in7;
+ out2 = tmp0 ^ tmp1;
+ out6 = out2 ^ in2 ^ in4;
+ tmp2 = out6 ^ in0;
+ out1 = tmp2 ^ in3;
+ out0 = tmp0 ^ out1 ^ in6;
+ out5 = out0 ^ in2;
+ tmp3 = out5 ^ in1;
+ out3 = tmp3 ^ in6;
+ out7 = tmp2 ^ tmp3;
+ out4 = tmp1 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_FE(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ tmp0 = in0 ^ in2;
+ out2 = tmp0 ^ in5;
+ out3 = tmp0 ^ in4;
+ tmp1 = out3 ^ in6;
+ out4 = tmp1 ^ in5;
+ tmp2 = tmp1 ^ in1;
+ out6 = tmp2 ^ in7;
+ tmp3 = tmp2 ^ in0;
+ out0 = tmp3 ^ in3;
+ tmp4 = out0 ^ out4 ^ in7;
+ out5 = tmp4 ^ in6;
+ out7 = tmp4 ^ in2;
+ out1 = tmp3 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void gf8_muladd_FF(uint8_t * out, uint8_t * in, unsigned int width)
+{
+ unsigned int i;
+ uint64_t * in_ptr = (uint64_t *)in;
+ uint64_t * out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < width; i++)
+ {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[width];
+ uint64_t in2 = out_ptr[width * 2];
+ uint64_t in3 = out_ptr[width * 3];
+ uint64_t in4 = out_ptr[width * 4];
+ uint64_t in5 = out_ptr[width * 5];
+ uint64_t in6 = out_ptr[width * 6];
+ uint64_t in7 = out_ptr[width * 7];
+
+ out2 = in0 ^ in5;
+ tmp0 = in4 ^ in7;
+ tmp1 = out2 ^ in2;
+ out4 = tmp1 ^ in6;
+ out7 = tmp1 ^ in1 ^ in3;
+ out1 = tmp0 ^ out7;
+ tmp2 = out1 ^ in5;
+ out6 = tmp2 ^ in3;
+ tmp3 = tmp2 ^ in7;
+ out0 = tmp3 ^ in6;
+ out3 = tmp3 ^ in1;
+ out5 = tmp0 ^ out0 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[width] = out1 ^ in_ptr[width];
+ out_ptr[width * 2] = out2 ^ in_ptr[width * 2];
+ out_ptr[width * 3] = out3 ^ in_ptr[width * 3];
+ out_ptr[width * 4] = out4 ^ in_ptr[width * 4];
+ out_ptr[width * 5] = out5 ^ in_ptr[width * 5];
+ out_ptr[width * 6] = out6 ^ in_ptr[width * 6];
+ out_ptr[width * 7] = out7 ^ in_ptr[width * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+void (* ec_gf_muladd[])(uint8_t * out, uint8_t * in, unsigned int width) =
+{
+ gf8_muladd_00, gf8_muladd_01, gf8_muladd_02, gf8_muladd_03,
+ gf8_muladd_04, gf8_muladd_05, gf8_muladd_06, gf8_muladd_07,
+ gf8_muladd_08, gf8_muladd_09, gf8_muladd_0A, gf8_muladd_0B,
+ gf8_muladd_0C, gf8_muladd_0D, gf8_muladd_0E, gf8_muladd_0F,
+ gf8_muladd_10, gf8_muladd_11, gf8_muladd_12, gf8_muladd_13,
+ gf8_muladd_14, gf8_muladd_15, gf8_muladd_16, gf8_muladd_17,
+ gf8_muladd_18, gf8_muladd_19, gf8_muladd_1A, gf8_muladd_1B,
+ gf8_muladd_1C, gf8_muladd_1D, gf8_muladd_1E, gf8_muladd_1F,
+ gf8_muladd_20, gf8_muladd_21, gf8_muladd_22, gf8_muladd_23,
+ gf8_muladd_24, gf8_muladd_25, gf8_muladd_26, gf8_muladd_27,
+ gf8_muladd_28, gf8_muladd_29, gf8_muladd_2A, gf8_muladd_2B,
+ gf8_muladd_2C, gf8_muladd_2D, gf8_muladd_2E, gf8_muladd_2F,
+ gf8_muladd_30, gf8_muladd_31, gf8_muladd_32, gf8_muladd_33,
+ gf8_muladd_34, gf8_muladd_35, gf8_muladd_36, gf8_muladd_37,
+ gf8_muladd_38, gf8_muladd_39, gf8_muladd_3A, gf8_muladd_3B,
+ gf8_muladd_3C, gf8_muladd_3D, gf8_muladd_3E, gf8_muladd_3F,
+ gf8_muladd_40, gf8_muladd_41, gf8_muladd_42, gf8_muladd_43,
+ gf8_muladd_44, gf8_muladd_45, gf8_muladd_46, gf8_muladd_47,
+ gf8_muladd_48, gf8_muladd_49, gf8_muladd_4A, gf8_muladd_4B,
+ gf8_muladd_4C, gf8_muladd_4D, gf8_muladd_4E, gf8_muladd_4F,
+ gf8_muladd_50, gf8_muladd_51, gf8_muladd_52, gf8_muladd_53,
+ gf8_muladd_54, gf8_muladd_55, gf8_muladd_56, gf8_muladd_57,
+ gf8_muladd_58, gf8_muladd_59, gf8_muladd_5A, gf8_muladd_5B,
+ gf8_muladd_5C, gf8_muladd_5D, gf8_muladd_5E, gf8_muladd_5F,
+ gf8_muladd_60, gf8_muladd_61, gf8_muladd_62, gf8_muladd_63,
+ gf8_muladd_64, gf8_muladd_65, gf8_muladd_66, gf8_muladd_67,
+ gf8_muladd_68, gf8_muladd_69, gf8_muladd_6A, gf8_muladd_6B,
+ gf8_muladd_6C, gf8_muladd_6D, gf8_muladd_6E, gf8_muladd_6F,
+ gf8_muladd_70, gf8_muladd_71, gf8_muladd_72, gf8_muladd_73,
+ gf8_muladd_74, gf8_muladd_75, gf8_muladd_76, gf8_muladd_77,
+ gf8_muladd_78, gf8_muladd_79, gf8_muladd_7A, gf8_muladd_7B,
+ gf8_muladd_7C, gf8_muladd_7D, gf8_muladd_7E, gf8_muladd_7F,
+ gf8_muladd_80, gf8_muladd_81, gf8_muladd_82, gf8_muladd_83,
+ gf8_muladd_84, gf8_muladd_85, gf8_muladd_86, gf8_muladd_87,
+ gf8_muladd_88, gf8_muladd_89, gf8_muladd_8A, gf8_muladd_8B,
+ gf8_muladd_8C, gf8_muladd_8D, gf8_muladd_8E, gf8_muladd_8F,
+ gf8_muladd_90, gf8_muladd_91, gf8_muladd_92, gf8_muladd_93,
+ gf8_muladd_94, gf8_muladd_95, gf8_muladd_96, gf8_muladd_97,
+ gf8_muladd_98, gf8_muladd_99, gf8_muladd_9A, gf8_muladd_9B,
+ gf8_muladd_9C, gf8_muladd_9D, gf8_muladd_9E, gf8_muladd_9F,
+ gf8_muladd_A0, gf8_muladd_A1, gf8_muladd_A2, gf8_muladd_A3,
+ gf8_muladd_A4, gf8_muladd_A5, gf8_muladd_A6, gf8_muladd_A7,
+ gf8_muladd_A8, gf8_muladd_A9, gf8_muladd_AA, gf8_muladd_AB,
+ gf8_muladd_AC, gf8_muladd_AD, gf8_muladd_AE, gf8_muladd_AF,
+ gf8_muladd_B0, gf8_muladd_B1, gf8_muladd_B2, gf8_muladd_B3,
+ gf8_muladd_B4, gf8_muladd_B5, gf8_muladd_B6, gf8_muladd_B7,
+ gf8_muladd_B8, gf8_muladd_B9, gf8_muladd_BA, gf8_muladd_BB,
+ gf8_muladd_BC, gf8_muladd_BD, gf8_muladd_BE, gf8_muladd_BF,
+ gf8_muladd_C0, gf8_muladd_C1, gf8_muladd_C2, gf8_muladd_C3,
+ gf8_muladd_C4, gf8_muladd_C5, gf8_muladd_C6, gf8_muladd_C7,
+ gf8_muladd_C8, gf8_muladd_C9, gf8_muladd_CA, gf8_muladd_CB,
+ gf8_muladd_CC, gf8_muladd_CD, gf8_muladd_CE, gf8_muladd_CF,
+ gf8_muladd_D0, gf8_muladd_D1, gf8_muladd_D2, gf8_muladd_D3,
+ gf8_muladd_D4, gf8_muladd_D5, gf8_muladd_D6, gf8_muladd_D7,
+ gf8_muladd_D8, gf8_muladd_D9, gf8_muladd_DA, gf8_muladd_DB,
+ gf8_muladd_DC, gf8_muladd_DD, gf8_muladd_DE, gf8_muladd_DF,
+ gf8_muladd_E0, gf8_muladd_E1, gf8_muladd_E2, gf8_muladd_E3,
+ gf8_muladd_E4, gf8_muladd_E5, gf8_muladd_E6, gf8_muladd_E7,
+ gf8_muladd_E8, gf8_muladd_E9, gf8_muladd_EA, gf8_muladd_EB,
+ gf8_muladd_EC, gf8_muladd_ED, gf8_muladd_EE, gf8_muladd_EF,
+ gf8_muladd_F0, gf8_muladd_F1, gf8_muladd_F2, gf8_muladd_F3,
+ gf8_muladd_F4, gf8_muladd_F5, gf8_muladd_F6, gf8_muladd_F7,
+ gf8_muladd_F8, gf8_muladd_F9, gf8_muladd_FA, gf8_muladd_FB,
+ gf8_muladd_FC, gf8_muladd_FD, gf8_muladd_FE, gf8_muladd_FF
};
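
The table above dispatches one bitsliced multiply-accumulate routine per GF(2^8) constant. As an illustration only (the following helpers are not part of the patch, and the memory layout differs: the generated routines work on eight 64-bit bit planes rather than plain bytes), each gf8_muladd_XX appears to compute out = out * XX + in over GF(2^8) with the modulus 0x11D declared in ec-gf.h. A per-byte reference of the same operation could look like this sketch:

/* Illustrative reference only, not patch code: per-byte equivalent of the
 * bitsliced gf8_muladd_XX routines, assuming modulus 0x11D. */
#include <stddef.h>
#include <stdint.h>

static uint8_t gf8_mul_ref(uint8_t a, uint8_t b)
{
    uint8_t r = 0;

    while (b != 0) {
        if (b & 1)
            r ^= a;
        b >>= 1;
        /* multiply 'a' by x; 0x1D is 0x11D with the overflowed bit dropped */
        a = (a & 0x80) ? (uint8_t)((a << 1) ^ 0x1D) : (uint8_t)(a << 1);
    }

    return r;
}

/* out[i] = out[i] * factor + in[i] over GF(2^8); addition is XOR */
static void gf8_muladd_ref(uint8_t factor, uint8_t *out, const uint8_t *in,
                           size_t len)
{
    size_t i;

    for (i = 0; i < len; i++)
        out[i] = gf8_mul_ref(out[i], factor) ^ in[i];
}

With factor 0 this degenerates to a plain copy of 'in' into 'out', which matches how ec_method_encode() below starts each chunk with ec_gf_muladd[0].
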
diff --git a/xlators/cluster/ec/src/ec-gf.h b/xlators/cluster/ec/src/ec-gf.h
index 664feb46ce5..2563a203f08 100644
--- a/xlators/cluster/ec/src/ec-gf.h
+++ b/xlators/cluster/ec/src/ec-gf.h
@@ -18,97 +18,16 @@
<http://www.gnu.org/licenses/>.
*/
-/*
- * File automatically generated on Thu Jan 26 12:08:19 2012
- *
- * DO NOT MODIFY
- *
- * Multiplications in a GF(2^8) with modulus 0x11D using XOR's
- *
- */
-
-#ifndef __EC_GF_H__
-#define __EC_GF_H__
+#ifndef __EC_GF8_H__
+#define __EC_GF8_H__
#define EC_GF_BITS 8
#define EC_GF_MOD 0x11D
-#define ec_gf_load(addr) \
- do \
- { \
- __asm__ __volatile__ \
- ( \
- "\tmovdqa 0*16(%0), %%xmm0\n" \
- "\tmovdqa 1*16(%0), %%xmm1\n" \
- "\tmovdqa 2*16(%0), %%xmm2\n" \
- "\tmovdqa 3*16(%0), %%xmm3\n" \
- "\tmovdqa 4*16(%0), %%xmm4\n" \
- "\tmovdqa 5*16(%0), %%xmm5\n" \
- "\tmovdqa 6*16(%0), %%xmm6\n" \
- "\tmovdqa 7*16(%0), %%xmm7\n" \
- : \
- : "r" (addr) \
- : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
- ); \
- } while (0)
-
-#define ec_gf_store(addr) \
- do \
- { \
- __asm__ __volatile__ \
- ( \
- "\tmovdqa %%xmm0, 0*16(%0)\n" \
- "\tmovdqa %%xmm1, 1*16(%0)\n" \
- "\tmovdqa %%xmm2, 2*16(%0)\n" \
- "\tmovdqa %%xmm3, 3*16(%0)\n" \
- "\tmovdqa %%xmm4, 4*16(%0)\n" \
- "\tmovdqa %%xmm5, 5*16(%0)\n" \
- "\tmovdqa %%xmm6, 6*16(%0)\n" \
- "\tmovdqa %%xmm7, 7*16(%0)\n" \
- : \
- : "r" (addr) \
- : "memory" \
- ); \
- } while (0)
-
-#define ec_gf_clear() \
- do \
- { \
- __asm__ __volatile__ \
- ( \
- "\tpxor %xmm0, %xmm0\n" \
- "\tpxor %xmm1, %xmm1\n" \
- "\tpxor %xmm2, %xmm2\n" \
- "\tpxor %xmm3, %xmm3\n" \
- "\tpxor %xmm4, %xmm4\n" \
- "\tpxor %xmm5, %xmm5\n" \
- "\tpxor %xmm6, %xmm6\n" \
- "\tpxor %xmm7, %xmm7\n" \
- : \
- : \
- : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
- ); \
- } while (0)
-
-#define ec_gf_xor(addr) \
- do \
- { \
- __asm__ __volatile__ \
- ( \
- "\tpxor 0*16(%0), %%xmm0\n" \
- "\tpxor 1*16(%0), %%xmm1\n" \
- "\tpxor 2*16(%0), %%xmm2\n" \
- "\tpxor 3*16(%0), %%xmm3\n" \
- "\tpxor 4*16(%0), %%xmm4\n" \
- "\tpxor 5*16(%0), %%xmm5\n" \
- "\tpxor 6*16(%0), %%xmm6\n" \
- "\tpxor 7*16(%0), %%xmm7\n" \
- : \
- : "r" (addr) \
- : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
- ); \
- } while (0)
+#define EC_GF_SIZE (1 << EC_GF_BITS)
+#define EC_GF_WORD_SIZE sizeof(uint64_t)
-extern void (* ec_gf_mul_table[])(void);
+extern void (* ec_gf_muladd[])(uint8_t * out, uint8_t * in,
+ unsigned int width);
-#endif /* __EC_GF_H__ */
+#endif /* __EC_GF8_H__ */
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
index 484c5e40189..a31220ecbc1 100644
--- a/xlators/cluster/ec/src/ec-inode-read.c
+++ b/xlators/cluster/ec/src/ec-inode-read.c
@@ -975,7 +975,7 @@ int32_t ec_readv_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
ec_cbk_data_t * ans = NULL;
struct iobref * iobref = NULL;
struct iobuf * iobuf = NULL;
- uint8_t * ptr = NULL, * buff = NULL;
+ uint8_t * buff = NULL, * ptr;
size_t fsize = 0, size = 0, max = 0;
int32_t i = 0;
@@ -994,17 +994,17 @@ int32_t ec_readv_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
fsize = cbk->op_ret;
size = fsize * ec->fragments;
- ptr = GF_MALLOC(size + EC_BUFFER_ALIGN_SIZE - 1, gf_common_mt_char);
- if (ptr == NULL)
+ buff = GF_MALLOC(size, gf_common_mt_char);
+ if (buff == NULL)
{
goto out;
}
- buff = GF_ALIGN_BUF(ptr, EC_BUFFER_ALIGN_SIZE);
+ ptr = buff;
for (i = 0, ans = cbk; ans != NULL; i++, ans = ans->next)
{
values[i] = ans->idx;
- blocks[i] = buff;
- buff += ec_iov_copy_to(buff, ans->vector, ans->int32, 0, fsize);
+ blocks[i] = ptr;
+ ptr += ec_iov_copy_to(ptr, ans->vector, ans->int32, 0, fsize);
}
iobref = iobref_new();
@@ -1028,8 +1028,8 @@ int32_t ec_readv_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
iobuf_unref(iobuf);
- GF_FREE(ptr);
- ptr = NULL;
+ GF_FREE(buff);
+ buff = NULL;
vector[0].iov_base += fop->head;
vector[0].iov_len -= fop->head;
@@ -1079,7 +1079,7 @@ out:
{
iobref_unref(iobref);
}
- GF_FREE(ptr);
+ GF_FREE(buff);
return 0;
}
diff --git a/xlators/cluster/ec/src/ec-method.c b/xlators/cluster/ec/src/ec-method.c
index 83b603bd14a..8dd5a8b88b9 100644
--- a/xlators/cluster/ec/src/ec-method.c
+++ b/xlators/cluster/ec/src/ec-method.c
@@ -21,28 +21,27 @@
#include <string.h>
#include <inttypes.h>
+#include "ec-gf.h"
#include "ec-method.h"
-#define EC_METHOD_WORD_SIZE 16
-
-static uint32_t GfPow[EC_METHOD_SIZE << 1];
-static uint32_t GfLog[EC_METHOD_SIZE << 1];
+static uint32_t GfPow[EC_GF_SIZE << 1];
+static uint32_t GfLog[EC_GF_SIZE << 1];
void ec_method_initialize(void)
{
uint32_t i;
GfPow[0] = 1;
- GfLog[0] = EC_METHOD_SIZE;
- for (i = 1; i < EC_METHOD_SIZE; i++)
+ GfLog[0] = EC_GF_SIZE;
+ for (i = 1; i < EC_GF_SIZE; i++)
{
GfPow[i] = GfPow[i - 1] << 1;
- if (GfPow[i] >= EC_METHOD_SIZE)
+ if (GfPow[i] >= EC_GF_SIZE)
{
GfPow[i] ^= EC_GF_MOD;
}
- GfPow[i + EC_METHOD_SIZE - 1] = GfPow[i];
- GfLog[GfPow[i] + EC_METHOD_SIZE - 1] = GfLog[GfPow[i]] = i;
+ GfPow[i + EC_GF_SIZE - 1] = GfPow[i];
+ GfLog[GfPow[i] + EC_GF_SIZE - 1] = GfLog[GfPow[i]] = i;
}
}
@@ -61,11 +60,11 @@ static uint32_t ec_method_div(uint32_t a, uint32_t b)
{
if (a)
{
- return GfPow[EC_METHOD_SIZE - 1 + GfLog[a] - GfLog[b]];
+ return GfPow[EC_GF_SIZE - 1 + GfLog[a] - GfLog[b]];
}
return 0;
}
- return EC_METHOD_SIZE;
+ return EC_GF_SIZE;
}
size_t ec_method_encode(size_t size, uint32_t columns, uint32_t row,
@@ -77,15 +76,13 @@ size_t ec_method_encode(size_t size, uint32_t columns, uint32_t row,
row++;
for (j = 0; j < size; j++)
{
- ec_gf_load(in);
+ ec_gf_muladd[0](out, in, EC_METHOD_WIDTH);
in += EC_METHOD_CHUNK_SIZE;
for (i = 1; i < columns; i++)
{
- ec_gf_mul_table[row]();
- ec_gf_xor(in);
+ ec_gf_muladd[row](out, in, EC_METHOD_WIDTH);
in += EC_METHOD_CHUNK_SIZE;
}
- ec_gf_store(out);
out += EC_METHOD_CHUNK_SIZE;
}
@@ -95,31 +92,29 @@ size_t ec_method_encode(size_t size, uint32_t columns, uint32_t row,
size_t ec_method_decode(size_t size, uint32_t columns, uint32_t * rows,
uint8_t ** in, uint8_t * out)
{
- uint32_t i, j, k;
- uint32_t f, off;
+ uint32_t i, j, k, off, last, value;
+ uint32_t f;
uint8_t inv[EC_METHOD_MAX_FRAGMENTS][EC_METHOD_MAX_FRAGMENTS + 1];
uint8_t mtx[EC_METHOD_MAX_FRAGMENTS][EC_METHOD_MAX_FRAGMENTS];
- uint8_t * p[EC_METHOD_MAX_FRAGMENTS];
+ uint8_t dummy[EC_METHOD_CHUNK_SIZE];
size /= EC_METHOD_CHUNK_SIZE;
memset(inv, 0, sizeof(inv));
memset(mtx, 0, sizeof(mtx));
+ memset(dummy, 0, sizeof(dummy));
for (i = 0; i < columns; i++)
{
inv[i][i] = 1;
inv[i][columns] = 1;
}
- k = 0;
for (i = 0; i < columns; i++)
{
- mtx[k][columns - 1] = 1;
+ mtx[i][columns - 1] = 1;
for (j = columns - 1; j > 0; j--)
{
- mtx[k][j - 1] = ec_method_mul(mtx[k][j], rows[i] + 1);
+ mtx[i][j - 1] = ec_method_mul(mtx[i][j], rows[i] + 1);
}
- p[k] = in[i];
- k++;
}
for (i = 0; i < columns; i++)
@@ -148,25 +143,24 @@ size_t ec_method_decode(size_t size, uint32_t columns, uint32_t * rows,
{
for (i = 0; i < columns; i++)
{
- ec_gf_load(p[0] + off);
+ last = 0;
j = 0;
- while (j < columns)
+ do
{
- k = j + 1;
- while (inv[i][k] == 0)
+ while (inv[i][j] == 0)
{
- k++;
+ j++;
}
- ec_gf_mul_table[ec_method_div(inv[i][j], inv[i][k])]();
- if (k < columns)
+ if (j < columns)
{
- ec_gf_xor(p[k] + off);
+ value = ec_method_div(last, inv[i][j]);
+ last = inv[i][j];
+ ec_gf_muladd[value](out, in[j] + off, EC_METHOD_WIDTH);
+ j++;
}
- j = k;
- }
- ec_gf_store(out);
+ } while (j < columns);
+ ec_gf_muladd[last](out, dummy, EC_METHOD_WIDTH);
out += EC_METHOD_CHUNK_SIZE;
- in[i] += EC_METHOD_CHUNK_SIZE;
}
off += EC_METHOD_CHUNK_SIZE;
}
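
One way to read the restructured decode loop above (my interpretation, not stated in the patch): instead of keeping per-fragment pointers and a loaded register set, it folds the row sum c0*in[0] + c1*in[1] + ... + ck*in[k] directly into the output buffer by chaining muladd calls with the ratio of successive non-zero coefficients, then applies the last coefficient against the zero-filled 'dummy' block. A byte-wise sketch of that accumulation, using hypothetical helpers gf8_mul_ref()/gf8_div_ref() (repeated here so the sketch is self-contained):

/* Sketch of the chained accumulation; illustrative only, not patch code. */
#include <stddef.h>
#include <stdint.h>

static uint8_t gf8_mul_ref(uint8_t a, uint8_t b)
{
    uint8_t r = 0;

    while (b != 0) {
        if (b & 1)
            r ^= a;
        b >>= 1;
        a = (a & 0x80) ? (uint8_t)((a << 1) ^ 0x1D) : (uint8_t)(a << 1);
    }
    return r;
}

/* a / b in GF(2^8); returns 0 when a == 0, mirroring ec_method_div().
 * 'b' is assumed non-zero, as in the decode loop (zero coefficients
 * are skipped there). */
static uint8_t gf8_div_ref(uint8_t a, uint8_t b)
{
    uint8_t x;

    if (a == 0)
        return 0;
    for (x = 1; x != 0; x++)
        if (gf8_mul_ref(b, x) == a)
            return x;
    return 0;
}

/* out = c[0]*in[0] + ... + c[n-1]*in[n-1], accumulated in place: each step
 * scales the partial sum by c[j-1]/c[j] before adding in[j], and the last
 * coefficient is applied at the end (the role of the zero 'dummy' block in
 * ec_method_decode). Assumes all c[j] != 0. */
static void decode_row_ref(const uint8_t *c, uint8_t *const *in,
                           unsigned int n, uint8_t *out, size_t len)
{
    unsigned int j;
    size_t k;
    uint8_t last = 0;

    for (j = 0; j < n; j++) {
        uint8_t factor = gf8_div_ref(last, c[j]); /* 0 on the first pass */

        for (k = 0; k < len; k++)
            out[k] = gf8_mul_ref(out[k], factor) ^ in[j][k];
        last = c[j];
    }

    for (k = 0; k < len; k++)
        out[k] = gf8_mul_ref(out[k], last); /* final scaling by c[n-1] */
}

For two coefficients this reduces to out = c1*(((c0/c1)*in[0]) + in[1]) = c0*in[0] + c1*in[1], which is why no separate scratch buffer is needed.
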
diff --git a/xlators/cluster/ec/src/ec-method.h b/xlators/cluster/ec/src/ec-method.h
index 64e724ff3f0..57e81a51027 100644
--- a/xlators/cluster/ec/src/ec-method.h
+++ b/xlators/cluster/ec/src/ec-method.h
@@ -25,13 +25,10 @@
#define EC_METHOD_MAX_FRAGMENTS 16
-#define EC_METHOD_WORD_SIZE 16
+#define EC_METHOD_WORD_SIZE 64
-#define EC_BUFFER_ALIGN_SIZE EC_METHOD_WORD_SIZE
-
-#define EC_METHOD_BITS EC_GF_BITS
-#define EC_METHOD_SIZE (1 << (EC_METHOD_BITS))
-#define EC_METHOD_CHUNK_SIZE (EC_METHOD_WORD_SIZE * EC_METHOD_BITS)
+#define EC_METHOD_CHUNK_SIZE (EC_METHOD_WORD_SIZE * EC_GF_BITS)
+#define EC_METHOD_WIDTH (EC_METHOD_WORD_SIZE / EC_GF_WORD_SIZE)
void ec_method_initialize(void);
size_t ec_method_encode(size_t size, uint32_t columns, uint32_t row,