diff options
author | Xavier Hernandez <xhernandez@datalab.es> | 2015-12-01 11:50:54 +0100 |
---|---|---|
committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2016-09-08 10:08:25 -0700 |
commit | 593b7a83f7408e59ab7b3ef7dfc4fe4096d6e3cd (patch) | |
tree | 1a60ff784e7759c5dddb2352c5526f16e8696013 | |
parent | 8a6236e4cfc7bf86e881e5c770a19040a6060ad0 (diff) |
cluster/ec: Add support for hardware acceleration
This patch implements functionality for fast encoding/decoding
using hardware support. Currently, optimized support for x86_64, SSE
and AVX is added.
Additionally, this patch implements a caching mechanism for inverse
matrices to reduce computation time, as well as a new method for
computing the inverse that takes quadratic time instead of cubic.
Finally some unnecessary memory copies have been eliminated to
further increase performance.
Change-Id: I26c75f26fb4201bd22b51335448ea4357235065a
BUG: 1289922
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/12837
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
41 files changed, 21260 insertions, 12302 deletions
diff --git a/configure.ac b/configure.ac index 0a7c6ddf69c..f66ca1577f9 100644 --- a/configure.ac +++ b/configure.ac @@ -1346,6 +1346,93 @@ fi AM_CONDITIONAL([ENABLE_EXPERIMENTAL], [test x$BUILD_EXPERIMENTAL = xyes]) #end experimental section +# EC dynamic code generation section + +EC_DYNAMIC_SUPPORT="none" +EC_DYNAMIC_ARCH="none" + +AC_ARG_ENABLE([ec-dynamic], + AC_HELP_STRING([--disable-ec-dynamic], + [Disable all dynamic code generation extensions for EC module])) + +AC_ARG_ENABLE([ec-dynamic-intel], + AC_HELP_STRING([--disable-ec-dynamic-intel], + [Disable all INTEL dynamic code generation extensions for EC module])) + +AC_ARG_ENABLE([ec-dynamic-arm], + AC_HELP_STRING([--disable-ec-dynamic-arm], + [Disable all ARM dynamic code generation extensions for EC module])) + +AC_ARG_ENABLE([ec-dynamic-x64], + AC_HELP_STRING([--disable-ec-dynamic-x64], + [Disable dynamic INTEL x64 code generation for EC module])) + +AC_ARG_ENABLE([ec-dynamic-sse], + AC_HELP_STRING([--disable-ec-dynamic-sse], + [Disable dynamic INTEL SSE code generation for EC module])) + +AC_ARG_ENABLE([ec-dynamic-avx], + AC_HELP_STRING([--disable-ec-dynamic-avx], + [Disable dynamic INTEL AVX code generation for EC module])) + +AC_ARG_ENABLE([ec-dynamic-neon], + AC_HELP_STRING([--disable-ec-dynamic-neon], + [Disable dynamic ARM NEON code generation for EC module])) + +if test "x$enable_ec_dynamic" != "xno"; then + case $host in + x86_64*) + if test "x$enable_ec_dynamic_intel" != "xno"; then + if test "x$enable_ec_dynamic_x64" != "xno"; then + EC_DYNAMIC_SUPPORT="$EC_DYNAMIC_SUPPORT x64" + AC_DEFINE(USE_EC_DYNAMIC_X64, 1, [Defined if using dynamic INTEL x64 code]) + fi + if test "x$enable_ec_dynamic_sse" != "xno"; then + EC_DYNAMIC_SUPPORT="$EC_DYNAMIC_SUPPORT sse" + AC_DEFINE(USE_EC_DYNAMIC_SSE, 1, [Defined if using dynamic INTEL SSE code]) + fi + if test "x$enable_ec_dynamic_avx" != "xno"; then + EC_DYNAMIC_SUPPORT="$EC_DYNAMIC_SUPPORT avx" + AC_DEFINE(USE_EC_DYNAMIC_AVX, 1, [Defined if using dynamic 
INTEL AVX code]) + fi + + if test "x$EC_DYNAMIC_SUPPORT" != "xnone"; then + EC_DYNAMIC_ARCH="intel" + fi + fi + ;; + arm*) + if test "x$enable_ec_dynamic_arm" != "xno"; then + if test "x$enable_ec_dynamic_neon" != "xno"; then + EC_DYNAMIC_SUPPORT="$EC_DYNAMIC_SUPPORT neon" + AC_DEFINE(USE_EC_DYNAMIC_NEON, 1, [Defined if using dynamic ARM NEON code]) + fi + + if test "x$EC_DYNAMIC_SUPPORT" != "xnone"; then + EC_DYNAMIC_ARCH="arm" + fi + fi + ;; + esac + + EC_DYNAMIC_SUPPORT="${EC_DYNAMIC_SUPPORT#none }" +fi + +AM_CONDITIONAL([ENABLE_EC_DYNAMIC_INTEL], [test "x$EC_DYNAMIC_ARCH" = "xintel"]) +AM_CONDITIONAL([ENABLE_EC_DYNAMIC_ARM], [test "x$EC_DYNAMIC_ARCH" = "xarm"]) + +AM_CONDITIONAL([ENABLE_EC_DYNAMIC_X64], [test "x${EC_DYNAMIC_SUPPORT##*x64*}" = "x"]) +AM_CONDITIONAL([ENABLE_EC_DYNAMIC_SSE], [test "x${EC_DYNAMIC_SUPPORT##*sse*}" = "x"]) +AM_CONDITIONAL([ENABLE_EC_DYNAMIC_AVX], [test "x${EC_DYNAMIC_SUPPORT##*avx*}" = "x"]) +AM_CONDITIONAL([ENABLE_EC_DYNAMIC_NEON], [test "x${EC_DYNAMIC_SUPPORT##*neon*}" = "x"]) + +AC_SUBST(USE_EC_DYNAMIC_X64) +AC_SUBST(USE_EC_DYNAMIC_SSE) +AC_SUBST(USE_EC_DYNAMIC_AVX) +AC_SUBST(USE_EC_DYNAMIC_NEON) + +# end EC dynamic code generation section + dnl libglusterfs.so uses math functions GF_LDADD="${GF_LDADD} ${MATH_LIB}" @@ -1442,4 +1529,5 @@ echo "Data Classification : $BUILD_GFDB" echo "firewalld-config : $BUILD_FIREWALLD" echo "Experimental xlators : $BUILD_EXPERIMENTAL" echo "Events : $BUILD_EVENTS" +echo "EC dynamic support : $EC_DYNAMIC_SUPPORT" echo diff --git a/tests/basic/ec/ec-cpu-extensions.t b/tests/basic/ec/ec-cpu-extensions.t new file mode 100644 index 00000000000..a599a316925 --- /dev/null +++ b/tests/basic/ec/ec-cpu-extensions.t @@ -0,0 +1,61 @@ +#!/bin/bash + +DISPERSE=6 +REDUNDANCY=2 + +. $(dirname $0)/../../include.rc +. 
$(dirname $0)/../../volume.rc + +TESTS_EXPECTED_IN_LOOP=96 + +function check_contents +{ + local src=$1 + local cs=$2 + + TEST cp $src $M0/file + TEST [ -f $M0/file ] + + for ext in none x64 sse avx; do + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + TEST $CLI volume set $V0 disperse.cpu-extensions $ext + TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 + EXPECT_WITHIN $CHILD_UP_TIMEOUT "$DISPERSE" ec_child_up_count $V0 0 + + EXPECT "$cs" echo $(sha1sum $M0/file | awk '{ print $1 }') + done + + TEST rm -f $M0/file +} + +cleanup + +tmp=`mktemp -p ${LOGDIR} -d -t ${0##*/}.XXXXXX` +if [ ! -d $tmp ]; then + exit 1 +fi + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 redundancy $REDUNDANCY $H0:$B0/${V0}{1..$DISPERSE} +TEST $CLI volume set $V0 performance.flush-behind off +EXPECT 'Created' volinfo_field $V0 'Status' +TEST $CLI volume start $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT 'Started' volinfo_field $V0 'Status' + +TEST dd if=/dev/urandom of=$tmp/file bs=1048576 count=1 +cs_file=$(sha1sum $tmp/file | awk '{ print $1 }') + +for ext in none x64 sse avx; do + TEST $CLI volume set $V0 disperse.cpu-extensions $ext + TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 + EXPECT_WITHIN $CHILD_UP_TIMEOUT "$DISPERSE" ec_child_up_count $V0 0 + + check_contents $tmp/file $cs_file + + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +done + +TEST rm -rf $tmp + +cleanup diff --git a/xlators/cluster/ec/src/Makefile.am b/xlators/cluster/ec/src/Makefile.am index cbdceefdbe0..c5d9ab1812b 100644 --- a/xlators/cluster/ec/src/Makefile.am +++ b/xlators/cluster/ec/src/Makefile.am @@ -12,8 +12,11 @@ ec_sources += ec-dir-write.c ec_sources += ec-inode-read.c ec_sources += ec-inode-write.c ec_sources += ec-combine.c -ec_sources += ec-gf.c ec_sources += ec-method.c +ec_sources += ec-galois.c +ec_sources += ec-code.c +ec_sources += ec-code-c.c +ec_sources += ec-gf8.c ec_sources += ec-heal.c ec_sources += ec-heald.c @@ -24,10 +27,34 @@ ec_headers += ec-data.h 
ec_headers += ec-fops.h ec_headers += ec-common.h ec_headers += ec-combine.h -ec_headers += ec-gf.h ec_headers += ec-method.h +ec_headers += ec-galois.h +ec_headers += ec-code.h +ec_headers += ec-code-c.h +ec_headers += ec-gf8.h ec_headers += ec-heald.h ec_headers += ec-messages.h +ec_headers += ec-types.h + +if ENABLE_EC_DYNAMIC_INTEL + ec_sources += ec-code-intel.c + ec_headers += ec-code-intel.h +endif + +if ENABLE_EC_DYNAMIC_X64 + ec_sources += ec-code-x64.c + ec_headers += ec-code-x64.h +endif + +if ENABLE_EC_DYNAMIC_SSE + ec_sources += ec-code-sse.c + ec_headers += ec-code-sse.h +endif + +if ENABLE_EC_DYNAMIC_AVX + ec_sources += ec-code-avx.c + ec_headers += ec-code-avx.h +endif ec_ext_sources = $(top_builddir)/xlators/lib/src/libxlator.c diff --git a/xlators/cluster/ec/src/ec-code-avx.c b/xlators/cluster/ec/src/ec-code-avx.c new file mode 100644 index 00000000000..92bd3e83c5e --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-avx.c @@ -0,0 +1,116 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <errno.h> + +#include "ec-code-intel.h" + +static void +ec_code_avx_prolog(ec_code_builder_t *builder) +{ + builder->loop = builder->address; +} + +static void +ec_code_avx_epilog(ec_code_builder_t *builder) +{ + ec_code_intel_op_add_i2r(builder, 32, REG_DX); + ec_code_intel_op_add_i2r(builder, 32, REG_DI); + ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX); + ec_code_intel_op_jne(builder, builder->loop); + + ec_code_intel_op_ret(builder, 0); +} + +static void +ec_code_avx_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, + uint32_t bit) +{ + if (builder->linear) { + ec_code_intel_op_mov_m2avx(builder, REG_SI, REG_DX, 1, + idx * builder->width * builder->bits + + bit * builder->width, + dst); + } else { + if (builder->base != idx) { + ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, + REG_AX); + builder->base = idx; + } + ec_code_intel_op_mov_m2avx(builder, REG_AX, REG_DX, 1, + bit * builder->width, dst); + } +} + +static void +ec_code_avx_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit) +{ + ec_code_intel_op_mov_avx2m(builder, src, REG_DI, REG_NULL, 0, + bit * builder->width); +} + +static void +ec_code_avx_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + ec_code_intel_op_mov_avx2avx(builder, src, dst); +} + +static void +ec_code_avx_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + ec_code_intel_op_xor_avx2avx(builder, src, dst); +} + +static void +ec_code_avx_xor3(ec_code_builder_t *builder, uint32_t dst, uint32_t src1, + uint32_t src2) +{ + ec_code_intel_op_mov_avx2avx(builder, src1, dst); + ec_code_intel_op_xor_avx2avx(builder, src2, dst); +} + +static void +ec_code_avx_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, + uint32_t bit) +{ + if (builder->linear) { + ec_code_intel_op_xor_m2avx(builder, REG_SI, REG_DX, 1, + idx * builder->width * builder->bits + + bit * builder->width, + dst); + } else { + if (builder->base != idx) { + 
ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, + REG_AX); + builder->base = idx; + } + ec_code_intel_op_xor_m2avx(builder, REG_AX, REG_DX, 1, + bit * builder->width, dst); + } +} + +static char *ec_code_avx_needed_flags[] = { + "avx2", + NULL +}; + +ec_code_gen_t ec_code_gen_avx = { + .name = "avx", + .flags = ec_code_avx_needed_flags, + .width = 32, + .prolog = ec_code_avx_prolog, + .epilog = ec_code_avx_epilog, + .load = ec_code_avx_load, + .store = ec_code_avx_store, + .copy = ec_code_avx_copy, + .xor2 = ec_code_avx_xor2, + .xor3 = ec_code_avx_xor3, + .xorm = ec_code_avx_xorm +}; diff --git a/xlators/cluster/ec/src/ec-code-avx.h b/xlators/cluster/ec/src/ec-code-avx.h new file mode 100644 index 00000000000..fdca4ad2c8f --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-avx.h @@ -0,0 +1,18 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_CODE_AVX_H__ +#define __EC_CODE_AVX_H__ + +#include "ec-code.h" + +extern ec_code_gen_t ec_code_gen_avx; + +#endif /* __EC_CODE_AVX_H__ */ diff --git a/xlators/cluster/ec/src/ec-code-c.c b/xlators/cluster/ec/src/ec-code-c.c new file mode 100644 index 00000000000..7387f3ea435 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-c.c @@ -0,0 +1,11431 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <inttypes.h> +#include <string.h> + +#include "ec-method.h" +#include "ec-code-c.h" + +#define WIDTH (EC_METHOD_WORD_SIZE / sizeof(uint64_t)) + +static void gf8_muladd_00(void *out, void *in) +{ + memcpy(out, in, EC_METHOD_WORD_SIZE * 8); +} + +static void gf8_muladd_01(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + out_ptr[0] ^= in_ptr[0]; + out_ptr[WIDTH] ^= in_ptr[WIDTH]; + out_ptr[WIDTH * 2] ^= in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] ^= in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] ^= in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] ^= in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] ^= in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] ^= in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_02(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in7; + out1 = in0; + out7 = in6; + out5 = in4; + out6 = in5; + out3 = in2 ^ in7; + out4 = in3 ^ in7; + out2 = in1 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_03(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t 
*)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in0 ^ in7; + tmp0 = in2 ^ in7; + out1 = in0 ^ in1; + out7 = in6 ^ in7; + out5 = in4 ^ in5; + out6 = in5 ^ in6; + out4 = in3 ^ in4 ^ in7; + out2 = tmp0 ^ in1; + out3 = tmp0 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_04(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in6; + out1 = in7; + out7 = in5; + out6 = in4; + tmp0 = in6 ^ in7; + out2 = in0 ^ in6; + out5 = in3 ^ in7; + out3 = tmp0 ^ in1; + out4 = tmp0 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 
^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_05(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in0 ^ in6; + out1 = in1 ^ in7; + out7 = in5 ^ in7; + out6 = in4 ^ in6; + out2 = out0 ^ in2; + out3 = out1 ^ in3 ^ in6; + out5 = out7 ^ in3; + out4 = out6 ^ in2 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_06(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in6 ^ in7; + tmp0 = in1 ^ in6; + out1 = in0 ^ in7; + out7 = in5 ^ in6; + out6 = in4 ^ in5; + out4 = in2 ^ in3 ^ in6; + out5 = in3 ^ in4 ^ in7; + out3 = tmp0 ^ in2; + out2 = tmp0 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + 
out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_07(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in6; + tmp1 = in5 ^ in6; + tmp2 = in0 ^ in7; + tmp3 = tmp0 ^ in3; + out6 = tmp1 ^ in4; + out7 = tmp1 ^ in7; + out0 = tmp2 ^ in6; + out1 = tmp2 ^ in1; + out3 = tmp3 ^ in1; + out4 = tmp3 ^ in4; + out5 = out4 ^ out7 ^ in2; + out2 = tmp0 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_08(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH 
* 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in5; + out1 = in6; + out7 = in4; + out6 = in3 ^ in7; + out3 = in0 ^ in5 ^ in6; + out5 = in2 ^ in6 ^ in7; + out2 = in5 ^ in7; + out4 = out2 ^ in1 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_09(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in0 ^ in5; + tmp0 = in3 ^ in6; + out1 = in1 ^ in6; + out7 = in4 ^ in7; + out2 = in2 ^ in5 ^ in7; + out3 = tmp0 ^ out0; + out6 = tmp0 ^ in7; + out4 = out1 ^ out7 ^ in5; + out5 = out2 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_0A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < 
WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in5 ^ in7; + out1 = in0 ^ in6; + out7 = in4 ^ in6; + out2 = in1 ^ in5; + out6 = out0 ^ in3; + out3 = out0 ^ out1 ^ in2; + out5 = out7 ^ in2 ^ in7; + out4 = out2 ^ in3 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_0B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in5; + tmp1 = in0 ^ in6; + tmp2 = in4 ^ in7; + out0 = in0 ^ in5 ^ in7; + out2 = tmp0 ^ in1; + out1 = tmp1 ^ in1; + out6 = tmp1 ^ out0 ^ in3; + out7 = tmp2 ^ in6; + out4 = tmp2 ^ out6 ^ in1; + out3 = out6 ^ in0 ^ in2; + out5 = tmp0 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 
5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_0C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in5 ^ in6; + out1 = in6 ^ in7; + out7 = in4 ^ in5; + tmp0 = in1 ^ in5; + tmp1 = in0 ^ in7; + out5 = in2 ^ in3 ^ in6; + out6 = in3 ^ in4 ^ in7; + out2 = tmp1 ^ out0; + out4 = tmp0 ^ in2; + out3 = tmp0 ^ tmp1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_0D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in4 ^ in5; + tmp1 = in5 ^ in6; + out1 = in1 ^ in6 ^ in7; + out7 = tmp0 ^ in7; + out4 = tmp0 ^ in1 
^ in2; + out0 = tmp1 ^ in0; + tmp2 = tmp1 ^ in3; + out6 = tmp2 ^ out7; + out2 = out0 ^ in2 ^ in7; + out3 = out0 ^ out1 ^ in3; + out5 = tmp2 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_0E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in2 ^ in5; + tmp2 = in5 ^ in6; + out1 = in0 ^ in6 ^ in7; + out3 = tmp0 ^ tmp1; + out2 = tmp0 ^ tmp2; + tmp3 = tmp1 ^ in3; + out7 = tmp2 ^ in4; + out0 = tmp2 ^ in7; + out4 = tmp3 ^ in1 ^ in7; + out5 = tmp3 ^ out7; + out6 = out0 ^ out5 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_0F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, 
out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in6 ^ in7; + tmp1 = tmp0 ^ in1; + tmp2 = tmp0 ^ in5; + out1 = tmp1 ^ in0; + out7 = tmp2 ^ in4; + out0 = tmp2 ^ in0; + out6 = out7 ^ in3; + out5 = out6 ^ in2 ^ in7; + tmp3 = tmp1 ^ out0 ^ in2; + out4 = tmp1 ^ out5; + out2 = tmp3 ^ in6; + out3 = tmp3 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_10(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in4; + out1 = in5; + out7 = in3 ^ in7; + tmp0 = in6 ^ in7; + out2 = in4 ^ in6; + tmp1 = out2 ^ in5; + out6 = tmp0 ^ in2; + out3 = tmp0 ^ tmp1; + out5 = out2 ^ out3 ^ in1; + out4 = tmp1 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + 
out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_11 - update the buffer 'out' in place from 'out' and 'in'.
+ *
+ * Both arguments are accessed as arrays of uint64_t. For each i in
+ * [0, WIDTH) the function loads the eight words out[i + k * WIDTH]
+ * (k = 0..7) into in0..in7, combines them with the fixed XOR network
+ * below into out0..out7, and stores outK ^ in[i + K * WIDTH] back to
+ * out[i + K * WIDTH]. In total 8 * WIDTH words of 'out' are read and
+ * written and 8 * WIDTH words of 'in' are read; 'in' is never written.
+ * Note that the locals in0..in7 hold the previous contents of 'out',
+ * not data taken from the 'in' argument.
+ *
+ * Here every output is a direct XOR of the loaded words.
+ *
+ * NOTE(review): judging by the name, this presumably implements
+ * "out = out times 0x11, plus in" over GF(2^8) on bit-sliced data -
+ * confirm against the code generator / callers. Both pointers are
+ * assumed to be suitably aligned for uint64_t access - TODO confirm.
+ */
+static void gf8_muladd_11(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out7 = in3;
+        out0 = in0 ^ in4;
+        out1 = in1 ^ in5;
+        out6 = in2 ^ in7;
+        out4 = in0 ^ in5 ^ in6;
+        out5 = in1 ^ in6 ^ in7;
+        out2 = in2 ^ in4 ^ in6;
+        out3 = in3 ^ in4 ^ in5 ^ in7;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_12 - same load / XOR / store-back scheme as gf8_muladd_11,
+ * with a different XOR network. tmp0, tmp1 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x12 - confirm.
+ */
+static void gf8_muladd_12(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out0 = in4 ^ in7;
+        out1 = in0 ^ in5;
+        out3 = in2 ^ in4 ^ in5;
+        tmp0 = out0 ^ in6;
+        out2 = tmp0 ^ in1;
+        tmp1 = tmp0 ^ in3;
+        out6 = tmp0 ^ out3;
+        out5 = out2 ^ in5;
+        out7 = tmp1 ^ in4;
+        out4 = tmp1 ^ out1;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_13 - same load / XOR / store-back scheme as gf8_muladd_11,
+ * with a different XOR network. tmp0, tmp1 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x13 - confirm.
+ */
+static void gf8_muladd_13(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out7 = in3 ^ in6;
+        tmp0 = in0 ^ in5;
+        tmp1 = in4 ^ in7;
+        out6 = in2 ^ in5 ^ in7;
+        out4 = tmp0 ^ out7 ^ in7;
+        out1 = tmp0 ^ in1;
+        out0 = tmp1 ^ in0;
+        out5 = tmp1 ^ in1 ^ in6;
+        out3 = tmp1 ^ out6 ^ in3;
+        out2 = out5 ^ in2;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_14 - same load / XOR / store-back scheme as gf8_muladd_11,
+ * with a different XOR network. tmp0, tmp1 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x14 - confirm.
+ */
+static void gf8_muladd_14(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out0 = in4 ^ in6;
+        out1 = in5 ^ in7;
+        out2 = in0 ^ in4;
+        tmp0 = out0 ^ in5;
+        out7 = out1 ^ in3;
+        tmp1 = out1 ^ in2;
+        out3 = tmp0 ^ in1;
+        out6 = tmp0 ^ tmp1;
+        out4 = tmp1 ^ out2;
+        out5 = out3 ^ in3 ^ in4;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_15 - same load / XOR / store-back scheme as gf8_muladd_11,
+ * with a different XOR network. tmp0 and some outputs are reused by
+ * later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x15 - confirm.
+ */
+static void gf8_muladd_15(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out7 = in3 ^ in5;
+        tmp0 = in0 ^ in4;
+        out1 = in1 ^ in5 ^ in7;
+        out5 = in1 ^ in3 ^ in6;
+        out0 = tmp0 ^ in6;
+        out2 = tmp0 ^ in2;
+        out3 = out5 ^ in4 ^ in5;
+        out6 = out2 ^ in0 ^ in7;
+        out4 = tmp0 ^ out6 ^ in5;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+/*
+ * gf8_muladd_16 - update the buffer 'out' in place from 'out' and 'in'.
+ *
+ * Both arguments are accessed as arrays of uint64_t. For each i in
+ * [0, WIDTH) the function loads the eight words out[i + k * WIDTH]
+ * (k = 0..7) into in0..in7, combines them with the fixed XOR network
+ * below into out0..out7, and stores outK ^ in[i + K * WIDTH] back to
+ * out[i + K * WIDTH]. In total 8 * WIDTH words of 'out' are read and
+ * written and 8 * WIDTH words of 'in' are read; 'in' is never written.
+ * Note that the locals in0..in7 hold the previous contents of 'out',
+ * not data taken from the 'in' argument.
+ *
+ * tmp0..tmp3 and some outputs are reused by later statements, so the
+ * statement order must be kept.
+ *
+ * NOTE(review): judging by the name, this presumably implements
+ * "out = out times 0x16, plus in" over GF(2^8) on bit-sliced data -
+ * confirm against the code generator / callers. Both pointers are
+ * assumed to be suitably aligned for uint64_t access - TODO confirm.
+ */ static void gf8_muladd_16(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in5;
+        tmp1 = in4 ^ in7;
+        tmp2 = in2 ^ in3 ^ in4;
+        out1 = tmp0 ^ in7;
+        out4 = tmp0 ^ tmp2;
+        out0 = tmp1 ^ in6;
+        tmp3 = tmp1 ^ in1;
+        out6 = out0 ^ in2 ^ in5;
+        out2 = tmp3 ^ in0;
+        out3 = out6 ^ in1;
+        out7 = tmp2 ^ out6;
+        out5 = tmp3 ^ out7;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_17 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp3 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x17 - confirm.
+ */
+static void gf8_muladd_17(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in5;
+        tmp1 = in3 ^ in6;
+        tmp2 = tmp0 ^ in4;
+        out4 = tmp0 ^ in0 ^ in3;
+        out7 = tmp1 ^ in5;
+        tmp3 = tmp1 ^ in1;
+        out6 = tmp2 ^ in7;
+        out5 = tmp3 ^ in4;
+        out3 = tmp3 ^ out6;
+        out0 = out3 ^ out4 ^ in1;
+        out2 = out3 ^ out7 ^ in0;
+        out1 = tmp2 ^ out2;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_18 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0, tmp1 and out1 are reused by later
+ * statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x18 - confirm.
+ */
+static void gf8_muladd_18(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out0 = in4 ^ in5;
+        out1 = in5 ^ in6;
+        tmp0 = in4 ^ in7;
+        out5 = in1 ^ in2 ^ in5;
+        out6 = in2 ^ in3 ^ in6;
+        out2 = tmp0 ^ out1;
+        out7 = tmp0 ^ in3;
+        tmp1 = tmp0 ^ in0;
+        out3 = tmp1 ^ in6;
+        out4 = tmp1 ^ in1;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_19 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0, tmp1 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x19 - confirm.
+ */
+static void gf8_muladd_19(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out5 = in1 ^ in2;
+        out7 = in3 ^ in4;
+        tmp0 = in0 ^ in7;
+        out6 = in2 ^ in3;
+        out1 = in1 ^ in5 ^ in6;
+        out0 = in0 ^ in4 ^ in5;
+        out4 = tmp0 ^ in1;
+        tmp1 = tmp0 ^ in6;
+        out2 = tmp1 ^ out0 ^ in2;
+        out3 = tmp1 ^ out7;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_1A - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp3 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x1A - confirm.
+ */
+static void gf8_muladd_1A(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in4 ^ in5;
+        tmp1 = in5 ^ in6;
+        tmp2 = tmp0 ^ in1;
+        out0 = tmp0 ^ in7;
+        out1 = tmp1 ^ in0;
+        tmp3 = tmp1 ^ in3;
+        out5 = tmp2 ^ in2;
+        out2 = tmp2 ^ in6;
+        out7 = tmp3 ^ out0;
+        out6 = tmp3 ^ in2;
+        out4 = tmp3 ^ out2 ^ in0;
+        out3 = tmp0 ^ out1 ^ in2;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_1B - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp4 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x1B - confirm.
+ */
+static void gf8_muladd_1B(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in4;
+        tmp1 = in2 ^ in5;
+        tmp2 = in3 ^ in6;
+        out5 = tmp0 ^ in1;
+        tmp3 = tmp0 ^ in0;
+        out6 = tmp1 ^ in3;
+        out0 = tmp1 ^ tmp3 ^ in7;
+        out7 = tmp2 ^ in4;
+        tmp4 = out5 ^ in6;
+        out3 = tmp2 ^ tmp3;
+        out2 = tmp4 ^ in5;
+        out4 = tmp4 ^ out3;
+        out1 = tmp3 ^ out2;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_1C - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp4 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x1C - confirm.
+ */
+static void gf8_muladd_1C(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in3;
+        tmp1 = in4 ^ in6;
+        tmp2 = in5 ^ in7;
+        out6 = tmp0 ^ tmp1;
+        out0 = tmp1 ^ in5;
+        out1 = tmp2 ^ in6;
+        tmp3 = tmp2 ^ in1;
+        tmp4 = tmp2 ^ in4;
+        out2 = tmp4 ^ in0;
+        out7 = tmp4 ^ in3;
+        out5 = tmp0 ^ tmp3;
+        out3 = tmp3 ^ out2;
+        out4 = out3 ^ in2 ^ in6;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_1D - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp4 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x1D - confirm.
+ */
+static void gf8_muladd_1D(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in1 ^ in3;
+        tmp1 = in0 ^ in4;
+        tmp2 = in3 ^ in4;
+        tmp3 = in2 ^ in7;
+        out3 = tmp0 ^ tmp1;
+        out5 = tmp0 ^ tmp3;
+        tmp4 = tmp1 ^ in5;
+        out6 = tmp2 ^ in2;
+        out7 = tmp2 ^ in5;
+        out2 = tmp3 ^ tmp4;
+        out4 = out3 ^ out6 ^ in6;
+        out0 = tmp4 ^ in6;
+        out1 = out2 ^ out4 ^ in4;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_1E - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp3 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x1E - confirm.
+ */
+static void gf8_muladd_1E(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in4;
+        tmp1 = in2 ^ in7;
+        tmp2 = tmp0 ^ in1;
+        out3 = tmp1 ^ tmp2;
+        out2 = tmp2 ^ in5;
+        out4 = out3 ^ in3 ^ in6;
+        tmp3 = out4 ^ in7;
+        out6 = tmp3 ^ out2 ^ in4;
+        out7 = tmp1 ^ out6;
+        out0 = out7 ^ in3;
+        out1 = tmp0 ^ out0;
+        out5 = tmp3 ^ out1;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_1F - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0, tmp1 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x1F - confirm.
+ */
+static void gf8_muladd_1F(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in4 ^ in6;
+        tmp1 = tmp0 ^ in5;
+        out7 = tmp1 ^ in3;
+        out0 = tmp1 ^ in0 ^ in7;
+        out6 = out7 ^ in2 ^ in6;
+        out1 = out0 ^ in1 ^ in4;
+        out4 = out0 ^ out6 ^ in1;
+        out3 = tmp0 ^ out4;
+        out2 = out4 ^ out7 ^ in7;
+        out5 = out3 ^ in0;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_20 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0, tmp1 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x20 - confirm.
+ */
+static void gf8_muladd_20(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in4;
+        out0 = in3 ^ in7;
+        tmp0 = in3 ^ in4;
+        tmp1 = in6 ^ in7;
+        out2 = out0 ^ in5;
+        out4 = tmp0 ^ in5;
+        out3 = tmp0 ^ tmp1;
+        out7 = tmp1 ^ in2;
+        out6 = tmp1 ^ in1 ^ in5;
+        out5 = out2 ^ out3 ^ in0;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_21 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0 and out4 are reused by later
+ * statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x21 - confirm.
+ */
+static void gf8_muladd_21(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in1 ^ in4;
+        tmp0 = in4 ^ in6;
+        out4 = in3 ^ in5;
+        out7 = in2 ^ in6;
+        out0 = in0 ^ in3 ^ in7;
+        out6 = in1 ^ in5 ^ in7;
+        out3 = tmp0 ^ in7;
+        out5 = tmp0 ^ in0;
+        out2 = out4 ^ in2 ^ in7;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_22 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. Here every output is a direct XOR of
+ * the loaded words.
+ * NOTE(review): presumably the GF(2^8) constant 0x22 - confirm.
+ */
+static void gf8_muladd_22(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out0 = in3;
+        out1 = in0 ^ in4;
+        out7 = in2 ^ in7;
+        out4 = in4 ^ in5 ^ in7;
+        out5 = in0 ^ in5 ^ in6;
+        out6 = in1 ^ in6 ^ in7;
+        out3 = in2 ^ in3 ^ in4 ^ in6;
+        out2 = in1 ^ in3 ^ in5;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_23 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. out2 is built from out4 and out6, so
+ * the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x23 - confirm.
+ */
+static void gf8_muladd_23(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out7 = in2;
+        out0 = in0 ^ in3;
+        out4 = in5 ^ in7;
+        out5 = in0 ^ in6;
+        out6 = in1 ^ in7;
+        out3 = in2 ^ in4 ^ in6;
+        out1 = in0 ^ in1 ^ in4;
+        out2 = out4 ^ out6 ^ in2 ^ in3;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_24 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0, tmp1 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x24 - confirm.
+ */
+static void gf8_muladd_24(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in4 ^ in7;
+        tmp0 = in3 ^ in4;
+        out0 = in3 ^ in6 ^ in7;
+        out3 = tmp0 ^ in1;
+        tmp1 = out0 ^ in5;
+        out6 = tmp1 ^ out3;
+        out2 = tmp1 ^ in0;
+        out7 = tmp1 ^ in2 ^ in3;
+        out5 = out2 ^ in4;
+        out4 = tmp0 ^ out7;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_25 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0 and some outputs are reused by
+ * later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x25 - confirm.
+ */
+static void gf8_muladd_25(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out3 = in1 ^ in4;
+        tmp0 = in2 ^ in5;
+        out1 = out3 ^ in7;
+        out7 = tmp0 ^ in6;
+        out6 = out1 ^ in5;
+        out4 = out7 ^ in3 ^ in7;
+        out2 = out4 ^ in0;
+        out0 = tmp0 ^ out2;
+        out5 = out0 ^ in4;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_26 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp2 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x26 - confirm.
+ */
+static void gf8_muladd_26(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out0 = in3 ^ in6;
+        tmp0 = in4 ^ in7;
+        out7 = in2 ^ in5 ^ in7;
+        tmp1 = out0 ^ in0 ^ in5;
+        out1 = tmp0 ^ in0;
+        tmp2 = tmp0 ^ in6;
+        out2 = tmp1 ^ in1;
+        out5 = tmp1 ^ in7;
+        out6 = tmp2 ^ in1;
+        out4 = tmp2 ^ out7;
+        out3 = out0 ^ out6 ^ in2;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_27 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. out0, out6 and out7 are reused by later
+ * statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x27 - confirm.
+ */
+static void gf8_muladd_27(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out7 = in2 ^ in5;
+        out0 = in0 ^ in3 ^ in6;
+        out6 = in1 ^ in4 ^ in7;
+        out4 = out7 ^ in6;
+        out2 = out0 ^ out7 ^ in1;
+        out5 = out0 ^ in7;
+        out1 = out6 ^ in0;
+        out3 = out6 ^ in2;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_28 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp2 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x28 - confirm.
+ */
+static void gf8_muladd_28(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out2 = in3;
+        out1 = in4 ^ in6;
+        out0 = in3 ^ in5 ^ in7;
+        tmp0 = out1 ^ in7;
+        tmp1 = out0 ^ in4;
+        out7 = tmp0 ^ in2;
+        tmp2 = tmp0 ^ in1;
+        out3 = tmp1 ^ in0;
+        out6 = tmp1 ^ tmp2;
+        out4 = tmp2 ^ in3;
+        out5 = out3 ^ in2 ^ in3;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_29 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp2 and out3 are reused by later
+ * statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x29 - confirm.
+ */
+static void gf8_muladd_29(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out2 = in2 ^ in3;
+        tmp0 = in1 ^ in3;
+        tmp1 = in4 ^ in6;
+        tmp2 = in0 ^ in4 ^ in7;
+        out6 = tmp0 ^ in5;
+        out4 = tmp0 ^ in6 ^ in7;
+        out1 = tmp1 ^ in1;
+        out7 = tmp1 ^ in2;
+        out3 = tmp2 ^ in5;
+        out5 = tmp2 ^ in2;
+        out0 = out3 ^ in3 ^ in4;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_2A - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0, tmp1 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x2A - confirm.
+ */
+static void gf8_muladd_2A(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out0 = in3 ^ in5;
+        tmp0 = in1 ^ in3;
+        tmp1 = in0 ^ in4;
+        out7 = in2 ^ in4 ^ in7;
+        out3 = tmp1 ^ out0 ^ in2;
+        out2 = tmp0 ^ in7;
+        out6 = tmp0 ^ in6;
+        out1 = tmp1 ^ in6;
+        out5 = tmp1 ^ out7 ^ in5;
+        out4 = out1 ^ in0 ^ in1;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_2B - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0, tmp1 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x2B - confirm.
+ */
+static void gf8_muladd_2B(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out4 = in1 ^ in6;
+        out7 = in2 ^ in4;
+        tmp0 = in0 ^ in5;
+        tmp1 = in2 ^ in7;
+        out6 = in1 ^ in3;
+        out1 = out4 ^ in0 ^ in4;
+        out3 = tmp0 ^ out7;
+        out0 = tmp0 ^ in3;
+        out5 = tmp1 ^ in0;
+        out2 = tmp1 ^ out6;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_2C - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp3 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x2C - confirm.
+ */
+static void gf8_muladd_2C(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in5;
+        tmp1 = in2 ^ in3 ^ in4;
+        tmp2 = tmp0 ^ in6;
+        out4 = tmp1 ^ in1;
+        out5 = tmp1 ^ in0 ^ in5;
+        tmp3 = tmp2 ^ in4;
+        out6 = tmp2 ^ out4;
+        out7 = tmp3 ^ in7;
+        out2 = tmp3 ^ out5;
+        out3 = out6 ^ in0;
+        out0 = tmp1 ^ out7;
+        out1 = tmp0 ^ out7;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_2D - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp3 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x2D - confirm.
+ */
+static void gf8_muladd_2D(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in3;
+        out4 = tmp0 ^ in1;
+        tmp1 = tmp0 ^ in0;
+        out2 = tmp1 ^ in6;
+        out5 = tmp1 ^ in4;
+        tmp2 = out2 ^ in2;
+        tmp3 = tmp2 ^ in5;
+        out0 = tmp3 ^ in7;
+        out7 = tmp3 ^ out5;
+        out6 = out4 ^ out7 ^ in6;
+        out3 = tmp2 ^ out6;
+        out1 = out0 ^ out6 ^ in0;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_2E - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp2 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x2E - confirm.
+ */
+static void gf8_muladd_2E(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in4 ^ in7;
+        out0 = in3 ^ in5 ^ in6;
+        tmp1 = tmp0 ^ in0;
+        tmp2 = tmp0 ^ in2;
+        out1 = tmp1 ^ in6;
+        out4 = tmp2 ^ in1;
+        out7 = tmp2 ^ in5;
+        out3 = out0 ^ out4 ^ in0;
+        out2 = out3 ^ out7 ^ in7;
+        out6 = tmp1 ^ out2;
+        out5 = tmp1 ^ out7 ^ in3;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_2F - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp2 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x2F - confirm.
+ */
+static void gf8_muladd_2F(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in3;
+        tmp1 = in2 ^ in5;
+        out4 = in1 ^ in2 ^ in7;
+        out6 = in1 ^ in3 ^ in4;
+        out5 = tmp0 ^ in2;
+        tmp2 = tmp0 ^ in6;
+        out7 = tmp1 ^ in4;
+        out0 = tmp2 ^ in5;
+        out2 = tmp2 ^ out4;
+        out1 = tmp2 ^ out6 ^ in7;
+        out3 = tmp1 ^ out1;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_30 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0, tmp1 and out3 are reused by later
+ * statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x30 - confirm.
+ */
+static void gf8_muladd_30(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in4 ^ in5;
+        tmp0 = in3 ^ in6;
+        tmp1 = in4 ^ in7;
+        out6 = in1 ^ in2 ^ in5;
+        out3 = tmp0 ^ in5;
+        out4 = tmp0 ^ in0;
+        out7 = tmp0 ^ in2;
+        out0 = tmp1 ^ in3;
+        out2 = tmp1 ^ out3;
+        out5 = tmp1 ^ in0 ^ in1;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/*
+ * gf8_muladd_31 - same load / XOR / store-back scheme as gf8_muladd_16,
+ * with a different XOR network. tmp0..tmp2 and some outputs are reused
+ * by later statements, so the statement order must be kept.
+ * NOTE(review): presumably the GF(2^8) constant 0x31 - confirm.
+ */
+static void gf8_muladd_31(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out3 = in5 ^ in6;
+        tmp0 = in4 ^ in5;
+        tmp1 = in0 ^ in3 ^ in4;
+        tmp2 = out3 ^ in2;
+        out1 = tmp0 ^ in1;
+        out0 = tmp1 ^ in7;
+        out4 = tmp1 ^ in6;
+        out6 = tmp2 ^ in1;
+        out2 = tmp2 ^ out0 ^ in0;
+        out5 = out1 ^ in0 ^ in7;
+        out7 = tmp0 ^ out2;
+
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void gf8_muladd_32(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t
in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in3 ^ in4; + out7 = in2 ^ in3; + tmp0 = in5 ^ in6; + tmp1 = in0 ^ in7; + out6 = in1 ^ in2; + out1 = in0 ^ in4 ^ in5; + out2 = tmp0 ^ out0 ^ in1; + out3 = tmp0 ^ out7 ^ in7; + out4 = tmp1 ^ in6; + out5 = tmp1 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_33(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + tmp1 = in0 ^ in4; + tmp2 = in1 ^ in5; + out6 = in1 ^ in2 ^ in6; + out7 = tmp0 ^ in7; + out0 = tmp1 ^ in3; + out1 = tmp1 ^ tmp2; + tmp3 = tmp2 ^ in7; + tmp4 = tmp2 ^ in4 ^ in6; + out5 = tmp3 ^ in0; + out3 = tmp3 ^ out6; + out4 = tmp4 ^ out5; + out2 = tmp0 ^ tmp4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_34(void *out, void 
*in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in4; + tmp1 = in4 ^ in5; + tmp2 = tmp0 ^ in1; + tmp3 = tmp0 ^ in6; + out1 = tmp1 ^ in7; + tmp4 = tmp1 ^ in2; + out5 = tmp2 ^ in0; + out3 = tmp2 ^ out1; + out0 = tmp3 ^ in7; + out7 = tmp3 ^ tmp4; + out6 = tmp4 ^ in1; + out2 = out3 ^ out5 ^ in3; + out4 = tmp4 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_35(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in6; + tmp1 = in5 ^ in7; + out7 = tmp0 ^ tmp1 ^ in3; + out3 = tmp1 ^ in1; + out1 = out3 ^ in4; + tmp2 = out1 ^ in7; + out5 = tmp2 ^ in0 ^ in3; + out6 = tmp0 ^ tmp2; + out0 = out3 ^ out5 ^ in6; + out4 = tmp0 ^ out0; + 
out2 = out4 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_36(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in0 ^ in2; + tmp0 = in1 ^ in3; + out0 = in3 ^ in4 ^ in6; + out6 = in1 ^ in2 ^ in4; + out5 = tmp0 ^ in0; + tmp1 = out5 ^ in5; + out2 = tmp1 ^ in4; + out3 = tmp1 ^ out4; + out1 = tmp0 ^ out2 ^ in7; + out7 = out3 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_37(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = 
out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in2; + tmp1 = in2 ^ in4; + tmp2 = tmp0 ^ in6; + out3 = tmp0 ^ in5; + out4 = tmp1 ^ in0; + out6 = tmp2 ^ in4; + out1 = out3 ^ out4 ^ in7; + tmp3 = out4 ^ in1 ^ in3; + out7 = tmp3 ^ out1; + out2 = tmp3 ^ in5; + out5 = tmp1 ^ out2; + out0 = tmp2 ^ tmp3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_38(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0 ^ in3; + tmp0 = in3 ^ in4; + tmp1 = in5 ^ in7; + tmp2 = out3 ^ in1; + out2 = tmp0 ^ in6; + out0 = tmp0 ^ tmp1; + out4 = tmp1 ^ tmp2; + out7 = out2 ^ in2; + out1 = out2 ^ in3 ^ in5; + out6 = out4 ^ in0 ^ in2; + out5 = tmp2 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ 
in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_39(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0; + tmp0 = in1 ^ in5; + tmp1 = tmp0 ^ in4; + out1 = tmp1 ^ in6; + out5 = out1 ^ in0 ^ in2; + tmp2 = tmp0 ^ out5; + out2 = tmp2 ^ in0 ^ in3; + out7 = out2 ^ in7; + out6 = tmp1 ^ out7; + out4 = tmp2 ^ out6; + out0 = out4 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_3A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in0 ^ in2; + tmp2 = in3 ^ in4; + tmp3 = in1 ^ in6; + tmp4 = in3 ^ in7; + out4 = tmp0 ^ in5; + out5 = tmp1 ^ tmp3; + out3 = tmp1 ^ tmp4; + out0 
= tmp2 ^ in5; + out7 = tmp2 ^ in2; + tmp5 = tmp3 ^ in4; + out2 = tmp4 ^ tmp5; + out1 = tmp5 ^ out4; + out6 = tmp0 ^ out3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_3B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in6; + tmp1 = in2 ^ in7; + tmp2 = tmp0 ^ in3; + out3 = tmp1 ^ in0; + out6 = tmp1 ^ tmp2; + out2 = out6 ^ in4; + out7 = tmp0 ^ out2; + out0 = out3 ^ out7 ^ in5; + out5 = out0 ^ out2 ^ in7; + out1 = tmp2 ^ out0; + out4 = out1 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_3C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, 
tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3; + tmp1 = in2 ^ in7; + tmp2 = in1 ^ in6 ^ in7; + out2 = tmp0 ^ in4; + out3 = tmp0 ^ tmp2; + out4 = tmp1 ^ out3 ^ in5; + out5 = tmp2 ^ out2 ^ in2; + out1 = out4 ^ out5 ^ in6; + out0 = out1 ^ in3; + out7 = tmp1 ^ out0; + out6 = tmp2 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_3D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in2; + tmp1 = tmp0 ^ in3; + out2 = tmp1 ^ in4; + tmp2 = out2 ^ in5; + out4 = tmp2 ^ in1 ^ in6; + out5 = out4 ^ in7; + out6 = out5 ^ in0; + out7 = out6 ^ in1; + out0 = tmp0 ^ out7; + out1 = tmp1 ^ out5; + out3 = tmp2 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = 
out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_3E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in5; + tmp1 = tmp0 ^ in4; + out0 = tmp1 ^ in6; + out7 = tmp1 ^ in2; + out6 = out7 ^ in1 ^ in5 ^ in7; + out2 = out6 ^ in0 ^ in2; + out4 = out0 ^ out6 ^ in0; + out5 = tmp0 ^ out4; + out3 = out5 ^ in7; + out1 = out3 ^ out6 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_3F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + out3 = tmp0 ^ in2 ^ in6; + tmp1 = out3 ^ in5 ^ in7; + out4 = tmp1 
^ in4; + out5 = tmp1 ^ in3; + out1 = out4 ^ in2; + out7 = out1 ^ out3 ^ in3; + out2 = tmp0 ^ out7 ^ in5; + tmp2 = out2 ^ in0; + out6 = tmp2 ^ in6; + out0 = tmp1 ^ tmp2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_40(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in3 ^ in7; + tmp0 = in3 ^ in4; + tmp1 = in6 ^ in7; + out4 = tmp0 ^ in2; + out5 = tmp0 ^ in5; + out0 = tmp1 ^ in2; + out7 = tmp1 ^ in1 ^ in5; + out2 = out0 ^ in4; + out3 = out2 ^ out5 ^ in7; + out6 = out3 ^ out4 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_41(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + 
uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in2 ^ in3; + tmp0 = in5 ^ in6; + tmp1 = in6 ^ in7; + out5 = in3 ^ in4; + out1 = in1 ^ in3 ^ in7; + out6 = in0 ^ in4 ^ in5; + out3 = tmp0 ^ in2; + out7 = tmp0 ^ in1; + out2 = tmp1 ^ in4; + out0 = tmp1 ^ in0 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_42(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in2 ^ in6; + out5 = in3 ^ in5; + out1 = in0 ^ in3 ^ in7; + out7 = in1 ^ in5 ^ in7; + out4 = in2 ^ in4 ^ in7; + out6 = in0 ^ in4 ^ in6; + out2 = out0 ^ in1 ^ in4; + out3 = out5 ^ in6 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 
7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_43(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in3; + out7 = in1 ^ in5; + out4 = in2 ^ in7; + out6 = in0 ^ in4; + out0 = in0 ^ in2 ^ in6; + out3 = in5 ^ in6 ^ in7; + out2 = in1 ^ in4 ^ in6; + out1 = in0 ^ in1 ^ in3 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_44(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in3; + out0 = in2 ^ in7; + tmp0 = in4 ^ in7; + out7 = in1 ^ in6 ^ in7; + out6 = in0 ^ in5 ^ in6; + out4 = tmp0 ^ in3 ^ in6; + out3 = out0 ^ in1 ^ in3 ^ in5; + out2 = out0 ^ in0 ^ in4; + out5 = tmp0 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 
^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_45(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in1 ^ in3; + out7 = in1 ^ in6; + out5 = in4 ^ in7; + out6 = in0 ^ in5; + out0 = in0 ^ in2 ^ in7; + out4 = in3 ^ in6 ^ in7; + out2 = out5 ^ in0; + out3 = out0 ^ out6 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_46(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in2; 
+ out1 = in0 ^ in3; + out7 = in1 ^ in7; + out4 = in4 ^ in6; + out5 = in5 ^ in7; + out6 = in0 ^ in6; + out3 = in1 ^ in3 ^ in5; + out2 = out4 ^ out6 ^ in1 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_47(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in6; + out7 = in1; + out5 = in7; + out6 = in0; + tmp0 = in0 ^ in1; + out3 = in1 ^ in5; + out0 = in0 ^ in2; + out1 = tmp0 ^ in3; + out2 = tmp0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_48(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + 
uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + out1 = in3 ^ in6 ^ in7; + out3 = tmp0 ^ in0; + out0 = tmp0 ^ out1 ^ in5; + tmp1 = out0 ^ in4; + out2 = tmp1 ^ in7; + out5 = tmp1 ^ in3; + out4 = out5 ^ in1; + out7 = tmp0 ^ out4; + out6 = tmp1 ^ out3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_49(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0 ^ in2; + tmp0 = in2 ^ in5; + out2 = in4 ^ in5 ^ in6; + tmp1 = tmp0 ^ out2 ^ in3; + out7 = out2 ^ in1; + out5 = tmp1 ^ in7; + out4 = out5 ^ out7 ^ in6; + out1 = tmp0 ^ out4; + out6 = out1 ^ out7 ^ in0; + out0 = tmp1 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH 
* 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_4A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in6; + tmp1 = in3 ^ in7; + out0 = tmp0 ^ in5; + out3 = tmp1 ^ in0; + out5 = tmp1 ^ out0; + out4 = out0 ^ in1 ^ in4; + out1 = out3 ^ in6; + out2 = out4 ^ in7; + out6 = out1 ^ in4; + out7 = tmp0 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_4B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0 ^ in7; + tmp0 = in1 ^ in5; + tmp1 = in2 ^ in6; + tmp2 = out3 ^ in3; + out7 = tmp0 ^ in4; + out4 = tmp0 ^ tmp1; + tmp3 = tmp1 ^ in0; + out6 = tmp2 ^ in4; + out5 = tmp2 ^ tmp3; + out1 = tmp2 ^ in1 ^ in6; + out2 = 
out7 ^ in6 ^ in7; + out0 = tmp3 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_4C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in3 ^ in6; + tmp0 = in2 ^ in5; + tmp1 = out1 ^ in5 ^ in7; + out0 = tmp0 ^ in7; + tmp2 = tmp0 ^ in4; + out6 = tmp1 ^ in0; + out2 = tmp2 ^ in0; + out5 = tmp2 ^ in6; + out3 = tmp0 ^ out6 ^ in1; + out7 = out0 ^ out5 ^ in1; + out4 = tmp1 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_4D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t 
in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in5; + tmp1 = in1 ^ in6; + out4 = in1 ^ in3 ^ in5; + tmp2 = tmp0 ^ in7; + out2 = tmp0 ^ in4; + out1 = tmp1 ^ in3; + out7 = tmp1 ^ in4; + out0 = tmp2 ^ in2; + out6 = tmp2 ^ in3; + out5 = out7 ^ in1 ^ in2; + out3 = tmp1 ^ out0 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_4E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in2 ^ in5; + out7 = in1 ^ in4 ^ in7; + out1 = in0 ^ in3 ^ in6; + out5 = out0 ^ in6; + out4 = out7 ^ in5; + out3 = out1 ^ in1; + out6 = out1 ^ in7; + out2 = out4 ^ in0 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void 
gf8_muladd_4F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in2 ^ in6; + out7 = in1 ^ in4; + out3 = in0 ^ in1 ^ in6; + out4 = in1 ^ in5 ^ in7; + out0 = in0 ^ in2 ^ in5; + out6 = in0 ^ in3 ^ in7; + out1 = out3 ^ in3; + out2 = out4 ^ in0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_50(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in2 ^ in7; + tmp0 = in3 ^ in5; + out0 = out2 ^ in4 ^ in6; + out1 = tmp0 ^ in7; + tmp1 = tmp0 ^ in6; + out3 = out0 ^ in3; + out7 = tmp1 ^ in1; + tmp2 = tmp1 ^ in0; + out5 = out3 ^ in1 ^ in2; + out4 = tmp2 ^ in2; + out6 = tmp2 ^ out3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH 
* 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_51(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in7; + out3 = in2 ^ in4 ^ in6 ^ in7; + out0 = out3 ^ in0; + out6 = out0 ^ in5; + out4 = out6 ^ in3 ^ in7; + out1 = out0 ^ out4 ^ in1; + out7 = out1 ^ in6; + out5 = out7 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_52(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 
= in1 ^ in2; + tmp0 = in2 ^ in4; + tmp1 = in3 ^ in5; + tmp2 = in3 ^ in6; + tmp3 = in0 ^ in7; + out0 = tmp0 ^ in6; + out6 = tmp0 ^ tmp3; + out7 = tmp1 ^ in1; + out1 = tmp1 ^ tmp3; + out3 = tmp2 ^ in4; + out5 = tmp2 ^ in1 ^ in7; + out4 = tmp2 ^ out1 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_53(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in1; + out3 = in4 ^ in6; + out0 = out3 ^ in0 ^ in2; + out6 = out0 ^ in7; + out4 = out6 ^ in5; + out7 = out0 ^ out4 ^ in1 ^ in3; + out1 = out7 ^ in0; + out5 = out7 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_54(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, 
out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in3 ^ in5; + tmp0 = in1 ^ in3; + tmp1 = in2 ^ in4; + tmp2 = in0 ^ in7; + out5 = in1 ^ in4 ^ in6; + out4 = tmp2 ^ out1; + out7 = tmp0 ^ in6; + out3 = tmp0 ^ tmp1; + out0 = tmp1 ^ in7; + tmp3 = tmp2 ^ in2; + out2 = tmp3 ^ in6; + out6 = tmp3 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_55(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in3; + tmp1 = in1 ^ in4; + tmp2 = in6 ^ in7; + out7 = tmp0 ^ tmp2; + out1 = tmp0 ^ in5; + out3 = tmp1 ^ in2; + out5 = tmp1 ^ in5 ^ in6; + out2 = tmp2 ^ in0; + out4 = out5 ^ out7 ^ in0; + out6 = out2 ^ in2 ^ in5; + out0 = out5 ^ out6 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 
4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_56(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in2 ^ in4; + tmp0 = in0 ^ in2; + out4 = in0 ^ in5; + out7 = in1 ^ in3; + out5 = in1 ^ in6; + out6 = tmp0 ^ in7; + out2 = tmp0 ^ out5; + out1 = out4 ^ in3; + out3 = out7 ^ in4 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_57(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in5; + tmp1 = in1 ^ in7; + out0 = in0 ^ in2 ^ in4; + out5 = in1 ^ in5 ^ in6; + 
out4 = tmp0 ^ in4; + out1 = tmp0 ^ in1 ^ in3; + out2 = tmp0 ^ out5; + out3 = tmp1 ^ in4; + out7 = tmp1 ^ in3; + out6 = tmp1 ^ out2 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_58(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in2 ^ in5; + tmp0 = in2 ^ in3 ^ in4; + out5 = tmp0 ^ in1; + out6 = tmp0 ^ in0 ^ in5; + out3 = out6 ^ in7; + tmp1 = out2 ^ out5; + out7 = tmp1 ^ in6; + out4 = tmp1 ^ out3 ^ in3; + out0 = out4 ^ out7 ^ in0; + out1 = tmp0 ^ out0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_59(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + 
uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in5; + tmp0 = in0 ^ in5 ^ in7; + out3 = tmp0 ^ in2 ^ in4; + out0 = out3 ^ in6; + tmp1 = out0 ^ in7; + out6 = tmp1 ^ in3; + out5 = out6 ^ in0 ^ in1 ^ in6; + out4 = tmp0 ^ out5; + out1 = tmp1 ^ out4; + out7 = out1 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_5A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in2; + tmp1 = in2 ^ in5; + out5 = tmp0 ^ in3; + out4 = tmp0 ^ in0; + tmp2 = tmp1 ^ in4; + out2 = tmp1 ^ in1 ^ in7; + out7 = tmp2 ^ out5; + out6 = out4 ^ out7 ^ in5; + out0 = tmp2 ^ in6; + out1 = out0 ^ out6 ^ in7; + out3 = tmp1 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + 
out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_5B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + tmp1 = in0 ^ in4; + tmp2 = in1 ^ in5; + out5 = tmp0 ^ tmp2; + tmp3 = tmp1 ^ in6; + out3 = tmp1 ^ in5; + out2 = tmp2 ^ in7; + tmp4 = out3 ^ in2; + out7 = out2 ^ in3 ^ in4; + out0 = tmp4 ^ in6; + out6 = tmp0 ^ tmp3; + out4 = tmp2 ^ tmp4; + out1 = tmp3 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_5C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in6; + tmp1 = in0 ^ in2 ^ in5; + out1 = tmp0 
^ in5; + tmp2 = tmp0 ^ in1; + out2 = tmp1 ^ in6; + out6 = tmp1 ^ in3; + out4 = tmp2 ^ in0; + out7 = tmp2 ^ in4; + out3 = tmp1 ^ out7; + out0 = out3 ^ out4 ^ in7; + out5 = out0 ^ in1 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_5D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in0 ^ in6; + out2 = tmp1 ^ in5; + tmp2 = out2 ^ in3; + out6 = tmp2 ^ in2; + out1 = tmp0 ^ tmp2; + tmp3 = out1 ^ in4 ^ in5; + out4 = tmp3 ^ in0; + out7 = tmp3 ^ in7; + tmp4 = out4 ^ out6; + out5 = tmp4 ^ in7; + out0 = tmp0 ^ out5; + out3 = tmp1 ^ tmp4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_5E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + 
for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in5; + tmp1 = in3 ^ in5; + tmp2 = in1 ^ in7; + out7 = in1 ^ in3 ^ in4; + out0 = tmp0 ^ in4; + tmp3 = tmp1 ^ in0; + out5 = tmp2 ^ in2; + out1 = tmp3 ^ in6; + out6 = tmp0 ^ tmp3; + tmp4 = tmp2 ^ out1; + out3 = tmp4 ^ in4; + out4 = tmp1 ^ tmp4; + out2 = tmp0 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_5F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in5; + tmp1 = in0 ^ in6; + tmp2 = tmp0 ^ in7; + tmp3 = tmp1 ^ in3; + out2 = tmp1 ^ tmp2; + out5 = tmp2 ^ in2; + out6 = tmp3 ^ in2; + out3 = out2 ^ in4; + out4 = out3 ^ in5; + out1 = tmp0 ^ tmp3; + out7 = tmp3 ^ out4; + out0 = out4 ^ out5 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; 
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_60(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in2 ^ in5; + tmp0 = in3 ^ in6; + out1 = in3 ^ in4 ^ in7; + out7 = out4 ^ in1; + tmp1 = out4 ^ in4; + out0 = tmp0 ^ in2; + out5 = tmp0 ^ in0; + out2 = tmp0 ^ tmp1; + out3 = tmp1 ^ in7; + out6 = out3 ^ out7 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_61(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = 
out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in5; + out4 = tmp0 ^ in4; + tmp1 = out4 ^ in3; + out3 = tmp1 ^ in7; + out2 = tmp1 ^ in2 ^ in6; + out1 = tmp0 ^ out3 ^ in1; + out0 = out2 ^ out4 ^ in0; + out7 = tmp1 ^ out1; + out6 = out0 ^ out1 ^ in2; + out5 = tmp0 ^ out0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_62(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in4 ^ in5; + tmp0 = in0 ^ in3 ^ in4; + out1 = tmp0 ^ in7; + out5 = tmp0 ^ in6; + tmp1 = out1 ^ in0; + tmp2 = tmp1 ^ out3; + out4 = tmp2 ^ in2; + tmp3 = tmp2 ^ in1; + out0 = out4 ^ in5 ^ in6; + out7 = tmp3 ^ out0; + out6 = tmp0 ^ tmp3; + out2 = tmp1 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_63(void *out, void *in) +{ + unsigned 
int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in3 ^ in4;
+        tmp1 = in1 ^ in7;
+        out3 = tmp0 ^ in5;
+        tmp2 = out3 ^ in6;
+        out4 = out3 ^ in2 ^ in7;
+        out5 = tmp2 ^ in0;
+        tmp3 = out5 ^ in3;
+        out0 = tmp3 ^ out4;
+        out2 = tmp1 ^ tmp2;
+        out6 = tmp1 ^ tmp3;
+        tmp4 = tmp0 ^ out2;
+        out1 = tmp4 ^ out5;
+        out7 = tmp4 ^ in2;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_64: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x64 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_64(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        out0 = in2 ^ in3;
+        out1 = in3 ^ in4;
+        out7 = in1 ^ in2;
+        tmp0 = in4 ^ in5;
+        tmp1 = in0 ^ in7;
+        out4 = in5 ^ in6 ^ in7;
+        out2 = tmp0 ^ out0 ^ in0;
+        out3 = tmp0 ^ out7 ^ in6;
+        out5 = tmp1 ^ in6;
+        out6 = tmp1 ^ in1;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^
in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_65: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x65 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_65(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in0 ^ in3;
+        tmp1 = in4 ^ in5;
+        tmp2 = in6 ^ in7;
+        out7 = in1 ^ in2 ^ in7;
+        out1 = in1 ^ in3 ^ in4;
+        out0 = tmp0 ^ in2;
+        out2 = tmp0 ^ tmp1;
+        out4 = tmp1 ^ tmp2;
+        tmp3 = tmp2 ^ in0;
+        out3 = out4 ^ out7 ^ in3;
+        out5 = tmp3 ^ in5;
+        out6 = tmp3 ^ in1;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_66: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x66 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_66(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+
uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in1 ^ in2;
+        tmp1 = in2 ^ in3;
+        tmp2 = in0 ^ in4;
+        out7 = tmp0 ^ in6;
+        out0 = tmp1 ^ in7;
+        out1 = tmp2 ^ in3;
+        tmp3 = tmp2 ^ in6;
+        tmp4 = out1 ^ in5;
+        out5 = tmp3 ^ in7;
+        out4 = tmp3 ^ tmp4;
+        out2 = tmp0 ^ tmp4 ^ in7;
+        out6 = tmp1 ^ out2 ^ in4;
+        out3 = tmp3 ^ out6;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_67: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x67 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_67(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in0 ^ in3;
+        tmp1 = tmp0 ^ in1;
+        tmp2 = tmp0 ^ in7;
+        out1 = tmp1 ^ in4;
+        out0 = tmp2 ^ in2;
+        tmp3 = out1 ^ in7;
+        out2 = tmp3 ^ in5;
+        out3 = out2 ^ in0 ^ in6;
+        out7 = tmp1 ^ out0 ^ in6;
+        out5 = tmp1 ^ out3;
+        out4 = tmp2 ^ out5;
+        out6 = tmp3 ^ out4;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] =
out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_68: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x68 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_68(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in3 ^ in4;
+        tmp1 = in2 ^ in3 ^ in5;
+        tmp2 = tmp0 ^ in1;
+        tmp3 = tmp0 ^ in6;
+        out0 = tmp1 ^ in6;
+        out6 = tmp2 ^ in0;
+        out7 = tmp1 ^ tmp2;
+        out1 = tmp3 ^ in7;
+        out2 = out1 ^ in2;
+        out4 = tmp2 ^ out2;
+        out3 = out4 ^ out6 ^ in3;
+        out5 = tmp3 ^ out3;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_69: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x69 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_69(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in6 ^ in7;
+        out2 = tmp0 ^ in3 ^ in4;
+        out1 = out2 ^ in1;
+        out3 = out2 ^ in0 ^ in2;
+        out4 = out1 ^
in2 ^ in3;
+        out6 = out1 ^ in0 ^ in7;
+        out7 = out4 ^ in5 ^ in6;
+        out5 = out4 ^ out6 ^ in5;
+        out0 = tmp0 ^ out5;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_6A: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x6A * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_6A(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in2 ^ in6;
+        out3 = in0 ^ in4 ^ in6;
+        tmp1 = tmp0 ^ in3;
+        out4 = tmp1 ^ in1;
+        tmp2 = tmp1 ^ in7;
+        out2 = out4 ^ in4;
+        out0 = tmp2 ^ in5;
+        out5 = tmp2 ^ out3;
+        out7 = out2 ^ in3 ^ in5;
+        out1 = tmp0 ^ out5;
+        out6 = tmp1 ^ out7 ^ in0;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_6B: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x6B * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_6B(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+
uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in4 ^ in6;
+        out2 = tmp0 ^ in1 ^ in3;
+        out4 = out2 ^ in2;
+        tmp1 = out2 ^ in0;
+        out7 = out4 ^ in3 ^ in5 ^ in7;
+        out1 = tmp1 ^ in7;
+        out3 = tmp1 ^ in1;
+        out6 = tmp1 ^ in5;
+        out0 = tmp1 ^ out7 ^ in6;
+        out5 = tmp0 ^ out0;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_6C: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x6C * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_6C(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        out4 = in1;
+        tmp0 = in2 ^ in3;
+        out5 = in0 ^ in2;
+        out1 = in3 ^ in4 ^ in6;
+        tmp1 = out5 ^ in1;
+        out0 = tmp0 ^ in5;
+        out6 = tmp0 ^ tmp1;
+        out3 = tmp1 ^ in4;
+        out7 = out3 ^ in0;
+        out2 = out6 ^ out7 ^ in7;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^
in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_6D: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x6D * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_6D(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        out4 = in1 ^ in4;
+        tmp0 = in0 ^ in2;
+        tmp1 = out4 ^ in3;
+        out7 = out4 ^ in2 ^ in7;
+        out5 = tmp0 ^ in5;
+        out3 = tmp0 ^ tmp1;
+        out1 = tmp1 ^ in6;
+        out0 = out5 ^ in3;
+        out2 = out3 ^ out7 ^ in4;
+        out6 = out1 ^ in0 ^ in4;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_6E: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x6E * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_6E(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in1 ^ in3;
+        tmp1 = in0 ^ in4;
+        out4 = tmp0 ^ in7;
+        out6 = tmp0 ^ in0 ^ in5;
+        out5 = tmp1 ^ in2;
+        tmp2 = tmp1 ^ in3;
+        out3 = tmp2 ^ out4;
+
out1 = tmp2 ^ in6;
+        out2 = tmp0 ^ out5;
+        out0 = out2 ^ out3 ^ in5;
+        out7 = out1 ^ out2 ^ in4;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_6F: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x6F * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_6F(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in3 ^ in7;
+        tmp1 = tmp0 ^ in4;
+        tmp2 = tmp0 ^ in0 ^ in2;
+        out4 = tmp1 ^ in1;
+        out0 = tmp2 ^ in5;
+        out3 = out4 ^ in0;
+        out2 = out3 ^ in7;
+        out1 = out2 ^ in6;
+        out6 = out1 ^ in4 ^ in5;
+        out7 = tmp2 ^ out1;
+        out5 = tmp1 ^ out0;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_70: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x70 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_70(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 =
out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        out3 = in2;
+        tmp0 = in2 ^ in4;
+        out2 = in2 ^ in3 ^ in5;
+        tmp1 = tmp0 ^ in6;
+        tmp2 = out2 ^ in7;
+        out0 = tmp1 ^ in3;
+        out4 = tmp1 ^ in0;
+        out7 = tmp2 ^ in1;
+        out6 = out4 ^ in1;
+        out5 = out7 ^ in0 ^ in2;
+        out1 = tmp0 ^ tmp2;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_71: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x71 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_71(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        out2 = in3 ^ in5;
+        out3 = in2 ^ in3;
+        tmp0 = in0 ^ in2;
+        tmp1 = out2 ^ in1;
+        out4 = tmp0 ^ in6;
+        tmp2 = tmp0 ^ in1;
+        out7 = tmp1 ^ in2;
+        out1 = tmp1 ^ in4 ^ in7;
+        out0 = out4 ^ in3 ^ in4;
+        out6 = tmp2 ^ in4;
+        out5 = tmp2 ^ out3 ^ in7;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH
* 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_72: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x72 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_72(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        out3 = in7;
+        tmp0 = in0 ^ in4;
+        tmp1 = tmp0 ^ in3 ^ in7;
+        out1 = tmp1 ^ in5;
+        out5 = out1 ^ in1;
+        tmp2 = tmp0 ^ out5;
+        out2 = tmp2 ^ in2;
+        out7 = out2 ^ in6;
+        out6 = tmp1 ^ out7;
+        out4 = tmp2 ^ out6;
+        out0 = out4 ^ in0;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_73: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x73 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_73(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        out3 = in3 ^ in7;
+        out2 = out3 ^ in1 ^ in5;
+        out1 = out2 ^ in0 ^ in4;
+        out5 = out1 ^ in5;
+        out6 = out1 ^ out3 ^ in2;
+        out0 = out2 ^ out6 ^ in6;
+        out7 =
out0 ^ out1 ^ in3;
+        out4 = out0 ^ in4;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_74: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x74 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_74(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in3 ^ in4;
+        tmp1 = in1 ^ in2 ^ in6;
+        out4 = in0 ^ in4 ^ in7;
+        out5 = in0 ^ in1 ^ in5;
+        out0 = tmp0 ^ in2;
+        out1 = tmp0 ^ in5;
+        out3 = tmp1 ^ in7;
+        out6 = tmp1 ^ in0;
+        out2 = tmp1 ^ out5 ^ in3;
+        out7 = out3 ^ in3 ^ in6;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_75: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x75 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_75(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH
* 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        out4 = in0 ^ in7;
+        tmp0 = in1 ^ in3;
+        out5 = in0 ^ in1;
+        out7 = tmp0 ^ in2;
+        tmp1 = tmp0 ^ in4;
+        out6 = out5 ^ in2;
+        tmp2 = out7 ^ in6;
+        out1 = tmp1 ^ in5;
+        out0 = tmp1 ^ out6;
+        out3 = tmp2 ^ in7;
+        out2 = tmp2 ^ out6 ^ in5;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_76: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x76 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_76(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        out3 = in1 ^ in6;
+        tmp0 = in0 ^ in5;
+        tmp1 = in3 ^ in7;
+        tmp2 = tmp0 ^ in4;
+        tmp3 = tmp1 ^ in2;
+        out5 = tmp2 ^ in1;
+        out1 = tmp2 ^ in3;
+        out0 = tmp3 ^ in4;
+        out4 = out1 ^ in5;
+        out7 = tmp3 ^ out3;
+        out2 = tmp0 ^ out7;
+        out6 = tmp1 ^ out2;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^
in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_77: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x77 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_77(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        out4 = in0 ^ in3;
+        tmp0 = in1 ^ in4;
+        tmp1 = in1 ^ in6;
+        tmp2 = out4 ^ in5;
+        out5 = tmp0 ^ in0;
+        out1 = tmp0 ^ tmp2;
+        out3 = tmp1 ^ in3;
+        out2 = tmp1 ^ tmp2 ^ in7;
+        out7 = out3 ^ in2;
+        tmp3 = out7 ^ in6;
+        out6 = tmp2 ^ tmp3;
+        out0 = tmp3 ^ out5 ^ in7;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_78: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x78 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_78(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in0 ^ in3;
+        tmp1 = in2 ^ in7;
+        tmp2 = in0 ^ in5 ^ in6;
+        out2 = tmp1 ^ in3;
+        out3 = tmp2 ^ in2;
+        out5 = out3 ^ in1 ^ in3;
+        out0 = tmp0 ^ out3 ^
in4;
+        out1 = tmp1 ^ out0;
+        out4 = out1 ^ out5 ^ in5;
+        out7 = tmp0 ^ out4;
+        out6 = tmp2 ^ out7;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_79: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x79 * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_79(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        out2 = in3 ^ in7;
+        tmp0 = in3 ^ in4;
+        tmp1 = in1 ^ in5;
+        tmp2 = tmp1 ^ in2;
+        out4 = tmp2 ^ in0 ^ in7;
+        tmp3 = out4 ^ in5;
+        out5 = tmp3 ^ out2 ^ in6;
+        out7 = tmp0 ^ tmp2;
+        out6 = tmp0 ^ tmp3;
+        out3 = tmp1 ^ out5;
+        out0 = out3 ^ in4;
+        out1 = tmp3 ^ out0;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_7A: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x7A * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_7A(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in1 ^ in2;
+        out2 = tmp0 ^ in3;
+        tmp1 = out2 ^ in4;
+        out4 = tmp1 ^ in0 ^ in5;
+        out5 = out4 ^ in6;
+        out6 = out5 ^ in7;
+        out7 = out6 ^ in0;
+        out0 = out7 ^ in1;
+        out1 = tmp0 ^ out6;
+        out3 = tmp1 ^ out6;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_7B: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x7B * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_7B(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        out2 = in1 ^ in3;
+        tmp0 = in0 ^ in5;
+        out4 = tmp0 ^ out2 ^ in2;
+        tmp1 = out4 ^ in4;
+        out6 = tmp1 ^ in7;
+        out5 = tmp1 ^ in5 ^ in6;
+        out0 = out6 ^ in1 ^ in6;
+        tmp2 = out0 ^ in2;
+        out1 = tmp2 ^ in1;
+        out3 = tmp2 ^ in4;
+        out7 = tmp0 ^ out5;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+
out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_7C: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x7C * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_7C(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in3 ^ in5;
+        tmp1 = tmp0 ^ in4;
+        out0 = tmp1 ^ in2;
+        out1 = tmp1 ^ in6;
+        out7 = out0 ^ in1 ^ in5 ^ in7;
+        out5 = out1 ^ out7 ^ in0;
+        out3 = out5 ^ in6;
+        out6 = tmp0 ^ out5;
+        out2 = out6 ^ in1;
+        out4 = out2 ^ out7 ^ in5;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_7D: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x7D * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_7D(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in1 ^ in2;
+        tmp1 = tmp0 ^ in3;
+        tmp2 = tmp0 ^ in6;
+        out7 = tmp1 ^ in4;
+        tmp3 = tmp2 ^ in0;
+        out5 =
tmp3 ^ in7;
+        out4 = tmp3 ^ in2 ^ in5;
+        out2 = tmp1 ^ out5;
+        out6 = tmp2 ^ out2;
+        out0 = out4 ^ out7 ^ in6;
+        out1 = tmp3 ^ out0;
+        out3 = out6 ^ in5;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_7E: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x7E * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_7E(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* in0..in7 hold the current words of out (not of in), read before any store. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network over in0..in7. */
+        tmp0 = in3 ^ in4;
+        tmp1 = in0 ^ in5;
+        out1 = tmp0 ^ tmp1 ^ in6;
+        out3 = tmp1 ^ in1;
+        out4 = out1 ^ in1 ^ in7;
+        tmp2 = out4 ^ in3;
+        out5 = tmp2 ^ in2;
+        out6 = tmp0 ^ out5;
+        out7 = tmp1 ^ out4 ^ in2;
+        out2 = out6 ^ in5 ^ in7;
+        out0 = tmp2 ^ out2;
+        /* Each result is XORed with the matching word of in and written back. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+/* gf8_muladd_7F: for each i in [0, WIDTH), replaces the eight uint64_t words out[k * WIDTH + i] (k = 0..7) with a fixed XOR mix of those words, XORed with in[k * WIDTH + i]; in is only read. Presumably out = 0x7F * out + in over GF(2^8), going by the name (TODO confirm). */
+static void gf8_muladd_7F(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4,
out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in7; + tmp1 = tmp0 ^ in3 ^ in5; + tmp2 = tmp1 ^ in0; + out0 = tmp2 ^ in4; + out6 = tmp2 ^ in1; + out3 = tmp0 ^ out6; + tmp3 = out3 ^ in6; + out1 = tmp3 ^ in4; + out2 = tmp3 ^ in5; + out4 = tmp3 ^ in7; + out5 = tmp1 ^ out1; + out7 = out0 ^ out4 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_80(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + tmp1 = in4 ^ in5; + out1 = in2 ^ in6 ^ in7; + out5 = tmp0 ^ in4; + tmp2 = tmp0 ^ in1; + out6 = tmp1 ^ in3; + out7 = tmp1 ^ in0 ^ in6; + out4 = tmp2 ^ in7; + out3 = tmp2 ^ out6; + out2 = out3 ^ out5 ^ in6; + out0 = out2 ^ in3 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + 
out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_81(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in4 ^ in6; + tmp1 = tmp0 ^ in3; + out6 = tmp1 ^ in5; + out5 = out6 ^ in2 ^ in6; + out3 = out5 ^ in1; + out2 = tmp0 ^ out3; + out1 = out3 ^ out6 ^ in7; + out4 = tmp1 ^ out1; + out7 = out2 ^ out4 ^ in0; + out0 = out7 ^ in1 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_82(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in1 ^ in2; + tmp0 = in6 ^ 
in7; + out5 = in2 ^ in3; + out6 = in3 ^ in4; + out7 = in0 ^ in4 ^ in5; + out0 = in1 ^ in5 ^ in6; + out1 = tmp0 ^ in0 ^ in2; + out2 = tmp0 ^ in3 ^ in5; + out3 = tmp0 ^ out0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_83(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in2 ^ in5; + tmp2 = in3 ^ in6; + out4 = in1 ^ in2 ^ in4; + out0 = tmp0 ^ in5 ^ in6; + out5 = tmp1 ^ in3; + tmp3 = tmp1 ^ in7; + out6 = tmp2 ^ in4; + out2 = tmp2 ^ tmp3; + tmp4 = tmp3 ^ out4; + out1 = tmp3 ^ out0; + out3 = tmp4 ^ in3; + out7 = tmp0 ^ tmp4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_84(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i 
= 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in2 ^ in6; + out6 = in3 ^ in5; + out0 = in1 ^ in5 ^ in7; + out7 = in0 ^ in4 ^ in6; + out4 = in1 ^ in3 ^ in6; + out5 = in2 ^ in4 ^ in7; + out2 = out6 ^ in0 ^ in1; + out3 = out5 ^ in5 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_85(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in6; + tmp1 = in3 ^ in6; + tmp2 = tmp0 ^ in4; + out1 = tmp0 ^ in2; + out6 = tmp1 ^ in5; + out4 = tmp2 ^ in3; + tmp3 = out1 ^ out6; + out2 = tmp3 ^ in0; + out3 = tmp2 ^ tmp3 ^ in7; + out7 = out2 ^ out3 ^ in1; + out5 = tmp1 ^ out3; + out0 = tmp2 ^ out7 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 
4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_86(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out6 = in3; + out7 = in0 ^ in4; + out0 = in1 ^ in5; + out5 = in2 ^ in7; + out3 = in4 ^ in5 ^ in6; + out1 = in0 ^ in2 ^ in6; + out4 = in1 ^ in6 ^ in7; + out2 = in0 ^ in3 ^ in5 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_87(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out6 = in3 ^ in6; + tmp0 = in0 ^ in1; + out7 = in0 ^ in4 ^ in7; + out5 = in2 ^ in5 ^ in7; + out3 = out6 ^ in4 ^ in5; + 
out0 = tmp0 ^ in5; + tmp1 = tmp0 ^ in6; + out2 = out5 ^ in0 ^ in3; + out1 = tmp1 ^ in2; + out4 = tmp1 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_88(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in2 ^ in7; + tmp0 = in5 ^ in6; + out0 = in1 ^ in6 ^ in7; + out6 = in4 ^ in5 ^ in7; + out3 = out0 ^ out1 ^ in0 ^ in4; + out7 = tmp0 ^ in0; + tmp1 = tmp0 ^ in3; + out2 = out0 ^ in3; + out4 = tmp1 ^ in2; + out5 = tmp1 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_89(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = 
out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in7; + tmp1 = in2 ^ in7; + tmp2 = tmp0 ^ in6; + out1 = tmp1 ^ in1; + out7 = tmp2 ^ in5; + out0 = tmp2 ^ in1; + out2 = out1 ^ in3 ^ in6; + out6 = out7 ^ in0 ^ in4; + out5 = out6 ^ in3; + out3 = tmp0 ^ out2 ^ in4; + out4 = tmp1 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_8A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in1 ^ in6; + out7 = in0 ^ in5; + out2 = in3 ^ in6; + out6 = in4 ^ in7; + out1 = in0 ^ in2 ^ in7; + out3 = out0 ^ out6 ^ in0; + out4 = out1 ^ out7 ^ in6; + out5 = out2 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 
7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_8B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in3 ^ in6; + tmp2 = in5 ^ in7; + tmp3 = tmp0 ^ in7; + out0 = tmp0 ^ in6; + out2 = tmp1 ^ in2; + out5 = tmp1 ^ tmp2; + out7 = tmp2 ^ in0; + tmp4 = tmp3 ^ in4; + out1 = tmp3 ^ in2; + out6 = tmp4 ^ out0; + out4 = out6 ^ in2 ^ in5; + out3 = tmp1 ^ tmp4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_8C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in2; + out0 = in1 ^ in7; + out7 = in0 ^ in6; + out5 = in4 ^ in6; + out6 = in5 ^ in7; + out2 = out0 ^ in0 ^ in3; + out3 = out5 ^ out7 ^ in2 ^ in7; + out4 = out6 ^ in3; + + 
out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_8D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in1 ^ in2; + tmp0 = in6 ^ in7; + out0 = in0 ^ in1 ^ in7; + out5 = in4 ^ in5 ^ in6; + out6 = tmp0 ^ in5; + out7 = tmp0 ^ in0; + out4 = tmp0 ^ out5 ^ in3; + out2 = out0 ^ in2 ^ in3; + out3 = out2 ^ in1 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_8E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = 
out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in1; + out4 = in5; + out7 = in0; + out5 = in6; + out6 = in7; + out3 = in0 ^ in4; + out1 = in0 ^ in2; + out2 = in0 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_8F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in0 ^ in1; + tmp0 = in0 ^ in3; + out4 = in4 ^ in5; + out7 = in0 ^ in7; + out5 = in5 ^ in6; + out6 = in6 ^ in7; + out1 = out0 ^ in2; + out2 = tmp0 ^ in2; + out3 = tmp0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_90(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, 
out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in2; + tmp1 = in2 ^ in6 ^ in7; + out3 = tmp0 ^ in7; + out1 = tmp1 ^ in5; + tmp2 = out1 ^ in4; + out6 = tmp2 ^ in3; + out5 = out6 ^ in1; + out4 = out5 ^ in0; + out0 = tmp0 ^ tmp2; + out7 = tmp0 ^ out4; + out2 = tmp1 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_91(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in4; + tmp1 = tmp0 ^ in3 ^ in5; + out2 = tmp1 ^ in1; + out6 = tmp1 ^ in7; + tmp2 = out2 ^ in5 ^ in7; + out3 = tmp2 ^ in4; + out5 = tmp2 ^ in6; + out1 = tmp1 ^ out5 ^ in2; + tmp3 = out1 ^ in0; + out4 = tmp3 ^ in3; + out0 = tmp0 ^ tmp3; + out7 = tmp2 ^ tmp3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = 
out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_92(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in1; + tmp0 = in4 ^ in5; + tmp1 = tmp0 ^ in1; + out2 = tmp0 ^ in3 ^ in7; + out0 = tmp1 ^ in6; + out7 = out2 ^ in0; + out4 = out0 ^ in0 ^ in2; + out5 = out4 ^ out7 ^ in5; + out6 = tmp1 ^ out5; + out1 = out6 ^ out7 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_93(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in1 ^ in3; + tmp0 = in2 ^ in7; + tmp1 = 
out3 ^ in6; + tmp2 = tmp0 ^ in4; + out5 = tmp0 ^ tmp1; + out6 = tmp2 ^ in3; + out2 = out6 ^ in5; + out0 = out2 ^ out5 ^ in0; + out7 = tmp1 ^ out0; + out1 = tmp2 ^ out0; + out4 = out1 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_94(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in2 ^ in6; + tmp0 = in1 ^ in4 ^ in5; + out1 = out3 ^ in5; + out5 = tmp0 ^ out3; + out0 = tmp0 ^ in7; + out4 = tmp0 ^ in0 ^ in3; + out6 = out1 ^ in3 ^ in7; + out2 = out4 ^ in6; + out7 = out0 ^ out2 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_95(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + 
uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + out3 = tmp0 ^ in6; + tmp1 = tmp0 ^ in7; + tmp2 = out3 ^ in0; + out6 = tmp1 ^ in5; + tmp3 = tmp2 ^ in4; + out7 = tmp3 ^ in2; + tmp4 = tmp3 ^ in5; + out2 = tmp4 ^ in1; + tmp5 = out2 ^ in6; + out0 = tmp1 ^ tmp5; + out1 = tmp5 ^ out7; + out4 = tmp2 ^ out1; + out5 = tmp4 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_96(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in6 ^ in7; + tmp0 = in1 ^ in5; + tmp1 = in5 ^ in6; + out6 = out3 ^ in2 ^ in3; + out0 = tmp0 ^ in4; + tmp2 = tmp1 ^ in2; + out4 = out0 ^ in0 ^ in7; + out1 = tmp2 ^ in0; + out5 = tmp2 ^ in1; + out7 = tmp0 ^ out4 ^ in3; + out2 = tmp1 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ 
in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_97(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in4; + tmp1 = in2 ^ in6; + out3 = in3 ^ in6 ^ in7; + out7 = tmp0 ^ in3; + tmp2 = tmp0 ^ in5; + out5 = tmp1 ^ in1; + out6 = tmp1 ^ out3; + out0 = tmp2 ^ in1; + out2 = tmp2 ^ out3 ^ in2; + tmp3 = out0 ^ in4; + out4 = tmp3 ^ in7; + out1 = tmp1 ^ tmp3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_98(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + 
uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in5 ^ in7; + tmp1 = in1 ^ in4 ^ in7; + out1 = tmp0 ^ in2; + out0 = tmp1 ^ in6; + out2 = tmp1 ^ in3; + out6 = out0 ^ out1 ^ in1; + out5 = tmp0 ^ out2; + out3 = tmp1 ^ out6 ^ in0; + out7 = out0 ^ out5 ^ in0; + out4 = out6 ^ out7 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_99(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3; + out5 = in1 ^ in3 ^ in4; + out6 = in2 ^ in4 ^ in5; + out4 = tmp0 ^ in2; + tmp1 = tmp0 ^ in6; + tmp2 = out5 ^ in7; + out7 = tmp1 ^ in5; + out0 = tmp1 ^ tmp2; + out2 = tmp2 ^ in2; + out3 = out0 ^ out6 ^ in3; + out1 = tmp1 ^ out3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_9A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t 
*)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in3 ^ in4; + tmp0 = in0 ^ in5; + tmp1 = in1 ^ in6; + out5 = in1 ^ in3 ^ in5; + tmp2 = tmp0 ^ in7; + out3 = tmp0 ^ tmp1; + out0 = tmp1 ^ in4; + out7 = tmp2 ^ in3; + out1 = tmp2 ^ in2; + out6 = out0 ^ in1 ^ in2; + out4 = out1 ^ in4 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_9B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in1 ^ in3; + tmp0 = in3 ^ in5; + out6 = in2 ^ in4; + out4 = in0 ^ in2 ^ in7; + out7 = tmp0 ^ in0; + out2 = out6 ^ in3; + out1 = out4 ^ in1 ^ in5; + out3 = out7 ^ in1 ^ in6; + out0 = tmp0 ^ out3 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + 
out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_9C: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0x9C * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_9C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in2 ^ in5; + tmp0 = in0 ^ in3 ^ in6; + out3 = out1 ^ in0; + out6 = out1 ^ in6; + out7 = tmp0 ^ in7; + out4 = out7 ^ in4; + out2 = out4 ^ in1; + out0 = tmp0 ^ out2; + out5 = out0 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_9D: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0x9D * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_9D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out6 = in2 ^ in5; + tmp0 = in0 ^ 
in3; + out5 = in1 ^ in4 ^ in7; + out1 = out6 ^ in1; + out3 = tmp0 ^ out6; + out7 = tmp0 ^ in6; + out0 = out5 ^ in0; + out4 = out7 ^ in7; + out2 = out5 ^ out7 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_9E: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0x9E * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_9E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in1 ^ in4; + tmp0 = in0 ^ in5; + out6 = in2 ^ in6; + out7 = in0 ^ in3 ^ in7; + out4 = in0 ^ in4 ^ in6; + out5 = in1 ^ in5 ^ in7; + out1 = tmp0 ^ in2; + out3 = tmp0 ^ in7; + out2 = out4 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_9F: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0x9F * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_9F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = 
out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out6 = in2; + out7 = in0 ^ in3; + tmp0 = in0 ^ in1; + out4 = in0 ^ in6; + out5 = in1 ^ in7; + out1 = tmp0 ^ in2 ^ in5; + out2 = out7 ^ in2 ^ in4 ^ in6; + out3 = out7 ^ in5 ^ in7; + out0 = tmp0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_A0: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xA0 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_A0(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in6; + out2 = tmp0 ^ in7; + tmp1 = tmp0 ^ in5; + out6 = out2 ^ in3 ^ in4; + out0 = tmp1 ^ in3; + tmp2 = out0 ^ in2; + out3 = tmp2 ^ in7; + tmp3 = tmp2 ^ in1; + out5 = tmp3 ^ in0; + out4 = tmp3 ^ out6; + out7 = out5 ^ out6 ^ in1; + out1 = tmp1 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = 
out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_A1: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xA1 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_A1(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in5; + tmp1 = tmp0 ^ in1; + tmp2 = tmp0 ^ in4; + out4 = tmp1 ^ in7; + out7 = tmp2 ^ in0; + out6 = tmp2 ^ out4 ^ in3; + out3 = out4 ^ in6; + out2 = out3 ^ in5; + out1 = out2 ^ in4; + out5 = out1 ^ out6 ^ in0; + out0 = tmp1 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_A2: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xA2 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_A2(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in6; + tmp0 = in1 ^ in3 ^ in5; + out3 = tmp0 ^ in6; + out4 = tmp0 ^ in2 ^ in4; + out0 = out3 ^ in7; + out6 = out0 ^ in4; + out1 = 
out0 ^ out4 ^ in0; + out7 = out1 ^ in5; + out5 = out7 ^ in3 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_A3: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xA3 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_A3(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in2 ^ in6; + out3 = in1 ^ in5 ^ in6; + tmp0 = out2 ^ in0; + out4 = out2 ^ out3 ^ in3; + tmp1 = tmp0 ^ in4; + out0 = tmp0 ^ out4 ^ in7; + out5 = tmp1 ^ in3; + out7 = tmp1 ^ in5; + out1 = tmp1 ^ in1 ^ in7; + out6 = tmp1 ^ out0 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_A4: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xA4 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_A4(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = 
out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in3; + tmp1 = in2 ^ in4; + tmp2 = in2 ^ in5; + tmp3 = in0 ^ in7; + out0 = tmp0 ^ in5; + out6 = tmp0 ^ in6 ^ in7; + out1 = tmp1 ^ in6; + out7 = tmp1 ^ tmp3; + out3 = tmp2 ^ in3; + tmp4 = tmp2 ^ out1; + out2 = tmp3 ^ in1; + out5 = tmp4 ^ out7; + out4 = tmp4 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_A5: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xA5 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_A5(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in2 ^ in5; + tmp0 = in1 ^ in6; + tmp1 = in0 ^ in1; + tmp2 = in2 ^ in4; + out6 = in1 ^ in3 ^ in7; + out4 = tmp0 ^ in5; + out1 = tmp0 ^ tmp2; + out0 = tmp1 ^ in3 ^ in5; + out2 = tmp1 ^ in2 ^ in7; + out7 = tmp2 ^ in0; + out5 = tmp0 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + 
out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_A6: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xA6 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_A6(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0; + out3 = in3 ^ in5 ^ in7; + out1 = in0 ^ in2 ^ in4 ^ in6; + out0 = out3 ^ in1; + out7 = out1 ^ in7; + out6 = out0 ^ in6; + out5 = out7 ^ in5; + out4 = out6 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_A7: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xA7 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_A7(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0 ^ in2; + out3 = in5 ^ in7; + out7 = out2 ^ in4 ^ in6; + out6 = out3 ^ in1 ^ in3; + out1 = out7 ^ in1; + out5 = out7 ^ in7; + out0 = out6 ^ in0; + out4 = out6 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = 
out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_A8: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xA8 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_A8(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in4; + tmp1 = in1 ^ in6; + tmp2 = in0 ^ in2 ^ in7; + out1 = tmp0 ^ in7; + out4 = tmp0 ^ in6; + out0 = tmp1 ^ in3; + out2 = tmp1 ^ in5; + out6 = tmp1 ^ in4; + out7 = tmp2 ^ in5; + out3 = tmp2 ^ out0 ^ in6; + out5 = out7 ^ in2 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_A9: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xA9 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_A9(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = 
out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in2 ^ in6; + out6 = in1 ^ in4; + out7 = in0 ^ in2 ^ in5; + out5 = in0 ^ in3 ^ in7; + out2 = out4 ^ in1 ^ in5; + out1 = out6 ^ in2 ^ in7; + out0 = out2 ^ out7 ^ in3; + out3 = out1 ^ in0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_AA: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xAA * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_AA(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in2; + tmp1 = in1 ^ in3; + tmp2 = in6 ^ in7; + out1 = tmp0 ^ in4 ^ in7; + out3 = tmp1 ^ in0; + out0 = tmp1 ^ tmp2; + out2 = tmp2 ^ in5; + out7 = tmp0 ^ out2; + out6 = out1 ^ out7 ^ in1; + out5 = out0 ^ out6 ^ in0; + out4 = out5 ^ out7 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_AB: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xAB * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_AB(void *out, void *in) +{ + unsigned int i; + 
uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0 ^ in1; + tmp0 = in1 ^ in4; + tmp1 = in0 ^ in7; + out6 = tmp0 ^ in5; + out1 = tmp0 ^ tmp1 ^ in2; + out5 = tmp1 ^ in3 ^ in4; + out0 = tmp0 ^ out5 ^ in6; + out4 = out0 ^ out3 ^ in2; + out2 = out4 ^ in3 ^ in5; + out7 = tmp1 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_AC: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xAC * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_AC(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in1 ^ in3; + out1 = in2 ^ in4; + tmp0 = in0 ^ in2; + out4 = in4 ^ in7; + out5 = in0 ^ in5; + out6 = in1 ^ in6; + out7 = tmp0 ^ in7; + out3 = tmp0 ^ in3 ^ in6; + out2 = out5 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 
3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_AD: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xAD * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_AD(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in7; + out5 = in0; + out6 = in1; + out7 = in0 ^ in2; + out0 = in0 ^ in1 ^ in3; + out2 = out7 ^ in1 ^ in5; + out1 = in1 ^ in2 ^ in4; + out3 = out7 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_AE: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xAE * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_AE(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in3 ^ in4; + tmp0 = in0 ^ in4; + tmp1 = in0 ^ in7; + out0 = in1 ^ in3 ^ in7; + 
out1 = tmp0 ^ in2; + out5 = tmp0 ^ in5; + tmp2 = tmp1 ^ in6; + out2 = tmp1 ^ in5; + out3 = tmp2 ^ in3; + out7 = tmp2 ^ in2; + out6 = tmp2 ^ out2 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_AF: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xAF * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_AF(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in3; + tmp0 = in0 ^ in7; + out5 = in0 ^ in4; + out6 = in1 ^ in5; + out7 = in0 ^ in2 ^ in6; + out0 = tmp0 ^ in1 ^ in3; + out3 = tmp0 ^ in6; + out2 = tmp0 ^ in2 ^ in5; + out1 = out5 ^ in1 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_B0: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xB0 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_B0(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 
= out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in4; + tmp1 = in3 ^ in6; + out2 = tmp0 ^ in7; + tmp2 = tmp0 ^ tmp1; + out0 = tmp2 ^ in5; + out3 = tmp2 ^ in2; + out6 = out3 ^ in6; + tmp3 = out6 ^ in0 ^ in1; + out7 = tmp3 ^ in5; + out5 = tmp3 ^ out2; + out1 = out0 ^ out5 ^ in0; + out4 = tmp1 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_B1: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xB1 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_B1(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in4; + out2 = tmp0 ^ in2 ^ in7; + tmp1 = out2 ^ in6; + out1 = tmp1 ^ in5; + out3 = tmp1 ^ in7; + out4 = tmp1 ^ in0; + out6 = out3 ^ in3; + out0 = out6 ^ in0 ^ in2 ^ in5; + out5 = tmp1 ^ out0 ^ in1; + out7 = tmp0 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; 
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_B2: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xB2 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_B2(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in4; + tmp0 = in4 ^ in7; + tmp1 = in1 ^ in3 ^ in6; + out3 = tmp0 ^ tmp1; + tmp2 = tmp1 ^ in0; + out0 = out3 ^ in5; + out4 = tmp2 ^ in2; + tmp3 = out4 ^ in6; + out5 = tmp0 ^ tmp3; + out1 = tmp3 ^ out0; + tmp4 = out1 ^ in7; + out7 = tmp4 ^ in3; + out6 = tmp2 ^ tmp4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_B3: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xB3 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_B3(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in2 ^ in4; + tmp0 = in0 ^ in5; + tmp1 = in1 ^ in6; + 
out3 = tmp1 ^ in4 ^ in7; + tmp2 = tmp0 ^ out3; + out0 = tmp2 ^ in3; + out1 = tmp2 ^ in2; + out5 = out0 ^ in2 ^ in6; + out7 = tmp1 ^ out5; + out4 = out7 ^ in1 ^ in5 ^ in7; + out6 = tmp0 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_B4: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xB4 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_B4(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in0 ^ in1; + out5 = out4 ^ in2; + tmp0 = out4 ^ in4; + out6 = out5 ^ in0 ^ in3; + out7 = tmp0 ^ out6; + out2 = tmp0 ^ in6 ^ in7; + out3 = out7 ^ in0 ^ in7; + out0 = out5 ^ out7 ^ in5; + out1 = out0 ^ out6 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_B5: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xB5 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_B5(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, 
out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in2 ^ in4; + out4 = tmp0 ^ in4; + out3 = tmp1 ^ in7; + tmp2 = out4 ^ in5; + out7 = out3 ^ in0 ^ in3; + out0 = tmp2 ^ in3; + out2 = tmp0 ^ out3 ^ in6; + out5 = tmp1 ^ tmp2; + out6 = out2 ^ out7 ^ in2; + out1 = tmp0 ^ out0 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_B6: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xB6 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_B6(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in3 ^ in4; + tmp0 = in1 ^ in2; + tmp1 = in0 ^ in4; + tmp2 = in3 ^ in5; + tmp3 = out3 ^ in1 ^ in7; + out5 = tmp0 ^ tmp1; + out6 = tmp0 ^ tmp2; + out2 = tmp1 ^ in6; + out4 = tmp1 ^ tmp3; + out0 = tmp3 ^ in5; + out1 = out2 ^ in2 ^ in5; + out7 = tmp2 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + 
out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_B7: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xB7 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_B7(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in4; + tmp0 = in0 ^ in4; + out2 = tmp0 ^ in2 ^ in6; + tmp1 = out2 ^ in7; + out1 = out2 ^ in1 ^ in5; + out7 = tmp1 ^ in3; + out5 = out1 ^ in6; + out6 = tmp0 ^ out1 ^ in3; + out0 = tmp1 ^ out6; + out4 = out0 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + /* gf8_muladd_B8: for each word index below WIDTH, loads eight uint64_t words from out (stride WIDTH), combines them through a fixed XOR network and stores each result XORed with the matching word of in back into out; in is only read. NOTE(review): presumably out = 0xB8 * out + in over GF(2^8), bit-sliced - confirm. */ +static void gf8_muladd_B8(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in4; + tmp1 = in2 ^ 
in5; + out2 = tmp0 ^ in5; + out4 = tmp1 ^ in0; + tmp2 = tmp1 ^ in7; + out6 = tmp2 ^ out2; + out7 = out4 ^ in3; + out1 = tmp2 ^ in4; + out3 = tmp0 ^ out7; + out0 = out3 ^ out4 ^ in6; + out5 = out0 ^ in0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_B9(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in2; + tmp1 = in4 ^ in5; + out4 = tmp0 ^ tmp1; + tmp2 = tmp0 ^ in3 ^ in7; + out3 = out4 ^ in1; + out7 = tmp2 ^ in5; + out2 = out3 ^ in0; + out1 = out2 ^ in7; + out6 = out1 ^ in5 ^ in6; + out0 = tmp2 ^ out6; + out5 = tmp1 ^ out0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_BA(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { 
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in5 ^ in7; + out2 = tmp0 ^ in4; + tmp1 = out2 ^ in2; + out1 = tmp1 ^ in0; + out6 = tmp1 ^ in1; + out4 = out1 ^ in3 ^ in4; + tmp2 = out4 ^ out6; + out7 = out4 ^ in6 ^ in7; + out5 = tmp2 ^ in6; + out3 = tmp0 ^ tmp2; + out0 = out6 ^ out7 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_BB(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in2 ^ in4 ^ in5 ^ in7; + tmp0 = out2 ^ in1; + out4 = out2 ^ in0 ^ in3; + out1 = tmp0 ^ in0; + out6 = tmp0 ^ in6; + out3 = out1 ^ in2; + tmp1 = out4 ^ out6 ^ in4; + out0 = tmp1 ^ in7; + out5 = tmp1 ^ in5; + out7 = tmp0 ^ tmp1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + 
out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_BC(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in2; + tmp1 = in2 ^ in4; + out0 = in1 ^ in3 ^ in4; + out6 = in1 ^ in2 ^ in7; + out7 = tmp0 ^ in3; + out5 = tmp0 ^ out6 ^ in6; + out1 = tmp1 ^ in5; + tmp2 = out1 ^ out5 ^ in1; + out3 = tmp2 ^ in3; + out4 = tmp1 ^ tmp2; + out2 = tmp2 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_BD(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; 
+ + tmp0 = in0 ^ in3; + tmp1 = in1 ^ in4; + out0 = tmp0 ^ tmp1; + out7 = tmp0 ^ in2 ^ in7; + out1 = tmp1 ^ in2 ^ in5; + tmp2 = out1 ^ in0; + out2 = tmp2 ^ in6; + out3 = out2 ^ in1 ^ in7; + out4 = out3 ^ in2; + out5 = tmp1 ^ out4; + out6 = tmp2 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_BE(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3 ^ in6; + out4 = tmp0 ^ in5; + out7 = tmp0 ^ in2; + out3 = out4 ^ in4; + out1 = out3 ^ out7 ^ in0; + out2 = out3 ^ in3 ^ in7; + out0 = out2 ^ out4 ^ in1; + out5 = tmp0 ^ out0; + out6 = out1 ^ out5 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_BF(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < 
WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in4; + out3 = tmp0 ^ in5 ^ in6; + out4 = out3 ^ in3; + tmp1 = out3 ^ in7; + out2 = tmp1 ^ in2; + out5 = tmp1 ^ in1; + tmp2 = out2 ^ in5; + out7 = tmp2 ^ in3 ^ in4; + tmp3 = tmp0 ^ out5; + out0 = tmp3 ^ out4; + out1 = tmp2 ^ tmp3; + out6 = tmp3 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_C0(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in2 ^ in5; + tmp0 = in1 ^ in4; + tmp1 = in3 ^ in6; + out0 = out5 ^ in1; + out4 = tmp0 ^ in7; + out3 = tmp0 ^ tmp1; + out1 = tmp1 ^ in2; + out6 = tmp1 ^ in0; + out7 = out4 ^ in0; + out2 = out4 ^ out5 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH 
* 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_C1(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in2; + tmp0 = in0 ^ in1; + out4 = in1 ^ in7; + out6 = in0 ^ in3; + out3 = in1 ^ in4 ^ in6; + tmp1 = tmp0 ^ in2; + out7 = tmp0 ^ in4; + out0 = tmp1 ^ in5; + out1 = tmp1 ^ out6 ^ in6; + out2 = out6 ^ out7 ^ in5 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_C2(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in1 ^ in3 ^ in4; + tmp0 
= in0 ^ in3 ^ in6; + out5 = in2 ^ in4 ^ in5; + tmp1 = out4 ^ in7; + out1 = tmp0 ^ in2; + out6 = tmp0 ^ in5; + out2 = out5 ^ in3; + out7 = tmp0 ^ tmp1; + out3 = tmp1 ^ in2 ^ in6; + out0 = tmp1 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_C3(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in1 ^ in3; + tmp0 = in0 ^ in2; + tmp1 = in3 ^ in5; + out5 = in2 ^ in4; + tmp2 = tmp0 ^ out4; + out2 = tmp1 ^ in4; + out6 = tmp1 ^ in0; + out0 = tmp1 ^ tmp2 ^ in7; + out1 = tmp2 ^ in6; + out7 = out1 ^ out5 ^ in3; + out3 = tmp0 ^ out7 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_C4(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + 
uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in7; + out3 = tmp0 ^ in4; + tmp1 = tmp0 ^ in2; + out1 = tmp1 ^ in6; + out5 = tmp1 ^ in5; + out4 = out1 ^ out3 ^ in1; + out0 = out4 ^ in4 ^ in5; + out2 = out0 ^ out3 ^ in0; + out7 = out1 ^ out2 ^ in7; + out6 = tmp1 ^ out0 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_C5(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in4 ^ in7; + tmp0 = in3 ^ in7; + out4 = in1 ^ in2 ^ in6; + out6 = in0 ^ in3 ^ in4; + out5 = tmp0 ^ in2; + out1 = tmp0 ^ out4; + out0 = out4 ^ in0 ^ in5; + out2 = out0 ^ out5 ^ in4; + out7 = tmp0 ^ out2 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 
4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_C6(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in5 ^ in6; + tmp1 = in1 ^ in7; + tmp2 = tmp0 ^ in0; + tmp3 = tmp0 ^ tmp1; + tmp4 = tmp2 ^ in4; + out0 = tmp3 ^ in2; + out6 = tmp4 ^ in3; + out2 = out6 ^ in2; + out7 = tmp1 ^ tmp4; + out3 = tmp2 ^ out2; + tmp5 = out3 ^ in5; + out5 = tmp5 ^ in7; + out4 = tmp3 ^ tmp5; + out1 = tmp4 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_C7(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = 
out_ptr[WIDTH * 7]; + + out3 = in2 ^ in4; + tmp0 = in3 ^ in5; + tmp1 = out3 ^ in7; + out6 = tmp0 ^ in0 ^ in4; + out5 = tmp1 ^ in3; + out2 = out6 ^ in6; + out7 = out2 ^ in1 ^ in3; + out0 = tmp1 ^ out7; + out1 = tmp0 ^ out0; + out4 = out1 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_C8(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in1 ^ in2; + out1 = in2 ^ in3; + tmp0 = in5 ^ in6; + tmp1 = in0 ^ in7; + out2 = out1 ^ in1 ^ in4; + out4 = tmp0 ^ in4; + out5 = tmp0 ^ in7; + out6 = tmp1 ^ in6; + out7 = tmp1 ^ in1; + out3 = out2 ^ in0 ^ in2 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_C9(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < 
WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in5 ^ in6; + out7 = in0 ^ in1; + tmp0 = in1 ^ in3; + out5 = in6 ^ in7; + out6 = in0 ^ in7; + out0 = out7 ^ in2; + out3 = out7 ^ in4 ^ in5; + out1 = tmp0 ^ in2; + out2 = tmp0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_CA(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in7; + tmp1 = in2 ^ in7; + tmp2 = tmp0 ^ in6; + out0 = tmp1 ^ in1; + tmp3 = tmp1 ^ in3; + out6 = tmp2 ^ in5; + out7 = tmp2 ^ in1; + out2 = tmp3 ^ in4; + out5 = out6 ^ in0 ^ in4; + out4 = out5 ^ in3; + out1 = tmp0 ^ tmp3; + out3 = tmp3 ^ out5 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = 
out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_CB(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in4 ^ in7; + tmp1 = in5 ^ in7; + out7 = in0 ^ in1 ^ in6; + out5 = tmp0 ^ in6; + out2 = tmp0 ^ in3; + out6 = tmp1 ^ in0; + out4 = tmp1 ^ in3 ^ in6; + tmp2 = out5 ^ out7 ^ in2; + out1 = tmp2 ^ out2; + out0 = tmp2 ^ in4; + out3 = tmp2 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_CC(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in5; 
+ tmp1 = in1 ^ in6; + out1 = in2 ^ in3 ^ in7; + out5 = tmp0 ^ in6; + out0 = tmp1 ^ in2; + tmp2 = out5 ^ in0 ^ in7; + out3 = tmp2 ^ in4; + out6 = tmp0 ^ out3; + out7 = tmp1 ^ tmp2 ^ in3; + tmp3 = out1 ^ out6; + out4 = tmp2 ^ tmp3; + out2 = tmp3 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_CD(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in3 ^ in6; + tmp0 = in0 ^ in1; + tmp1 = in2 ^ in7; + out6 = in0 ^ in4 ^ in7; + out2 = tmp0 ^ out5 ^ in4; + out7 = tmp0 ^ in5; + out0 = tmp0 ^ in2 ^ in6; + out4 = tmp1 ^ in5; + out1 = tmp1 ^ in1 ^ in3; + out3 = out6 ^ in5 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_CE(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + 
for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in5; + tmp1 = tmp0 ^ in3; + out4 = tmp1 ^ in4; + tmp2 = out4 ^ in6; + out3 = tmp2 ^ in0; + out5 = tmp2 ^ in2; + out2 = out3 ^ in5 ^ in7; + out6 = tmp1 ^ out2; + out7 = out2 ^ out4 ^ in1; + out1 = tmp2 ^ out6; + out0 = tmp0 ^ out7 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_CF(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in6; + tmp1 = in0 ^ in1 ^ in5; + out4 = in2 ^ in3 ^ in5; + out5 = tmp0 ^ in4; + out7 = tmp1 ^ in6; + out1 = tmp1 ^ out4 ^ in7; + tmp2 = out5 ^ in0; + out2 = tmp2 ^ in7; + out3 = tmp2 ^ out4; + out6 = tmp0 ^ out2 ^ in5; + out0 = tmp0 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; 
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_D0(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3; + tmp1 = in1 ^ in4; + tmp2 = in2 ^ in5; + out7 = tmp0 ^ tmp1; + out0 = tmp1 ^ tmp2; + tmp3 = tmp2 ^ in3; + out1 = tmp3 ^ in6; + tmp4 = out1 ^ in1; + out2 = tmp4 ^ in7; + out3 = out2 ^ in2; + out4 = tmp0 ^ out3; + out5 = tmp3 ^ out3; + out6 = tmp4 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_D1(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = 
out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in5 ^ in6; + tmp1 = tmp0 ^ in1; + out1 = tmp1 ^ in2; + out2 = tmp1 ^ in7; + out3 = out2 ^ in3; + out5 = out3 ^ in2; + tmp2 = out3 ^ in0; + out4 = tmp2 ^ in4; + out7 = tmp0 ^ out4; + out6 = tmp2 ^ out1 ^ in6; + out0 = out2 ^ out6 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_D2(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in5 ^ in6; + out2 = tmp0 ^ in2 ^ in3; + out1 = out2 ^ in0; + out3 = out2 ^ in1; + out4 = out1 ^ in1 ^ in2; + out6 = out1 ^ in6 ^ in7; + out7 = out4 ^ in4 ^ in5; + out5 = out4 ^ out6 ^ in4; + out0 = tmp0 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_D3(void *out, void *in) +{ + unsigned int i; + 
uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in3 ^ in5 ^ in6; + tmp0 = out2 ^ in2; + tmp1 = tmp0 ^ in1; + out1 = tmp1 ^ in0; + out3 = tmp1 ^ in3; + out4 = out1 ^ in2 ^ in4; + tmp2 = out4 ^ in5; + out7 = tmp2 ^ in7; + out0 = tmp0 ^ out7; + tmp3 = out0 ^ in0; + out5 = tmp3 ^ in6; + out6 = tmp2 ^ tmp3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_D4(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in3 ^ in5; + tmp0 = in1 ^ in5; + tmp1 = tmp0 ^ in2; + out4 = tmp1 ^ in0; + tmp2 = tmp1 ^ in6; + out2 = out4 ^ in3 ^ in7; + out0 = tmp2 ^ in4; + out5 = tmp2 ^ out3; + out1 = tmp0 ^ out5 ^ in7; + out6 = tmp0 ^ out2 ^ in4; + out7 = tmp1 ^ out6 ^ in7; + + out_ptr[0] = out0 ^ 
in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_D5(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in5; + tmp0 = in0 ^ in4; + tmp1 = tmp0 ^ in1 ^ in5; + out4 = tmp1 ^ in2; + out0 = out4 ^ in6; + tmp2 = tmp0 ^ out0; + out5 = tmp2 ^ in3; + out1 = out5 ^ in7; + out6 = tmp1 ^ out1; + out7 = tmp2 ^ out6; + out2 = out7 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_D6(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = 
out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in2 ^ in4 ^ in6; + out5 = tmp0 ^ in3; + out0 = tmp0 ^ in5 ^ in7; + out3 = out0 ^ out5 ^ in2; + tmp1 = out3 ^ in0; + out1 = tmp1 ^ in6; + out2 = tmp1 ^ in7; + out4 = tmp1 ^ in1; + out6 = tmp1 ^ in4; + out7 = tmp0 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_D7(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3; + out3 = in2 ^ in5 ^ in7; + out2 = tmp0 ^ in5; + tmp1 = tmp0 ^ out3 ^ in1; + out1 = tmp1 ^ in6; + out4 = tmp1 ^ in4; + tmp2 = out1 ^ in4; + out6 = tmp2 ^ in1; + out7 = tmp2 ^ in2; + out0 = tmp2 ^ in3; + out5 = tmp2 ^ in0 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void 
gf8_muladd_D8(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in0; + out5 = in1; + tmp0 = in1 ^ in2; + out6 = in0 ^ in2; + out0 = tmp0 ^ in4; + tmp1 = tmp0 ^ in3; + out7 = tmp1 ^ out6; + out2 = tmp1 ^ in6; + out3 = out7 ^ in7; + out1 = tmp1 ^ in1 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_D9(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in0 ^ in4; + out5 = in1 ^ in5; + out2 = in1 ^ in3 ^ in6; + out3 = in0 ^ in1 ^ in7; + out6 = in0 ^ in2 ^ in6; + out0 = out4 ^ in1 ^ in2; + out1 = out5 ^ in2 ^ in3; + out7 = out3 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + 
out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_DA(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in1 ^ in4; + tmp0 = in2 ^ in7; + tmp1 = in0 ^ in2 ^ in3; + out0 = tmp0 ^ out5; + out4 = tmp0 ^ tmp1; + out2 = tmp0 ^ in3 ^ in6; + out1 = tmp1 ^ in5; + out3 = tmp1 ^ in1; + out6 = out1 ^ in3; + out7 = out3 ^ in2 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_DB(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t 
in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in1 ^ in5; + tmp2 = in3 ^ in7; + out3 = tmp0 ^ in2; + out5 = tmp1 ^ in4; + out6 = tmp1 ^ out3 ^ in6; + out2 = tmp2 ^ in6; + tmp3 = tmp2 ^ in4; + tmp4 = out3 ^ in3; + out4 = tmp3 ^ in0; + out1 = tmp4 ^ in5; + out0 = tmp3 ^ tmp4; + out7 = tmp0 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_DC(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in2; + tmp1 = in0 ^ in3; + out6 = tmp0 ^ in4; + tmp2 = tmp0 ^ in7; + out3 = tmp1 ^ in6; + tmp3 = tmp1 ^ in1; + out1 = tmp1 ^ tmp2 ^ in5; + out4 = tmp2 ^ in6; + out2 = tmp3 ^ in2; + out7 = tmp3 ^ in5; + out5 = tmp2 ^ out2; + out0 = out2 ^ out3 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_DD(void *out, void *in) +{ + 
unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0 ^ in6; + out2 = in0 ^ in1 ^ in3; + out6 = out3 ^ in2 ^ in4; + out7 = out2 ^ in5 ^ in7; + out0 = out6 ^ in1; + out4 = out6 ^ in7; + out5 = out7 ^ in0; + out1 = out5 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_DE(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3 ^ in6; + tmp1 = in3 ^ in4 ^ in7; + out4 = tmp0 ^ in0; + out5 = tmp1 ^ in1; + out3 = out4 ^ in7; + out2 = out3 ^ in6; + out1 = out2 ^ in5; + out6 = tmp1 ^ out1; + out0 = tmp0 ^ out5; + out7 = out0 ^ out1 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ 
in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_DF(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0 ^ in3 ^ in7; + tmp0 = out2 ^ in1 ^ in5; + out1 = tmp0 ^ in2; + out7 = tmp0 ^ in6; + out5 = tmp0 ^ in0 ^ in4; + tmp1 = out1 ^ out5 ^ in6; + out4 = tmp1 ^ in3; + out6 = tmp1 ^ in5; + tmp2 = tmp1 ^ in7; + out0 = tmp2 ^ in1; + out3 = tmp2 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_E0(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = 
out_ptr[WIDTH * 7]; + + out3 = in1 ^ in7; + tmp0 = in2 ^ in4; + out4 = out3 ^ in3 ^ in5; + out2 = tmp0 ^ in1; + tmp1 = tmp0 ^ in6; + out0 = out4 ^ in2; + out6 = out4 ^ in0; + out1 = tmp1 ^ in3; + out5 = tmp1 ^ in0; + out7 = out5 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_E1(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in1 ^ in4; + tmp0 = in1 ^ in7; + out3 = tmp0 ^ in3; + tmp1 = out3 ^ in5; + out4 = tmp1 ^ in4; + tmp2 = tmp1 ^ in0; + out0 = tmp2 ^ in2; + out6 = tmp2 ^ in6; + tmp3 = out0 ^ out4 ^ in6; + out5 = tmp3 ^ in5; + out7 = tmp0 ^ tmp3; + out1 = tmp2 ^ out5 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_E2(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr 
= (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in1 ^ in2; + out4 = in1 ^ in5; + out2 = in2 ^ in4 ^ in7; + out5 = in0 ^ in2 ^ in6; + out0 = out3 ^ in3 ^ in5; + out7 = out3 ^ in0 ^ in4; + out6 = out2 ^ out7 ^ in3; + out1 = out5 ^ in3 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_E3(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in4 ^ in7; + tmp0 = in1 ^ in3; + out3 = tmp0 ^ in2; + tmp1 = out3 ^ in0; + out0 = tmp1 ^ in5; + tmp2 = tmp1 ^ in4; + out1 = tmp2 ^ in6; + tmp3 = tmp2 ^ in3; + out7 = tmp3 ^ in7; + out6 = out1 ^ out2 ^ in2; + tmp4 = tmp0 ^ out0; + out5 = tmp4 ^ in6; + out4 = tmp3 ^ tmp4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] 
= out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_E4(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in6; + tmp0 = in0 ^ in4; + tmp1 = tmp0 ^ in2 ^ in6; + out2 = tmp1 ^ in1; + out7 = out2 ^ in5; + tmp2 = tmp0 ^ out7; + out4 = tmp2 ^ in3; + out0 = out4 ^ in7; + out6 = tmp1 ^ out0; + out5 = tmp2 ^ out6; + out1 = out5 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_E5(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH 
* 7]; + + out3 = in3 ^ in6; + tmp0 = in0 ^ in1; + tmp1 = in5 ^ in7; + out2 = tmp0 ^ in4 ^ in6; + tmp2 = tmp1 ^ out2; + out6 = tmp2 ^ in3; + out7 = tmp2 ^ in2; + out0 = out6 ^ in2 ^ in4; + out5 = out6 ^ in1 ^ in2; + out1 = tmp0 ^ out5 ^ in5; + out4 = tmp1 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_E6(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in2 ^ in6 ^ in7; + out2 = out3 ^ in0 ^ in4; + out4 = out3 ^ in1 ^ in5; + out1 = out2 ^ in3; + out7 = out2 ^ out4 ^ in2; + out0 = out4 ^ in3 ^ in7; + out5 = out1 ^ in4; + out6 = out0 ^ out2 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_E7(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t 
out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + out3 = tmp0 ^ in6 ^ in7; + tmp1 = out3 ^ in0; + out5 = tmp1 ^ in5; + tmp2 = tmp1 ^ in4; + tmp3 = out5 ^ in7; + out1 = tmp2 ^ in1; + out0 = tmp3 ^ in1; + out6 = out1 ^ in2; + out2 = tmp0 ^ tmp2; + tmp4 = tmp3 ^ out6; + out4 = tmp4 ^ in6; + out7 = tmp4 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_E8(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in3 ^ in6; + tmp0 = in4 ^ in7; + out1 = in2 ^ in3 ^ in4; + out5 = tmp0 ^ in0; + tmp1 = tmp0 ^ in1; + tmp2 = tmp1 ^ in5; + out0 = tmp1 ^ out1; + out2 = tmp2 ^ in2; + out6 = tmp2 ^ out5; + tmp3 = out6 ^ in6; + out3 = tmp3 ^ in7; + out7 = tmp3 ^ in2 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ 
in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_E9(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in3 ^ in6; + tmp2 = tmp0 ^ in6; + out4 = tmp1 ^ in4; + out6 = tmp2 ^ in5; + out7 = tmp2 ^ in2 ^ in7; + out3 = out6 ^ in3 ^ in7; + out0 = tmp1 ^ out7; + out2 = out3 ^ out4 ^ in0; + out5 = tmp0 ^ out2; + out1 = out0 ^ out5 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_EA(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH 
* 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in6 ^ in7; + out5 = in0 ^ in7; + out6 = in0 ^ in1; + out0 = in1 ^ in2 ^ in3; + out2 = in2 ^ in4 ^ in5; + out7 = out6 ^ in2; + out1 = out0 ^ out6 ^ in4; + out3 = out7 ^ in5 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_EB(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in4 ^ in5; + tmp0 = in0 ^ in1; + out4 = in4 ^ in6 ^ in7; + out5 = in0 ^ in5 ^ in7; + out6 = tmp0 ^ in6; + tmp1 = tmp0 ^ in2; + out0 = tmp1 ^ in3; + out7 = tmp1 ^ in7; + out1 = out0 ^ in4; + out3 = out0 ^ in5 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_EC(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + 
uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0 ^ in5; + out4 = in2 ^ in3 ^ in7; + out5 = in0 ^ in3 ^ in4; + out6 = out3 ^ in1 ^ in4; + out1 = out4 ^ in4; + out0 = out4 ^ in1 ^ in6; + out2 = out0 ^ out5 ^ in5; + out7 = out2 ^ in4 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_ED(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in4; + tmp1 = in3 ^ in5; + out4 = tmp0 ^ in3 ^ in7; + out3 = tmp1 ^ in0; + out1 = out4 ^ in1; + out5 = out3 ^ in4; + out7 = out1 ^ out5 ^ in6; + out2 = tmp0 ^ out7; + out0 = tmp1 ^ out7; + out6 = out2 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + 
out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_EE(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in2; + tmp0 = in0 ^ in1; + out5 = in0 ^ in3; + tmp1 = tmp0 ^ in2; + out6 = tmp0 ^ in4; + tmp2 = tmp1 ^ out5; + out7 = tmp1 ^ in5; + out1 = tmp2 ^ out6 ^ in7; + out0 = tmp2 ^ in6; + tmp3 = out7 ^ in1; + out3 = tmp3 ^ in7; + out2 = tmp3 ^ in4 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_EF(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in2 ^ in4; + tmp0 = in0 ^ in5; + tmp1 = in4 ^ in6; + out5 = tmp0 ^ in3; + out2 = tmp0 
^ tmp1; + out6 = tmp1 ^ in0 ^ in1; + out3 = out5 ^ in2 ^ in7; + out7 = out3 ^ in1 ^ in3; + out0 = out4 ^ out6 ^ in3; + out1 = tmp1 ^ out0 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_F0(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in2; + tmp1 = in4 ^ in5; + out2 = tmp0 ^ in6; + out3 = tmp1 ^ in1; + tmp2 = tmp1 ^ in7; + out1 = out2 ^ out3 ^ in3; + tmp3 = tmp0 ^ tmp2; + out0 = tmp3 ^ in3; + out5 = tmp3 ^ in0; + out4 = out1 ^ out5 ^ in4; + out7 = out4 ^ in2; + out6 = tmp2 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_F1(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, 
out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in1 ^ in6; + tmp0 = in3 ^ in5; + out3 = tmp0 ^ in1 ^ in4; + tmp1 = out3 ^ in2; + out1 = tmp1 ^ in6; + tmp2 = tmp1 ^ in0; + tmp3 = out1 ^ in5; + out0 = tmp2 ^ in7; + out6 = tmp2 ^ in4; + out7 = tmp3 ^ in0; + out5 = tmp0 ^ out0; + out4 = tmp3 ^ out5 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_F2(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in4 ^ in5; + out2 = in2 ^ in6 ^ in7; + tmp1 = tmp0 ^ in1; + tmp2 = tmp1 ^ in2; + out0 = tmp2 ^ in3; + out3 = tmp2 ^ in7; + out5 = out3 ^ in0 ^ in4; + tmp3 = tmp0 ^ out5; + out7 = tmp3 ^ in3; + out4 = tmp3 ^ out2; + out1 = out0 ^ out4 ^ in4; + out6 = tmp1 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = 
out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_F3(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in6 ^ in7; + tmp0 = in0 ^ in1; + out4 = tmp0 ^ in6; + tmp1 = tmp0 ^ in2; + out5 = tmp1 ^ in7; + out6 = tmp1 ^ in3; + out7 = out6 ^ in4; + out0 = out7 ^ in5; + out1 = out0 ^ in6; + out3 = out0 ^ in0 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_F4(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0 ^ in1 ^ in2; + 
tmp0 = out2 ^ in3; + out4 = tmp0 ^ in4; + out5 = out4 ^ in5; + out6 = out5 ^ in6; + out7 = out6 ^ in7; + out0 = out7 ^ in0; + out1 = out0 ^ in1; + out3 = tmp0 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_F5(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0 ^ in1; + tmp0 = out2 ^ in2; + out4 = tmp0 ^ in3; + out5 = out4 ^ in4; + out6 = out5 ^ in5; + out7 = out6 ^ in6; + out0 = out7 ^ in7; + out1 = out0 ^ in0; + out3 = tmp0 ^ out0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_F6(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = 
out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in7; + out2 = tmp0 ^ in2; + out4 = out2 ^ in1 ^ in4; + out7 = out4 ^ in3 ^ in5; + out5 = out7 ^ in4 ^ in7; + out0 = tmp0 ^ out7 ^ in6; + tmp1 = out0 ^ in1; + out6 = out0 ^ in0 ^ in5; + out3 = tmp1 ^ in3; + out1 = tmp0 ^ tmp1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_F7(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0 ^ in7; + tmp0 = out2 ^ in1; + out4 = tmp0 ^ in2; + out5 = out4 ^ in3 ^ in7; + out6 = out5 ^ in4; + out7 = out6 ^ in5; + out0 = out7 ^ in6; + out1 = out0 ^ in7; + out3 = tmp0 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = 
out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_F8(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in4; + tmp1 = in3 ^ in5; + tmp2 = tmp0 ^ in6; + out4 = tmp0 ^ tmp1; + out1 = tmp1 ^ in2 ^ in4; + out3 = tmp2 ^ in1; + out5 = out3 ^ in5; + out7 = out1 ^ out5 ^ in7; + out6 = tmp1 ^ out7; + out0 = tmp2 ^ out7; + out2 = out6 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_F9(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in5; + tmp1 = in0 ^ in6; + out4 = tmp0 ^ in0; + tmp2 = tmp1 ^ in4; + tmp3 = tmp1 ^ in2; + out5 = tmp2 ^ in1; + out3 = out5 ^ in3; + tmp4 = tmp3 ^ 
out3; + out1 = tmp4 ^ in5; + out0 = tmp4 ^ in0 ^ in7; + out6 = tmp0 ^ out0 ^ in4; + out7 = tmp2 ^ tmp4; + out2 = tmp3 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_FA(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = tmp0 ^ in2; + tmp2 = tmp0 ^ in5; + tmp3 = tmp1 ^ in7; + out5 = tmp2 ^ in6; + out6 = tmp3 ^ in6; + out7 = tmp3 ^ in3; + out3 = out6 ^ in4; + out2 = tmp1 ^ out5; + out4 = out2 ^ out3 ^ in1; + out0 = out4 ^ out7 ^ in5; + out1 = tmp2 ^ out0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_FB(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, 
out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in5 ^ in6; + tmp0 = in0 ^ in1; + out4 = in0 ^ in5 ^ in7; + out5 = tmp0 ^ in6; + tmp1 = tmp0 ^ in2; + out6 = tmp1 ^ in7; + out7 = tmp1 ^ in3; + out0 = out7 ^ in4; + out1 = out0 ^ in5; + out3 = out0 ^ in6 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_FC(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in2; + tmp1 = in0 ^ in7; + out2 = tmp0 ^ tmp1 ^ in5; + out3 = tmp1 ^ in4; + tmp2 = out2 ^ in6; + out6 = tmp2 ^ in4; + out7 = tmp2 ^ in3; + out4 = out6 ^ in1 ^ in3; + tmp3 = out4 ^ in0; + out1 = tmp3 ^ in6; + out0 = tmp3 ^ in1 ^ in5; + out5 = tmp0 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + 
out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_FD(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in5; + tmp1 = in1 ^ in7; + out2 = tmp0 ^ tmp1; + out6 = out2 ^ in2 ^ in4; + tmp2 = out6 ^ in0; + out1 = tmp2 ^ in3; + out0 = tmp0 ^ out1 ^ in6; + out5 = out0 ^ in2; + tmp3 = out5 ^ in1; + out3 = tmp3 ^ in6; + out7 = tmp2 ^ tmp3; + out4 = tmp1 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_FE(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ 
in2; + out2 = tmp0 ^ in5; + out3 = tmp0 ^ in4; + tmp1 = out3 ^ in6; + out4 = tmp1 ^ in5; + tmp2 = tmp1 ^ in1; + out6 = tmp2 ^ in7; + tmp3 = tmp2 ^ in0; + out0 = tmp3 ^ in3; + tmp4 = out0 ^ out4 ^ in7; + out5 = tmp4 ^ in6; + out7 = tmp4 ^ in2; + out1 = tmp3 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void gf8_muladd_FF(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0 ^ in5; + tmp0 = in4 ^ in7; + tmp1 = out2 ^ in2; + out4 = tmp1 ^ in6; + out7 = tmp1 ^ in1 ^ in3; + out1 = tmp0 ^ out7; + tmp2 = out1 ^ in5; + out6 = tmp2 ^ in3; + tmp3 = tmp2 ^ in7; + out0 = tmp3 ^ in6; + out3 = tmp3 ^ in1; + out5 = tmp0 ^ out0 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void (*gf8_muladd[])(void *out, void *in) = { + gf8_muladd_00, gf8_muladd_01, 
gf8_muladd_02, gf8_muladd_03, + gf8_muladd_04, gf8_muladd_05, gf8_muladd_06, gf8_muladd_07, + gf8_muladd_08, gf8_muladd_09, gf8_muladd_0A, gf8_muladd_0B, + gf8_muladd_0C, gf8_muladd_0D, gf8_muladd_0E, gf8_muladd_0F, + gf8_muladd_10, gf8_muladd_11, gf8_muladd_12, gf8_muladd_13, + gf8_muladd_14, gf8_muladd_15, gf8_muladd_16, gf8_muladd_17, + gf8_muladd_18, gf8_muladd_19, gf8_muladd_1A, gf8_muladd_1B, + gf8_muladd_1C, gf8_muladd_1D, gf8_muladd_1E, gf8_muladd_1F, + gf8_muladd_20, gf8_muladd_21, gf8_muladd_22, gf8_muladd_23, + gf8_muladd_24, gf8_muladd_25, gf8_muladd_26, gf8_muladd_27, + gf8_muladd_28, gf8_muladd_29, gf8_muladd_2A, gf8_muladd_2B, + gf8_muladd_2C, gf8_muladd_2D, gf8_muladd_2E, gf8_muladd_2F, + gf8_muladd_30, gf8_muladd_31, gf8_muladd_32, gf8_muladd_33, + gf8_muladd_34, gf8_muladd_35, gf8_muladd_36, gf8_muladd_37, + gf8_muladd_38, gf8_muladd_39, gf8_muladd_3A, gf8_muladd_3B, + gf8_muladd_3C, gf8_muladd_3D, gf8_muladd_3E, gf8_muladd_3F, + gf8_muladd_40, gf8_muladd_41, gf8_muladd_42, gf8_muladd_43, + gf8_muladd_44, gf8_muladd_45, gf8_muladd_46, gf8_muladd_47, + gf8_muladd_48, gf8_muladd_49, gf8_muladd_4A, gf8_muladd_4B, + gf8_muladd_4C, gf8_muladd_4D, gf8_muladd_4E, gf8_muladd_4F, + gf8_muladd_50, gf8_muladd_51, gf8_muladd_52, gf8_muladd_53, + gf8_muladd_54, gf8_muladd_55, gf8_muladd_56, gf8_muladd_57, + gf8_muladd_58, gf8_muladd_59, gf8_muladd_5A, gf8_muladd_5B, + gf8_muladd_5C, gf8_muladd_5D, gf8_muladd_5E, gf8_muladd_5F, + gf8_muladd_60, gf8_muladd_61, gf8_muladd_62, gf8_muladd_63, + gf8_muladd_64, gf8_muladd_65, gf8_muladd_66, gf8_muladd_67, + gf8_muladd_68, gf8_muladd_69, gf8_muladd_6A, gf8_muladd_6B, + gf8_muladd_6C, gf8_muladd_6D, gf8_muladd_6E, gf8_muladd_6F, + gf8_muladd_70, gf8_muladd_71, gf8_muladd_72, gf8_muladd_73, + gf8_muladd_74, gf8_muladd_75, gf8_muladd_76, gf8_muladd_77, + gf8_muladd_78, gf8_muladd_79, gf8_muladd_7A, gf8_muladd_7B, + gf8_muladd_7C, gf8_muladd_7D, gf8_muladd_7E, gf8_muladd_7F, + gf8_muladd_80, gf8_muladd_81, gf8_muladd_82, 
gf8_muladd_83, + gf8_muladd_84, gf8_muladd_85, gf8_muladd_86, gf8_muladd_87, + gf8_muladd_88, gf8_muladd_89, gf8_muladd_8A, gf8_muladd_8B, + gf8_muladd_8C, gf8_muladd_8D, gf8_muladd_8E, gf8_muladd_8F, + gf8_muladd_90, gf8_muladd_91, gf8_muladd_92, gf8_muladd_93, + gf8_muladd_94, gf8_muladd_95, gf8_muladd_96, gf8_muladd_97, + gf8_muladd_98, gf8_muladd_99, gf8_muladd_9A, gf8_muladd_9B, + gf8_muladd_9C, gf8_muladd_9D, gf8_muladd_9E, gf8_muladd_9F, + gf8_muladd_A0, gf8_muladd_A1, gf8_muladd_A2, gf8_muladd_A3, + gf8_muladd_A4, gf8_muladd_A5, gf8_muladd_A6, gf8_muladd_A7, + gf8_muladd_A8, gf8_muladd_A9, gf8_muladd_AA, gf8_muladd_AB, + gf8_muladd_AC, gf8_muladd_AD, gf8_muladd_AE, gf8_muladd_AF, + gf8_muladd_B0, gf8_muladd_B1, gf8_muladd_B2, gf8_muladd_B3, + gf8_muladd_B4, gf8_muladd_B5, gf8_muladd_B6, gf8_muladd_B7, + gf8_muladd_B8, gf8_muladd_B9, gf8_muladd_BA, gf8_muladd_BB, + gf8_muladd_BC, gf8_muladd_BD, gf8_muladd_BE, gf8_muladd_BF, + gf8_muladd_C0, gf8_muladd_C1, gf8_muladd_C2, gf8_muladd_C3, + gf8_muladd_C4, gf8_muladd_C5, gf8_muladd_C6, gf8_muladd_C7, + gf8_muladd_C8, gf8_muladd_C9, gf8_muladd_CA, gf8_muladd_CB, + gf8_muladd_CC, gf8_muladd_CD, gf8_muladd_CE, gf8_muladd_CF, + gf8_muladd_D0, gf8_muladd_D1, gf8_muladd_D2, gf8_muladd_D3, + gf8_muladd_D4, gf8_muladd_D5, gf8_muladd_D6, gf8_muladd_D7, + gf8_muladd_D8, gf8_muladd_D9, gf8_muladd_DA, gf8_muladd_DB, + gf8_muladd_DC, gf8_muladd_DD, gf8_muladd_DE, gf8_muladd_DF, + gf8_muladd_E0, gf8_muladd_E1, gf8_muladd_E2, gf8_muladd_E3, + gf8_muladd_E4, gf8_muladd_E5, gf8_muladd_E6, gf8_muladd_E7, + gf8_muladd_E8, gf8_muladd_E9, gf8_muladd_EA, gf8_muladd_EB, + gf8_muladd_EC, gf8_muladd_ED, gf8_muladd_EE, gf8_muladd_EF, + gf8_muladd_F0, gf8_muladd_F1, gf8_muladd_F2, gf8_muladd_F3, + gf8_muladd_F4, gf8_muladd_F5, gf8_muladd_F6, gf8_muladd_F7, + gf8_muladd_F8, gf8_muladd_F9, gf8_muladd_FA, gf8_muladd_FB, + gf8_muladd_FC, gf8_muladd_FD, gf8_muladd_FE, gf8_muladd_FF +}; + +static uint64_t zero[EC_METHOD_WORD_SIZE * 8] = {0, }; + 
+void ec_code_c_prepare(ec_gf_t *gf, uint32_t *values, uint32_t count) +{ + uint32_t i, last, tmp; + + last = 1; + for (i = count; i > 0; i--) { + if (values[i - 1] != 0) { + tmp = values[i - 1]; + values[i - 1] = ec_gf_div(gf, tmp, last); + last = tmp; + } + } +} + +void ec_code_c_linear(void *dst, void *src, uint64_t offset, uint32_t *values, + uint32_t count) +{ + src += offset; + gf8_muladd_00(dst, src); + while (--count > 0) { + src += EC_METHOD_CHUNK_SIZE; + gf8_muladd[*values](dst, src); + values++; + } +} + +void ec_code_c_interleaved(void *dst, void **src, uint64_t offset, + uint32_t *values, uint32_t count) +{ + uint32_t i, last, tmp; + + i = 0; + while ((last = *values++) == 0) { + i++; + } + gf8_muladd_00(dst, src[i++] + offset); + while (i < count) { + tmp = *values++; + if (tmp != 0) { + gf8_muladd[last](dst, src[i] + offset); + last = tmp; + } + i++; + } + gf8_muladd[last](dst, zero); +} diff --git a/xlators/cluster/ec/src/ec-code-c.h b/xlators/cluster/ec/src/ec-code-c.h new file mode 100644 index 00000000000..92e8070e514 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-c.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __EC_CODE_C_H__ +#define __EC_CODE_C_H__ + +#include "ec-types.h" + +void ec_code_c_prepare(ec_gf_t *gf, uint32_t *values, uint32_t count); + +void ec_code_c_linear(void *dst, void *src, uint64_t offset, uint32_t *values, + uint32_t count); + +void ec_code_c_interleaved(void *dst, void **src, uint64_t offset, + uint32_t *values, uint32_t count); + +#endif /* __EC_CODE_C_H__ */ diff --git a/xlators/cluster/ec/src/ec-code-intel.c b/xlators/cluster/ec/src/ec-code-intel.c new file mode 100644 index 00000000000..b9fdcad4421 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-intel.c @@ -0,0 +1,600 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <inttypes.h> +#include <string.h> +#include <errno.h> + +#include "ec-code-intel.h" + +static void +ec_code_intel_init(ec_code_intel_t *intel) +{ + memset(intel, 0, sizeof(ec_code_intel_t)); +} + +static void +ec_code_intel_prefix(ec_code_intel_t *intel, uint8_t prefix) +{ + intel->prefix.data[intel->prefix.bytes++] = prefix; +} + +static void +ec_code_intel_rex(ec_code_intel_t *intel, gf_boolean_t w) +{ + gf_boolean_t present = _gf_false; + + if (w) { + intel->rex.w = 1; + present = _gf_true; + } + if (intel->modrm.present) { + if (intel->modrm.reg > 7) { + intel->modrm.reg &= 7; + intel->rex.r = 1; + present = _gf_true; + } + if (intel->sib.present) { + if (intel->sib.index > 7) { + intel->sib.index &= 7; + intel->rex.x = 1; + present = _gf_true; + } + if (intel->sib.base > 7) { + intel->sib.base &= 7; + intel->rex.b = 1; + present = _gf_true; + } + } else if (intel->modrm.rm > 7) { + intel->modrm.rm &= 7; + intel->rex.b = 1; + present = _gf_true; + } + } else if (intel->reg > 7) { + 
intel->reg &= 7; + intel->rex.b = 1; + present = _gf_true; + } + intel->rex.present = present; +} + +static void +ec_code_intel_vex(ec_code_intel_t *intel, gf_boolean_t w, gf_boolean_t l, + ec_code_vex_opcode_t opcode, ec_code_vex_prefix_t prefix, + uint32_t reg) +{ + ec_code_intel_rex(intel, w); + if (((intel->rex.w == 1) || + (intel->rex.x == 0) || + (intel->rex.b == 0)) || + ((opcode != VEX_OPCODE_NONE) && (opcode != VEX_OPCODE_0F))) { + intel->rex.present = _gf_false; + + intel->vex.bytes = 3; + intel->vex.data[0] = 0xC4; + intel->vex.data[1] = ((intel->rex.r << 7) | (intel->rex.x << 6) | + (intel->rex.b << 5) | opcode) ^ 0xE0; + intel->vex.data[2] = (intel->rex.w << 7) | ((~reg & 0x0F) << 3) | + (l ? 0x04 : 0x00) | prefix; + } else { + intel->vex.bytes = 2; + intel->vex.data[0] = 0xC5; + intel->vex.data[1] = (intel->rex.r << 7) | ((~reg & 0x0F) << 3) | + (l ? 0x04 : 0x00) | prefix; + } +} + +static void +ec_code_intel_modrm_reg(ec_code_intel_t *intel, uint32_t rm, uint32_t reg) +{ + intel->modrm.present = _gf_true; + intel->modrm.mod = 3; + intel->modrm.rm = rm; + intel->modrm.reg = reg; +} + +static void +ec_code_intel_modrm_mem(ec_code_intel_t *intel, uint32_t reg, + ec_code_intel_reg_t base, ec_code_intel_reg_t index, + uint32_t scale, int32_t offset) +{ + if (index == REG_SP) { + intel->invalid = _gf_true; + return; + } + if ((index != REG_NULL) && (scale != 1) && (scale != 2) && (scale != 4) && + (scale != 8)) { + intel->invalid = _gf_true; + return; + } + scale >>= 1; + if (scale == 4) { + scale = 3; + } + + intel->modrm.present = _gf_true; + intel->modrm.reg = reg; + + intel->offset.value = offset; + if ((offset == 0) && (base != REG_BP)) { + intel->modrm.mod = 0; + intel->offset.bytes = 0; + } else if ((offset >= -128) && (offset <= 127)) { + intel->modrm.mod = 1; + intel->offset.bytes = 1; + } else { + intel->modrm.mod = 2; + intel->offset.bytes = 4; + } + + intel->modrm.rm = base; + if ((index != REG_NULL) || (base == REG_SP)) { + intel->modrm.rm = 
4; + intel->sib.present = _gf_true; + intel->sib.index = index; + if (index == REG_NULL) { + intel->sib.index = 4; + } + intel->sib.scale = scale; + intel->sib.base = base; + if (base == REG_NULL) { + intel->sib.base = 5; + intel->modrm.mod = 0; + intel->offset.bytes = 4; + } + } else if (base == REG_NULL) { + intel->modrm.mod = 0; + intel->modrm.rm = 5; + intel->offset.bytes = 4; + } +} + +static void +ec_code_intel_op_1(ec_code_intel_t *intel, uint8_t opcode, uint32_t reg) +{ + intel->reg = reg; + intel->opcode.bytes = 1; + intel->opcode.data[0] = opcode; +} + +static void +ec_code_intel_op_2(ec_code_intel_t *intel, uint8_t opcode1, uint8_t opcode2, + uint32_t reg) +{ + intel->reg = reg; + intel->opcode.bytes = 2; + intel->opcode.data[0] = opcode1; + intel->opcode.data[1] = opcode2; +} + +static void +ec_code_intel_immediate_1(ec_code_intel_t *intel, uint32_t value) +{ + intel->immediate.bytes = 1; + intel->immediate.value = value; +} + +static void +ec_code_intel_immediate_2(ec_code_intel_t *intel, uint32_t value) +{ + intel->immediate.bytes = 2; + intel->immediate.value = value; +} + +static void +ec_code_intel_immediate_4(ec_code_intel_t *intel, uint32_t value) +{ + intel->immediate.bytes = 4; + intel->immediate.value = value; +} + +static void +ec_code_intel_emit(ec_code_builder_t *builder, ec_code_intel_t *intel) +{ + uint8_t insn[15]; + uint32_t i, count; + + if (intel->invalid) { + ec_code_error(builder, EINVAL); + return; + } + + count = 0; + for (i = 0; i < intel->prefix.bytes; i++) { + insn[count++] = intel->prefix.data[i]; + } + for (i = 0; i < intel->vex.bytes; i++) { + insn[count++] = intel->vex.data[i]; + } + if (intel->rex.present) { + insn[count++] = 0x40 | + (intel->rex.w << 3) | + (intel->rex.r << 2) | + (intel->rex.x << 1) | + (intel->rex.b << 0); + } + for (i = 0; i < intel->opcode.bytes; i++) { + insn[count++] = intel->opcode.data[i]; + } + if (intel->modrm.present) { + insn[count++] = (intel->modrm.mod << 6) | + (intel->modrm.reg << 3) | + 
(intel->modrm.rm << 0);
        if (intel->sib.present) {
            /* SIB byte: scale in bits 7-6, index in bits 5-3, base in
             * bits 2-0. */
            insn[count++] = (intel->sib.scale << 6) |
                            (intel->sib.index << 3) |
                            (intel->sib.base << 0);
        }
    }
    /* The offset bytes follow, then the immediate bytes. */
    for (i = 0; i < intel->offset.bytes; i++) {
        insn[count++] = intel->offset.data[i];
    }
    for (i = 0; i < intel->immediate.bytes; i++) {
        insn[count++] = intel->immediate.data[i];
    }

    /* Hand the assembled instruction bytes to the builder. */
    ec_code_emit(builder, insn, count);
}

/* Emits 'push reg': one-byte opcode 0x50 plus the low three bits of the
 * register number. The full register number is also given to
 * ec_code_intel_op_1(); presumably it is used to build the REX prefix needed
 * by REG_8..REG_15 -- TODO confirm in the helper. */
void
ec_code_intel_op_push_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_op_1(&intel, 0x50 | (reg & 7), reg);
    ec_code_intel_rex(&intel, _gf_false);

    ec_code_intel_emit(builder, &intel);
}

/* Emits 'pop reg': one-byte opcode 0x58 plus the low three bits of the
 * register number. Counterpart of ec_code_intel_op_push_r(). */
void
ec_code_intel_op_pop_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_op_1(&intel, 0x58 | (reg & 7), reg);
    ec_code_intel_rex(&intel, _gf_false);

    ec_code_intel_emit(builder, &intel);
}

/* Emits a near return. With size == 0 the plain form (0xC3) is used,
 * otherwise the form that takes a 2-byte immediate (0xC2), which on x86
 * is the number of stack bytes to release. 'size' is not range checked here
 * even though only 2 bytes are encoded. */
void
ec_code_intel_op_ret(ec_code_builder_t *builder, uint32_t size)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    if (size == 0) {
        ec_code_intel_op_1(&intel, 0xC3, 0);
    } else {
        ec_code_intel_immediate_2(&intel, size);
        ec_code_intel_op_1(&intel, 0xC2, 0);
    }
    ec_code_intel_rex(&intel, _gf_false);

    ec_code_intel_emit(builder, &intel);
}

/* Emits 'mov dst, src' between general purpose registers (opcode 0x89,
 * MOV r/m, r: 'dst' goes to the r/m field and 'src' to the reg field).
 * The _gf_true passed to ec_code_intel_rex() presumably selects 64-bit
 * operand size (REX.W) -- TODO confirm in the helper. */
void
ec_code_intel_op_mov_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src,
                         ec_code_intel_reg_t dst)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_modrm_reg(&intel, dst, src);
    ec_code_intel_op_1(&intel, 0x89, 0);
    ec_code_intel_rex(&intel, _gf_true);

    ec_code_intel_emit(builder, &intel);
}

/* Emits a store of general purpose register 'src' to the memory operand
 * described by base/index/scale/offset (opcode 0x89, MOV r/m, r). */
void
ec_code_intel_op_mov_r2m(ec_code_builder_t *builder, ec_code_intel_reg_t src,
                         ec_code_intel_reg_t base, ec_code_intel_reg_t index,
                         uint32_t scale, int32_t offset)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_modrm_mem(&intel, src, base, index, scale, offset);
    ec_code_intel_op_1(&intel, 0x89, 0);
    ec_code_intel_rex(&intel, _gf_true);

    ec_code_intel_emit(builder, &intel);
}

/* Emits a load of the memory operand described by base/index/scale/offset
 * into general purpose register 'dst' (opcode 0x8B, MOV r, r/m). */
void
ec_code_intel_op_mov_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base,
                         ec_code_intel_reg_t index, uint32_t scale,
                         int32_t offset, ec_code_intel_reg_t dst)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset);
    ec_code_intel_op_1(&intel, 0x8B, 0);
    ec_code_intel_rex(&intel, _gf_true);

    ec_code_intel_emit(builder, &intel);
}

/* Emits 'xor dst, src' between general purpose registers (opcode 0x31,
 * XOR r/m, r). */
void
ec_code_intel_op_xor_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src,
                         ec_code_intel_reg_t dst)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_modrm_reg(&intel, dst, src);
    ec_code_intel_op_1(&intel, 0x31, 0);
    ec_code_intel_rex(&intel, _gf_true);

    ec_code_intel_emit(builder, &intel);
}

/* Emits an xor of the memory operand described by base/index/scale/offset
 * into general purpose register 'dst' (opcode 0x33, XOR r, r/m). */
void
ec_code_intel_op_xor_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base,
                         ec_code_intel_reg_t index, uint32_t scale,
                         int32_t offset, ec_code_intel_reg_t dst)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset);
    ec_code_intel_op_1(&intel, 0x33, 0);
    ec_code_intel_rex(&intel, _gf_true);

    ec_code_intel_emit(builder, &intel);
}

/* Emits 'add reg, value' using the shortest of three encodings:
 *   - 0x83 /0 with a 1-byte immediate when 'value' fits in a signed byte,
 *   - 0x05 with a 4-byte immediate when the register is REG_AX,
 *   - 0x81 /0 with a 4-byte immediate otherwise. */
void
ec_code_intel_op_add_i2r(ec_code_builder_t *builder, int32_t value,
                         ec_code_intel_reg_t reg)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    if ((value >= -128) && (value < 128)) {
        ec_code_intel_modrm_reg(&intel, reg, 0);
        ec_code_intel_op_1(&intel, 0x83, 0);
        ec_code_intel_immediate_1(&intel, value);
    } else {
        if (reg == REG_AX) {
            ec_code_intel_op_1(&intel, 0x05, reg);
        } else {
            ec_code_intel_modrm_reg(&intel, reg, 0);
            ec_code_intel_op_1(&intel, 0x81, 0);
        }
        ec_code_intel_immediate_4(&intel, value);
    }
    ec_code_intel_rex(&intel, _gf_true);

    ec_code_intel_emit(builder, &intel);
}

/* Emits 'test reg, value' with a 4-byte immediate: 0xA9 when the register
 * is REG_AX, 0xF7 /0 otherwise. */
void
ec_code_intel_op_test_i2r(ec_code_builder_t *builder, uint32_t value,
                          ec_code_intel_reg_t reg)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    if (reg == REG_AX) {
        ec_code_intel_op_1(&intel, 0xA9, reg);
    } else {
        ec_code_intel_modrm_reg(&intel, reg, 0);
        ec_code_intel_op_1(&intel, 0xF7, 0);
    }
    ec_code_intel_immediate_4(&intel, value);
    ec_code_intel_rex(&intel, _gf_true);

    ec_code_intel_emit(builder, &intel);
}

/* Emits a 'jne' to 'address', a position expressed in the same units as
 * builder->address. The displacement is relative to the end of the jump
 * instruction: 2 bytes for the short form (0x75 + 1-byte displacement) and
 * 6 bytes for the near form (0x0F 0x85 + 4-byte displacement), hence the
 * additional 'rel -= 4' in the second case. */
void
ec_code_intel_op_jne(ec_code_builder_t *builder, uint32_t address)
{
    ec_code_intel_t intel;
    int32_t rel;

    ec_code_intel_init(&intel);

    rel = address - builder->address - 2;
    if ((rel >= -128) && (rel < 128)) {
        ec_code_intel_op_1(&intel, 0x75, 0);
        ec_code_intel_immediate_1(&intel, rel);
    } else {
        rel -= 4;
        ec_code_intel_op_2(&intel, 0x0F, 0x85, 0);
        ec_code_intel_immediate_4(&intel, rel);
    }
    ec_code_intel_rex(&intel, _gf_false);

    ec_code_intel_emit(builder, &intel);
}

/* Emits an SSE register to register move (prefix 0x66, opcode 0x0F 0x6F,
 * i.e. MOVDQA xmm, xmm/m128). Note that for this opcode the destination is
 * the reg field, so 'src' is passed in the r/m position. */
void
ec_code_intel_op_mov_sse2sse(ec_code_builder_t *builder, uint32_t src,
                             uint32_t dst)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_prefix(&intel, 0x66);
    ec_code_intel_modrm_reg(&intel, src, dst);
    ec_code_intel_op_2(&intel, 0x0F, 0x6F, 0);
    ec_code_intel_rex(&intel, _gf_false);

    ec_code_intel_emit(builder, &intel);
}

/* Emits a store of SSE register 'src' to the memory operand described by
 * base/index/scale/offset (prefix 0x66, opcode 0x0F 0x7F, i.e. MOVDQA
 * m128, xmm, which requires a 16-byte aligned address). */
void
ec_code_intel_op_mov_sse2m(ec_code_builder_t *builder, uint32_t src,
                           ec_code_intel_reg_t base, ec_code_intel_reg_t index,
                           uint32_t scale, int32_t offset)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_prefix(&intel, 0x66);
    ec_code_intel_modrm_mem(&intel, src, base, index, scale, offset);
    ec_code_intel_op_2(&intel, 0x0F, 0x7F, 0);
    ec_code_intel_rex(&intel, _gf_false);

    ec_code_intel_emit(builder, &intel);
}

/* Emits a load of the memory operand described by base/index/scale/offset
 * into SSE register 'dst' (prefix 0x66, opcode 0x0F 0x6F, i.e. MOVDQA
 * xmm, m128, which requires a 16-byte aligned address). */
void
ec_code_intel_op_mov_m2sse(ec_code_builder_t *builder,
                           ec_code_intel_reg_t base, ec_code_intel_reg_t index,
                           uint32_t scale, int32_t offset, uint32_t dst)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_prefix(&intel, 0x66);
    ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset);
    ec_code_intel_op_2(&intel, 0x0F, 0x6F, 0);
    ec_code_intel_rex(&intel, _gf_false);

    ec_code_intel_emit(builder, &intel);
}

/* Emits an xor of SSE register 'src' into SSE register 'dst' (prefix 0x66,
 * opcode 0x0F 0xEF, i.e. PXOR xmm, xmm/m128). */
void
ec_code_intel_op_xor_sse2sse(ec_code_builder_t *builder, uint32_t src,
                             uint32_t dst)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_prefix(&intel, 0x66);
    ec_code_intel_modrm_reg(&intel, src, dst);
    ec_code_intel_op_2(&intel, 0x0F, 0xEF, 0);
    ec_code_intel_rex(&intel, _gf_false);

    ec_code_intel_emit(builder, &intel);
}

/* Emits an xor of the memory operand described by base/index/scale/offset
 * into SSE register 'dst' (prefix 0x66, opcode 0x0F 0xEF, PXOR xmm, m128). */
void
ec_code_intel_op_xor_m2sse(ec_code_builder_t *builder,
                           ec_code_intel_reg_t base, ec_code_intel_reg_t index,
                           uint32_t scale, int32_t offset, uint32_t dst)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_prefix(&intel, 0x66);
    ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset);
    ec_code_intel_op_2(&intel, 0x0F, 0xEF, 0);
    ec_code_intel_rex(&intel, _gf_false);

    ec_code_intel_emit(builder, &intel);
}

/* Emits an AVX register to register move: opcode 0x6F in the 0F map with
 * the 66 prefix, both carried by the VEX prefix (VMOVDQA). No additional
 * source register is encoded (VEX_REG_NONE). The two booleans passed to
 * ec_code_intel_vex() are presumably the W bit and the 256-bit length
 * flag -- TODO confirm in the helper. */
void
ec_code_intel_op_mov_avx2avx(ec_code_builder_t *builder, uint32_t src,
                             uint32_t dst)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_modrm_reg(&intel, src, dst);
    ec_code_intel_op_1(&intel, 0x6F, 0);
    ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F,
                      VEX_PREFIX_66, VEX_REG_NONE);

    ec_code_intel_emit(builder, &intel);
}

/* Emits a store of AVX register 'src' to the memory operand described by
 * base/index/scale/offset (VEX encoded opcode 0x7F in the 0F map with the
 * 66 prefix, VMOVDQA). */
void
ec_code_intel_op_mov_avx2m(ec_code_builder_t *builder, uint32_t src,
                           ec_code_intel_reg_t base, ec_code_intel_reg_t index,
                           uint32_t scale, int32_t offset)
{
    ec_code_intel_t intel;

    ec_code_intel_init(&intel);

    ec_code_intel_modrm_mem(&intel, src, base, index, scale, offset);
    ec_code_intel_op_1(&intel, 0x7F, 0);
    ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F,
                      VEX_PREFIX_66, VEX_REG_NONE);

    ec_code_intel_emit(builder, &intel);
}

void
+ec_code_intel_op_mov_m2avx(ec_code_builder_t *builder, + ec_code_intel_reg_t base, ec_code_intel_reg_t index, + uint32_t scale, int32_t offset, uint32_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); + ec_code_intel_op_1(&intel, 0x6F, 0); + ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, + VEX_PREFIX_66, VEX_REG_NONE); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_avx2avx(ec_code_builder_t *builder, uint32_t src, + uint32_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_reg(&intel, src, dst); + ec_code_intel_op_1(&intel, 0xEF, 0); + ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, + VEX_PREFIX_66, dst); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_m2avx(ec_code_builder_t *builder, + ec_code_intel_reg_t base, ec_code_intel_reg_t index, + uint32_t scale, int32_t offset, uint32_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); + ec_code_intel_op_1(&intel, 0xEF, 0); + ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, + VEX_PREFIX_66, dst); + + ec_code_intel_emit(builder, &intel); +} diff --git a/xlators/cluster/ec/src/ec-code-intel.h b/xlators/cluster/ec/src/ec-code-intel.h new file mode 100644 index 00000000000..903d023f962 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-intel.h @@ -0,0 +1,184 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/

#ifndef __EC_CODE_INTEL_H__
#define __EC_CODE_INTEL_H__

#include "ec-code.h"

/* Value passed as the extra VEX register when an instruction has none. */
#define VEX_REG_NONE 0

enum _ec_code_intel_reg;
typedef enum _ec_code_intel_reg ec_code_intel_reg_t;

enum _ec_code_vex_prefix;
typedef enum _ec_code_vex_prefix ec_code_vex_prefix_t;

enum _ec_code_vex_opcode;
typedef enum _ec_code_vex_opcode ec_code_vex_opcode_t;

struct _ec_code_intel_buffer;
typedef struct _ec_code_intel_buffer ec_code_intel_buffer_t;

struct _ec_code_intel_sib;
typedef struct _ec_code_intel_sib ec_code_intel_sib_t;

struct _ec_code_intel_modrm;
typedef struct _ec_code_intel_modrm ec_code_intel_modrm_t;

struct _ec_code_intel_rex;
typedef struct _ec_code_intel_rex ec_code_intel_rex_t;

struct _ec_code_intel;
typedef struct _ec_code_intel ec_code_intel_t;

/* General purpose registers. The values REG_AX (0) to REG_15 (15) follow
 * the hardware register numbering of x86-64, so they can be placed directly
 * into instruction fields. REG_NULL means "no register" (e.g. no index). */
enum _ec_code_intel_reg {
    REG_NULL = -1,
    REG_AX,
    REG_CX,
    REG_DX,
    REG_BX,
    REG_SP,
    REG_BP,
    REG_SI,
    REG_DI,
    REG_8,
    REG_9,
    REG_10,
    REG_11,
    REG_12,
    REG_13,
    REG_14,
    REG_15
};

/* Legacy prefix implied by a VEX prefix. The numeric values (0 to 3) match
 * the encoding of the 'pp' field of the VEX prefix. */
enum _ec_code_vex_prefix {
    VEX_PREFIX_NONE = 0,
    VEX_PREFIX_66,
    VEX_PREFIX_F3,
    VEX_PREFIX_F2
};

/* Opcode map selected by a VEX prefix. The numeric values (1 to 3) match
 * the encoding of the 'mmmmm' field of the VEX prefix. */
enum _ec_code_vex_opcode {
    VEX_OPCODE_NONE = 0,
    VEX_OPCODE_0F,
    VEX_OPCODE_0F_38,
    VEX_OPCODE_0F_3A
};

/* Small byte buffer holding one component of an instruction. 'bytes' is
 * the number of valid entries in 'data'; 'value' overlays the same storage
 * as a 32-bit integer. */
struct _ec_code_intel_buffer {
    uint32_t bytes;
    union {
        uint8_t data[4];
        uint32_t value;
    };
};

/* Fields of the SIB byte. 'present' tells whether the byte is emitted. */
struct _ec_code_intel_sib {
    gf_boolean_t present;
    uint32_t base;
    uint32_t index;
    uint32_t scale;
};

/* Fields of the ModR/M byte. 'present' tells whether the byte is emitted. */
struct _ec_code_intel_modrm {
    gf_boolean_t present;
    uint32_t mod;
    uint32_t rm;
    uint32_t reg;
};

/* Fields of the REX prefix (W, R, X and B bits). 'present' presumably tells
 * whether the prefix is emitted -- TODO confirm in ec-code-intel.c. */
struct _ec_code_intel_rex {
    gf_boolean_t present;
    uint32_t w;
    uint32_t r;
    uint32_t x;
    uint32_t b;
};

/* Description of a single instruction being assembled. The components are
 * filled by the helpers of ec-code-intel.c and finally serialized into the
 * code builder. */
struct _ec_code_intel {
    gf_boolean_t invalid;
    ec_code_intel_buffer_t prefix;
    ec_code_intel_buffer_t opcode;
    ec_code_intel_buffer_t offset;
    ec_code_intel_buffer_t immediate;
    ec_code_intel_buffer_t vex;
    ec_code_intel_rex_t rex;
    ec_code_intel_modrm_t modrm;
    ec_code_intel_sib_t sib;
    uint32_t reg;
};

/* Stack and control flow: push/pop of a general purpose register and near
 * return ('size' == 0 selects the form without immediate). */
void ec_code_intel_op_push_r(ec_code_builder_t *builder,
                             ec_code_intel_reg_t reg);
void ec_code_intel_op_pop_r(ec_code_builder_t *builder,
                            ec_code_intel_reg_t reg);
void ec_code_intel_op_ret(ec_code_builder_t *builder, uint32_t size);

/* General purpose register operations. In the names, 'r' is a register, 'm'
 * a memory operand described by base/index/scale/offset and 'i' an
 * immediate; the source comes first (e.g. r2m stores a register to memory).
 * 'address' of the jne operation is a position in the generated code. */
void ec_code_intel_op_mov_r2r(ec_code_builder_t *builder,
                              ec_code_intel_reg_t src,
                              ec_code_intel_reg_t dst);
void ec_code_intel_op_mov_r2m(ec_code_builder_t *builder,
                              ec_code_intel_reg_t src,
                              ec_code_intel_reg_t base,
                              ec_code_intel_reg_t index, uint32_t scale,
                              int32_t offset);
void ec_code_intel_op_mov_m2r(ec_code_builder_t *builder,
                              ec_code_intel_reg_t base,
                              ec_code_intel_reg_t index, uint32_t scale,
                              int32_t offset, ec_code_intel_reg_t dst);
void ec_code_intel_op_xor_r2r(ec_code_builder_t *builder,
                              ec_code_intel_reg_t src,
                              ec_code_intel_reg_t dst);
void ec_code_intel_op_xor_m2r(ec_code_builder_t *builder,
                              ec_code_intel_reg_t base,
                              ec_code_intel_reg_t index, uint32_t scale,
                              int32_t offset, ec_code_intel_reg_t dst);
void ec_code_intel_op_add_i2r(ec_code_builder_t *builder, int32_t value,
                              ec_code_intel_reg_t reg);
void ec_code_intel_op_test_i2r(ec_code_builder_t *builder, uint32_t value,
                               ec_code_intel_reg_t reg);
void ec_code_intel_op_jne(ec_code_builder_t *builder, uint32_t address);

/* SSE operations. 'src' and 'dst' are SSE register numbers. */
void ec_code_intel_op_mov_sse2sse(ec_code_builder_t *builder, uint32_t src,
                                  uint32_t dst);
void ec_code_intel_op_mov_sse2m(ec_code_builder_t *builder, uint32_t src,
                                ec_code_intel_reg_t base,
                                ec_code_intel_reg_t index, uint32_t scale,
                                int32_t offset);
void ec_code_intel_op_mov_m2sse(ec_code_builder_t *builder,
                                ec_code_intel_reg_t base,
                                ec_code_intel_reg_t index, uint32_t scale,
                                int32_t offset, uint32_t dst);
void ec_code_intel_op_xor_sse2sse(ec_code_builder_t *builder, uint32_t src,
                                  uint32_t dst);
void ec_code_intel_op_xor_m2sse(ec_code_builder_t *builder,
                                ec_code_intel_reg_t base,
                                ec_code_intel_reg_t index, uint32_t scale,
                                int32_t offset, uint32_t dst);

void
ec_code_intel_op_mov_avx2avx(ec_code_builder_t *builder, uint32_t src, + uint32_t dst); +void ec_code_intel_op_mov_avx2m(ec_code_builder_t *builder, uint32_t src, + ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset); +void ec_code_intel_op_mov_m2avx(ec_code_builder_t *builder, + ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, uint32_t dst); +void ec_code_intel_op_xor_avx2avx(ec_code_builder_t *builder, uint32_t src, + uint32_t dst); +void ec_code_intel_op_xor_m2avx(ec_code_builder_t *builder, + ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, uint32_t dst); + +#endif /* __EC_CODE_INTEL_H__ */ diff --git a/xlators/cluster/ec/src/ec-code-sse.c b/xlators/cluster/ec/src/ec-code-sse.c new file mode 100644 index 00000000000..6f2c6fa593f --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-sse.c @@ -0,0 +1,108 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <errno.h> + +#include "ec-code-intel.h" + +static void +ec_code_sse_prolog(ec_code_builder_t *builder) +{ + builder->loop = builder->address; +} + +static void +ec_code_sse_epilog(ec_code_builder_t *builder) +{ + ec_code_intel_op_add_i2r(builder, 16, REG_DX); + ec_code_intel_op_add_i2r(builder, 16, REG_DI); + ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX); + ec_code_intel_op_jne(builder, builder->loop); + + ec_code_intel_op_ret(builder, 0); +} + +static void +ec_code_sse_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, + uint32_t bit) +{ + if (builder->linear) { + ec_code_intel_op_mov_m2sse(builder, REG_SI, REG_DX, 1, + idx * builder->width * builder->bits + + bit * builder->width, + dst); + } else { + if (builder->base != idx) { + ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, + REG_AX); + builder->base = idx; + } + ec_code_intel_op_mov_m2sse(builder, REG_AX, REG_DX, 1, + bit * builder->width, dst); + } +} + +static void +ec_code_sse_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit) +{ + ec_code_intel_op_mov_sse2m(builder, src, REG_DI, REG_NULL, 0, + bit * builder->width); +} + +static void +ec_code_sse_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + ec_code_intel_op_mov_sse2sse(builder, src, dst); +} + +static void +ec_code_sse_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + ec_code_intel_op_xor_sse2sse(builder, src, dst); +} + +static void +ec_code_sse_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, + uint32_t bit) +{ + if (builder->linear) { + ec_code_intel_op_xor_m2sse(builder, REG_SI, REG_DX, 1, + idx * builder->width * builder->bits + + bit * builder->width, + dst); + } else { + if (builder->base != idx) { + ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, + REG_AX); + builder->base = idx; + } + ec_code_intel_op_xor_m2sse(builder, REG_AX, REG_DX, 1, + bit * builder->width, dst); + } +} + +static char 
*ec_code_sse_needed_flags[] = { + "sse2", + NULL +}; + +ec_code_gen_t ec_code_gen_sse = { + .name = "sse", + .flags = ec_code_sse_needed_flags, + .width = 16, + .prolog = ec_code_sse_prolog, + .epilog = ec_code_sse_epilog, + .load = ec_code_sse_load, + .store = ec_code_sse_store, + .copy = ec_code_sse_copy, + .xor2 = ec_code_sse_xor2, + .xor3 = NULL, + .xorm = ec_code_sse_xorm +}; diff --git a/xlators/cluster/ec/src/ec-code-sse.h b/xlators/cluster/ec/src/ec-code-sse.h new file mode 100644 index 00000000000..f1acbcf894b --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-sse.h @@ -0,0 +1,18 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_CODE_SSE_H__ +#define __EC_CODE_SSE_H__ + +#include "ec-code.h" + +extern ec_code_gen_t ec_code_gen_sse; + +#endif /* __EC_CODE_SSE_H__ */ diff --git a/xlators/cluster/ec/src/ec-code-x64.c b/xlators/cluster/ec/src/ec-code-x64.c new file mode 100644 index 00000000000..e94ddd4b155 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-x64.c @@ -0,0 +1,150 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <errno.h> + +#include "ec-code-intel.h" + +static ec_code_intel_reg_t ec_code_x64_regmap[] = { + REG_AX, REG_CX, REG_BP, REG_8, REG_9, REG_10, + REG_11, REG_12, REG_13, REG_14, REG_15 +}; + +static void +ec_code_x64_prolog(ec_code_builder_t *builder) +{ + uint32_t i; + + ec_code_intel_op_push_r(builder, REG_BP); + if (!builder->linear) { + ec_code_intel_op_push_r(builder, REG_BX); + } + if (builder->regs > 11) { + ec_code_error(builder, EINVAL); + return; + } + for (i = 7; i < builder->regs; i++) { + ec_code_intel_op_push_r(builder, ec_code_x64_regmap[i]); + } + + builder->loop = builder->address; +} + +static void +ec_code_x64_epilog(ec_code_builder_t *builder) +{ + uint32_t i; + + ec_code_intel_op_add_i2r(builder, 8, REG_DX); + ec_code_intel_op_add_i2r(builder, 8, REG_DI); + ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX); + ec_code_intel_op_jne(builder, builder->loop); + + if (builder->regs > 11) { + ec_code_error(builder, EINVAL); + } + for (i = builder->regs; i > 7; i--) { + ec_code_intel_op_pop_r(builder, ec_code_x64_regmap[i - 1]); + } + if (!builder->linear) { + ec_code_intel_op_pop_r(builder, REG_BX); + } + ec_code_intel_op_pop_r(builder, REG_BP); + ec_code_intel_op_ret(builder, 0); +} + +static void +ec_code_x64_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, + uint32_t bit) +{ + dst = ec_code_x64_regmap[dst]; + + if (builder->linear) { + ec_code_intel_op_mov_m2r(builder, REG_SI, REG_DX, 1, + idx * builder->width * builder->bits + + bit * builder->width, + dst); + } else { + if (builder->base != idx) { + ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, + REG_BX); + builder->base = idx; + } + ec_code_intel_op_mov_m2r(builder, REG_BX, REG_DX, 1, + bit * builder->width, dst); + } +} + +static void +ec_code_x64_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit) +{ + src = ec_code_x64_regmap[src]; + + ec_code_intel_op_mov_r2m(builder, src, REG_DI, REG_NULL, 0, + bit * builder->width); +} + 
+static void +ec_code_x64_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + dst = ec_code_x64_regmap[dst]; + src = ec_code_x64_regmap[src]; + + ec_code_intel_op_mov_r2r(builder, src, dst); +} + +static void +ec_code_x64_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + dst = ec_code_x64_regmap[dst]; + src = ec_code_x64_regmap[src]; + + ec_code_intel_op_xor_r2r(builder, src, dst); +} + +static void +ec_code_x64_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, + uint32_t bit) +{ + dst = ec_code_x64_regmap[dst]; + + if (builder->linear) { + ec_code_intel_op_xor_m2r(builder, REG_SI, REG_DX, 1, + idx * builder->width * builder->bits + + bit * builder->width, + dst); + } else { + if (builder->base != idx) { + ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, + REG_BX); + builder->base = idx; + } + ec_code_intel_op_xor_m2r(builder, REG_BX, REG_DX, 1, + bit * builder->width, dst); + } +} + +static char *ec_code_x64_needed_flags[] = { + NULL +}; + +ec_code_gen_t ec_code_gen_x64 = { + .name = "x64", + .flags = ec_code_x64_needed_flags, + .width = sizeof(uint64_t), + .prolog = ec_code_x64_prolog, + .epilog = ec_code_x64_epilog, + .load = ec_code_x64_load, + .store = ec_code_x64_store, + .copy = ec_code_x64_copy, + .xor2 = ec_code_x64_xor2, + .xor3 = NULL, + .xorm = ec_code_x64_xorm +}; diff --git a/xlators/cluster/ec/src/ec-code-x64.h b/xlators/cluster/ec/src/ec-code-x64.h new file mode 100644 index 00000000000..bd8174e4bf5 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-x64.h @@ -0,0 +1,18 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __EC_CODE_X64_H__ +#define __EC_CODE_X64_H__ + +#include "ec-code.h" + +extern ec_code_gen_t ec_code_gen_x64; + +#endif /* __EC_CODE_X64_H__ */ diff --git a/xlators/cluster/ec/src/ec-code.c b/xlators/cluster/ec/src/ec-code.c new file mode 100644 index 00000000000..a1f652779f3 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code.c @@ -0,0 +1,904 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <string.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <ctype.h> + +#include "syscall.h" + +#include "ec-mem-types.h" +#include "ec-code.h" +#include "ec-messages.h" +#include "ec-code-c.h" + +#ifdef USE_EC_DYNAMIC_X64 +#include "ec-code-x64.h" +#endif + +#ifdef USE_EC_DYNAMIC_SSE +#include "ec-code-sse.h" +#endif + +#ifdef USE_EC_DYNAMIC_AVX +#include "ec-code-avx.h" +#endif + +#define EC_PROC_BUFFER_SIZE 4096 + +#define PROC_CPUINFO "/proc/cpuinfo" + +struct _ec_code_proc; +typedef struct _ec_code_proc ec_code_proc_t; + +struct _ec_code_proc { + int32_t fd; + gf_boolean_t eof; + gf_boolean_t error; + gf_boolean_t skip; + ssize_t size; + ssize_t pos; + char buffer[EC_PROC_BUFFER_SIZE]; +}; + +static ec_code_gen_t *ec_code_gen_table[] = { +#ifdef USE_EC_DYNAMIC_AVX + &ec_code_gen_avx, +#endif +#ifdef USE_EC_DYNAMIC_SSE + &ec_code_gen_sse, +#endif +#ifdef USE_EC_DYNAMIC_X64 + &ec_code_gen_x64, +#endif + NULL +}; + +static void +ec_code_arg_set(ec_code_arg_t *arg, uint32_t value) +{ + arg->value = value; +} + +static void +ec_code_arg_assign(ec_code_builder_t *builder, ec_code_op_t *op, + ec_code_arg_t *arg, uint32_t reg) +{ + arg->value = reg; + + if (builder->regs <= reg) { + builder->regs = reg + 1; + } + 
+} + +static void +ec_code_arg_use(ec_code_builder_t *builder, ec_code_op_t *op, + ec_code_arg_t *arg, uint32_t reg) +{ + arg->value = reg; +} + +static void +ec_code_arg_update(ec_code_builder_t *builder, ec_code_op_t *op, + ec_code_arg_t *arg, uint32_t reg) +{ + arg->value = reg; +} + +static ec_code_op_t * +ec_code_op_next(ec_code_builder_t *builder) +{ + ec_code_op_t *op; + + op = &builder->ops[builder->count++]; + memset(op, 0, sizeof(ec_code_op_t)); + + return op; +} + +static void +ec_code_load(ec_code_builder_t *builder, uint32_t bit, uint32_t offset) +{ + ec_code_op_t *op; + + op = ec_code_op_next(builder); + + op->op = EC_GF_OP_LOAD; + ec_code_arg_assign(builder, op, &op->arg1, builder->map[bit]); + ec_code_arg_set(&op->arg2, offset); + ec_code_arg_set(&op->arg3, bit); +} + +static void +ec_code_store(ec_code_builder_t *builder, uint32_t reg, uint32_t bit) +{ + ec_code_op_t *op; + + op = ec_code_op_next(builder); + + op->op = EC_GF_OP_STORE; + ec_code_arg_use(builder, op, &op->arg1, builder->map[reg]); + ec_code_arg_set(&op->arg2, 0); + ec_code_arg_set(&op->arg3, bit); +} + +static void +ec_code_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + ec_code_op_t *op; + + op = ec_code_op_next(builder); + + op->op = EC_GF_OP_COPY; + ec_code_arg_assign(builder, op, &op->arg1, builder->map[dst]); + ec_code_arg_use(builder, op, &op->arg2, builder->map[src]); + ec_code_arg_set(&op->arg3, 0); +} + +static void +ec_code_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + ec_code_op_t *op; + + op = ec_code_op_next(builder); + + op->op = EC_GF_OP_XOR2; + ec_code_arg_update(builder, op, &op->arg1, builder->map[dst]); + ec_code_arg_use(builder, op, &op->arg2, builder->map[src]); + ec_code_arg_set(&op->arg3, 0); +} + +static void +ec_code_xor3(ec_code_builder_t *builder, uint32_t dst, uint32_t src1, + uint32_t src2) +{ + ec_code_op_t *op; + + if (builder->code->gen->xor3 == NULL) { + ec_code_copy(builder, dst, src1); + ec_code_xor2(builder, 
dst, src2); + + return; + } + + op = ec_code_op_next(builder); + + op->op = EC_GF_OP_XOR3; + ec_code_arg_assign(builder, op, &op->arg1, builder->map[dst]); + ec_code_arg_use(builder, op, &op->arg2, builder->map[src1]); + ec_code_arg_use(builder, op, &op->arg3, builder->map[src2]); +} + +static void +ec_code_xorm(ec_code_builder_t *builder, uint32_t bit, uint32_t offset) +{ + ec_code_op_t *op; + + op = ec_code_op_next(builder); + + op->op = EC_GF_OP_XORM; + ec_code_arg_update(builder, op, &op->arg1, builder->map[bit]); + ec_code_arg_set(&op->arg2, offset); + ec_code_arg_set(&op->arg3, bit); +} + +static void +ec_code_dup(ec_code_builder_t *builder, ec_gf_op_t *op) +{ + switch (op->op) { + case EC_GF_OP_COPY: + ec_code_copy(builder, op->arg1, op->arg2); + break; + case EC_GF_OP_XOR2: + ec_code_xor2(builder, op->arg1, op->arg2); + break; + case EC_GF_OP_XOR3: + ec_code_xor3(builder, op->arg1, op->arg2, op->arg3); + break; + default: + break; + } +} + +static void +ec_code_gf_load(ec_code_builder_t *builder, uint32_t offset) +{ + uint32_t i; + + for (i = 0; i < builder->code->gf->bits; i++) { + ec_code_load(builder, i, offset); + } +} + +static void +ec_code_gf_load_xor(ec_code_builder_t *builder, uint32_t offset) +{ + uint32_t i; + + for (i = 0; i < builder->code->gf->bits; i++) { + ec_code_xorm(builder, i, offset); + } +} + +static void +ec_code_gf_store(ec_code_builder_t *builder) +{ + uint32_t i; + + for (i = 0; i < builder->code->gf->bits; i++) { + ec_code_store(builder, i, i); + } +} + +static void +ec_code_gf_clear(ec_code_builder_t *builder) +{ + uint32_t i; + + ec_code_xor2(builder, 0, 0); + for (i = 0; i < builder->code->gf->bits; i++) { + ec_code_store(builder, 0, i); + } +} + +static void +ec_code_gf_mul(ec_code_builder_t *builder, uint32_t value) +{ + ec_gf_mul_t *mul; + ec_gf_op_t *op; + uint32_t map[EC_GF_MAX_REGS]; + int32_t i; + + mul = builder->code->gf->table[value]; + for (op = mul->ops; op->op != EC_GF_OP_END; op++) { + ec_code_dup(builder, op); + 
} + + for (i = 0; i < mul->regs; i++) { + map[i] = builder->map[mul->map[i]]; + } + memcpy(builder->map, map, sizeof(uint32_t) * mul->regs); +} + +static ec_code_builder_t * +ec_code_prepare(ec_code_t *code, uint32_t count, uint32_t width, + gf_boolean_t linear) +{ + ec_code_builder_t *builder; + uint32_t i; + + count *= code->gf->bits + code->gf->max_ops; + count += code->gf->bits; + builder = GF_MALLOC(sizeof(ec_code_builder_t) + + sizeof(ec_code_op_t) * count, ec_mt_ec_code_builder_t); + if (builder == NULL) { + return NULL; + } + + builder->address = 0; + builder->code = code; + builder->size = 0; + builder->count = 0; + builder->regs = 0; + builder->error = 0; + builder->bits = code->gf->bits; + builder->width = width; + builder->data = NULL; + builder->linear = linear; + builder->base = -1; + + for (i = 0; i < EC_GF_MAX_REGS; i++) { + builder->map[i] = i; + } + + return builder; +} + +static size_t +ec_code_space_size(void) +{ + return (sizeof(ec_code_space_t) + 15) & ~15; +} + +static size_t +ec_code_chunk_size(void) +{ + return (sizeof(ec_code_chunk_t) + 15) & ~15; +} + +static ec_code_chunk_t * +ec_code_chunk_from_space(ec_code_space_t *space) +{ + return (ec_code_chunk_t *)((uintptr_t)space + ec_code_space_size()); +} + +static void * +ec_code_func_from_chunk(ec_code_chunk_t *chunk) +{ + return (void *)((uintptr_t)chunk + ec_code_chunk_size()); +} + +static ec_code_chunk_t * +ec_code_chunk_from_func(ec_code_func_linear_t func) +{ + return (ec_code_chunk_t *)((uintptr_t)func - ec_code_chunk_size()); +} + +static ec_code_chunk_t * +ec_code_chunk_split(ec_code_chunk_t *chunk, size_t size) +{ + ec_code_chunk_t *extra; + ssize_t avail; + + avail = chunk->size - size - ec_code_chunk_size(); + if (avail > 0) { + extra = (ec_code_chunk_t *)((uintptr_t)chunk + chunk->size - avail); + extra->size = avail; + list_add(&extra->list, &chunk->list); + chunk->size = size; + } + list_del_init(&chunk->list); + + return chunk; +} + +static gf_boolean_t 
+ec_code_chunk_touch(ec_code_chunk_t *prev, ec_code_chunk_t *next) +{ + uintptr_t end; + + end = (uintptr_t)prev + ec_code_chunk_size() + prev->size; + return (end == (uintptr_t)next); +} + +static void +ec_code_chunk_merge(ec_code_chunk_t *chunk) +{ + ec_code_chunk_t *item; + + list_for_each_entry(item, &chunk->space->chunks, list) { + if (ec_code_chunk_touch(item, chunk)) { + item->size += chunk->size + ec_code_chunk_size(); + chunk = item; + + goto check; + } + if ((uintptr_t)item > (uintptr_t)chunk) { + list_add_tail(&chunk->list, &item->list); + if (ec_code_chunk_touch(chunk, item)) { + chunk->size += item->size + ec_code_chunk_size(); + list_del_init(&item->list); + } + + goto check; + } + } + list_add_tail(&chunk->list, &chunk->space->chunks); + +check: + if (chunk->size == EC_CODE_SIZE - ec_code_space_size() - + ec_code_chunk_size()) { + list_del_init(&chunk->space->list); + + munmap(chunk->space, chunk->space->size); + } +} + +static ec_code_chunk_t * +ec_code_space_alloc(ec_code_t *code, size_t size) +{ + ec_code_space_t *space; + ec_code_chunk_t *chunk; + size_t map_size; + + size = (size + 15) & ~15; + list_for_each_entry(space, &code->spaces, list) { + list_for_each_entry(chunk, &space->chunks, list) { + if (chunk->size >= size) { + goto out; + } + } + } + + map_size = EC_CODE_SIZE; + if (map_size < size) { + map_size = size; + } + space = mmap(NULL, map_size, PROT_EXEC | PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (space == NULL) { + return NULL; + } + /* It's not important to check the return value of mlock(). If it fails + * everything will continue to work normally. 
*/ + mlock(space, map_size); + + space->code = code; + space->size = map_size; + list_add_tail(&space->list, &code->spaces); + INIT_LIST_HEAD(&space->chunks); + + chunk = ec_code_chunk_from_space(space); + chunk->size = EC_CODE_SIZE - ec_code_space_size() - ec_code_chunk_size(); + list_add(&chunk->list, &space->chunks); + +out: + chunk->space = space; + + return ec_code_chunk_split(chunk, size); +} + +static ec_code_chunk_t * +ec_code_alloc(ec_code_t *code, uint32_t size) +{ + ec_code_chunk_t *chunk; + + LOCK(&code->lock); + + chunk = ec_code_space_alloc(code, size); + + UNLOCK(&code->lock); + + return chunk; +} + +static void +ec_code_free(ec_code_chunk_t *chunk) +{ + gf_lock_t *lock; + + lock = &chunk->space->code->lock; + LOCK(lock); + + ec_code_chunk_merge(chunk); + + UNLOCK(lock); +} + +static gf_boolean_t +ec_code_write(ec_code_builder_t *builder) +{ + ec_code_gen_t *gen; + ec_code_op_t *op; + uint32_t i; + + builder->error = 0; + builder->size = 0; + builder->address = 0; + builder->base = -1; + + gen = builder->code->gen; + gen->prolog(builder); + for (i = 0; i < builder->count; i++) { + op = &builder->ops[i]; + switch (op->op) { + case EC_GF_OP_LOAD: + gen->load(builder, op->arg1.value, op->arg2.value, op->arg3.value); + break; + case EC_GF_OP_STORE: + gen->store(builder, op->arg1.value, op->arg3.value); + break; + case EC_GF_OP_COPY: + gen->copy(builder, op->arg1.value, op->arg2.value); + break; + case EC_GF_OP_XOR2: + gen->xor2(builder, op->arg1.value, op->arg2.value); + break; + case EC_GF_OP_XOR3: + gen->xor3(builder, op->arg1.value, op->arg2.value, op->arg3.value); + break; + case EC_GF_OP_XORM: + gen->xorm(builder, op->arg1.value, op->arg2.value, op->arg3.value); + break; + default: + break; + } + } + gen->epilog(builder); + + return builder->error == 0; +} + +static void * +ec_code_compile(ec_code_builder_t *builder) +{ + ec_code_chunk_t *chunk; + void *func; + + if (!ec_code_write(builder)) { + return NULL; + } + + chunk = 
ec_code_alloc(builder->code, builder->size); + if (chunk == NULL) { + return NULL; + } + func = ec_code_func_from_chunk(chunk); + builder->data = (uint8_t *)func; + + if (!ec_code_write(builder)) { + ec_code_free(chunk); + + return NULL; + } + + GF_FREE(builder); + + return func; +} + +ec_code_t * +ec_code_create(ec_gf_t *gf, ec_code_gen_t *gen) +{ + ec_code_t *code; + + code = GF_MALLOC(sizeof(ec_code_t), ec_mt_ec_code_t); + if (code == NULL) { + return NULL; + } + memset(code, 0, sizeof(ec_code_t)); + INIT_LIST_HEAD(&code->spaces); + LOCK_INIT(&code->lock); + + code->gf = gf; + code->gen = gen; + if (gen == NULL) { + code->width = sizeof(uint64_t); + } else { + code->width = gen->width; + } + + return code; +} + +void +ec_code_destroy(ec_code_t *code) +{ + if (!list_empty(&code->spaces)) { + } + + LOCK_DESTROY(&code->lock); + + GF_FREE(code); +} + +static uint32_t +ec_code_value_next(uint32_t *values, uint32_t count, uint32_t *offset) +{ + uint32_t i, next; + + next = 0; + for (i = *offset + 1; i < count; i++) { + next = values[i]; + if (next != 0) { + break; + } + } + *offset = i; + + return next; +} + +void * +ec_code_build(ec_code_t *code, uint32_t width, uint32_t *values, + uint32_t count, gf_boolean_t linear) +{ + ec_code_builder_t *builder; + uint32_t offset, val, next; + + if (code->gen == NULL) { + ec_code_c_prepare(code->gf, values, count); + if (linear) { + return ec_code_c_linear; + } else { + return ec_code_c_interleaved; + } + } + + builder = ec_code_prepare(code, count, width, linear); + if (builder == NULL) { + return NULL; + } + + offset = -1; + next = ec_code_value_next(values, count, &offset); + if (next != 0) { + ec_code_gf_load(builder, offset); + do { + val = next; + next = ec_code_value_next(values, count, &offset); + if (next != 0) { + ec_code_gf_mul(builder, ec_gf_div(code->gf, val, next)); + ec_code_gf_load_xor(builder, offset); + } + } while (next != 0); + ec_code_gf_mul(builder, val); + ec_code_gf_store(builder); + } else { + 
ec_code_gf_clear(builder); + } + + return ec_code_compile(builder); +} + +ec_code_func_linear_t +ec_code_build_linear(ec_code_t *code, uint32_t width, uint32_t *values, + uint32_t count) +{ + return (ec_code_func_linear_t)ec_code_build(code, width, values, count, + _gf_true); +} + +ec_code_func_interleaved_t +ec_code_build_interleaved(ec_code_t *code, uint32_t width, uint32_t *values, + uint32_t count) +{ + return (ec_code_func_interleaved_t)ec_code_build(code, width, values, + count, _gf_false); +} + +void +ec_code_release(ec_code_t *code, ec_code_func_t *func) +{ + if (code->gen != NULL) { + ec_code_free(ec_code_chunk_from_func(func->linear)); + } +} + +void +ec_code_error(ec_code_builder_t *builder, int32_t error) +{ + if (builder->error == 0) { + builder->error = error; + } +} + +void +ec_code_emit(ec_code_builder_t *builder, uint8_t *bytes, uint32_t count) +{ + if (builder->error != 0) { + return; + } + + if (builder->data != NULL) { + memcpy(builder->data + builder->size, bytes, count); + } + + builder->size += count; + builder->address += count; +} + +static char * +ec_code_proc_trim_left(char *text, ssize_t *length) +{ + ssize_t len; + + for (len = *length; (len > 0) && isspace(*text); len--) { + text++; + } + *length = len; + + return text; +} + +static char * +ec_code_proc_trim_right(char *text, ssize_t *length, char sep) +{ + char *last; + ssize_t len; + + len = *length; + + last = text; + for (len = *length; (len > 0) && (*text != sep); len--) { + if (!isspace(*text)) { + last = text + 1; + } + text++; + } + *last = 0; + *length = len; + + return text; +} + +static char * +ec_code_proc_line_parse(ec_code_proc_t *file, ssize_t *length) +{ + char *text, *end; + ssize_t len; + + len = file->size - file->pos; + text = ec_code_proc_trim_left(file->buffer + file->pos, &len); + end = ec_code_proc_trim_right(text, &len, '\n'); + if (len == 0) { + if (!file->eof) { + if (text == file->buffer) { + file->size = file->pos = 0; + file->skip = _gf_true; + } else { + 
file->size = file->pos = end - text; + memmove(file->buffer, text, file->pos + 1); + } + len = sys_read(file->fd, file->buffer + file->pos, + sizeof(file->buffer) - file->pos - 1); + if (len > 0) { + file->size += len; + } + file->error = len < 0; + file->eof = len <= 0; + + return NULL; + } + file->size = file->pos = 0; + } else { + file->pos = end - file->buffer + 1; + } + + *length = end - text; + + if (file->skip) { + file->skip = _gf_false; + text = NULL; + } + + return text; +} + +static char * +ec_code_proc_line(ec_code_proc_t *file, ssize_t *length) +{ + char *text; + + text = NULL; + while (!file->eof) { + text = ec_code_proc_line_parse(file, length); + if (text != NULL) { + break; + } + } + + return text; +} + +static char * +ec_code_proc_split(char *text, ssize_t *length, char sep) +{ + text = ec_code_proc_trim_right(text, length, sep); + if (*length == 0) { + return NULL; + } + (*length)--; + text++; + + return ec_code_proc_trim_left(text, length); +} + +static uint32_t +ec_code_cpu_check(uint32_t idx, char *list, uint32_t count) +{ + ec_code_gen_t *gen; + char **ptr; + char *table[count]; + uint32_t i; + + for (i = 0; i < count; i++) { + table[i] = list; + list += strlen(list) + 1; + } + + gen = ec_code_gen_table[idx]; + while (gen != NULL) { + for (ptr = gen->flags; *ptr != NULL; ptr++) { + for (i = 0; i < count; i++) { + if (strcmp(*ptr, table[i]) == 0) { + break; + } + } + if (i >= count) { + gen = ec_code_gen_table[++idx]; + break; + } + } + if (*ptr == NULL) { + break; + } + } + + return idx; +} + +ec_code_gen_t * +ec_code_detect(xlator_t *xl, const char *def) +{ + ec_code_proc_t file; + ec_code_gen_t *gen = NULL; + char *line, *data, *list; + ssize_t length; + uint32_t count, base, select; + + if (strcmp(def, "none") == 0) { + gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION_NONE, + "Not using any cpu extensions"); + + return NULL; + } + + file.fd = sys_openat(AT_FDCWD, PROC_CPUINFO, O_RDONLY); + if (file.fd < 0) { + goto out; + } + file.size = 
file.pos = 0; + file.eof = file.error = file.skip = _gf_false; + + select = 0; + if (strcmp(def, "auto") != 0) { + while (ec_code_gen_table[select] != NULL) { + if (strcmp(ec_code_gen_table[select]->name, def) == 0) { + break; + } + select++; + } + if (ec_code_gen_table[select] == NULL) { + gf_msg(xl->name, GF_LOG_WARNING, EINVAL, EC_MSG_EXTENSION_UNKNOWN, + "CPU extension '%s' is not known. Not using any cpu " + "extensions", def); + + return NULL; + } + } else { + def = NULL; + } + + while ((line = ec_code_proc_line(&file, &length)) != NULL) { + data = ec_code_proc_split(line, &length, ':'); + if ((data != NULL) && (strcmp(line, "flags") == 0)) { + list = data; + count = 0; + while ((data != NULL) && (*data != 0)) { + count++; + data = ec_code_proc_split(data, &length, ' '); + } + base = select; + select = ec_code_cpu_check(select, list, count); + if ((base != select) && (def != NULL)) { + gf_msg(xl->name, GF_LOG_WARNING, ENOTSUP, + EC_MSG_EXTENSION_UNSUPPORTED, + "CPU extension '%s' is not supported", def); + def = NULL; + } + } + } + + if (file.error) { + gf_msg(xl->name, GF_LOG_WARNING, 0, EC_MSG_EXTENSION_FAILED, + "Unable to detemine supported CPU extensions. Not using any " + "cpu extensions"); + + gen = NULL; + } else { + gen = ec_code_gen_table[select]; + if (gen == NULL) { + gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION_NONE, + "Not using any cpu extensions"); + } else { + gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION, + "Using '%s' CPU extensions", gen->name); + } + } + + sys_close(file.fd); + +out: + return gen; +} diff --git a/xlators/cluster/ec/src/ec-code.h b/xlators/cluster/ec/src/ec-code.h new file mode 100644 index 00000000000..355209c3944 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code.h @@ -0,0 +1,44 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_CODE_H__ +#define __EC_CODE_H__ + +#include "xlator.h" +#include "list.h" + +#include "ec-types.h" +#include "ec-galois.h" + +ec_code_gen_t * +ec_code_detect(xlator_t *xl, const char *def); + +ec_code_t * +ec_code_create(ec_gf_t *gf, ec_code_gen_t *gen); + +void +ec_code_destroy(ec_code_t *code); + +ec_code_func_linear_t +ec_code_build_linear(ec_code_t *code, uint32_t width, uint32_t *values, + uint32_t count); +ec_code_func_interleaved_t +ec_code_build_interleaved(ec_code_t *code, uint32_t width, uint32_t *values, + uint32_t count); +void +ec_code_release(ec_code_t *code, ec_code_func_t *func); + +void +ec_code_error(ec_code_builder_t *builder, int32_t error); + +void +ec_code_emit(ec_code_builder_t *builder, uint8_t *bytes, uint32_t count); + +#endif /* __EC_CODE_H__ */ diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c index 6e4b975c248..f949dbd0c9f 100644 --- a/xlators/cluster/ec/src/ec-combine.c +++ b/xlators/cluster/ec/src/ec-combine.c @@ -13,7 +13,7 @@ #include "libxlator.h" #include "byte-order.h" -#include "ec-data.h" +#include "ec-types.h" #include "ec-helpers.h" #include "ec-common.h" #include "ec-combine.h" diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 2e6759a2803..fd6bdf7bb9d 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -12,7 +12,7 @@ #include "hashfn.h" #include "ec-mem-types.h" -#include "ec-data.h" +#include "ec-types.h" #include "ec-helpers.h" #include "ec-combine.h" #include "ec-common.h" diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index 4a2a11f4ccd..965bc1e25ef 100644 --- a/xlators/cluster/ec/src/ec-data.h 
+++ b/xlators/cluster/ec/src/ec-data.h @@ -11,312 +11,7 @@ #ifndef __EC_DATA_H__ #define __EC_DATA_H__ -#include "xlator.h" - -#include "ec.h" - -struct _ec_config; -typedef struct _ec_config ec_config_t; - -struct _ec_fd; -typedef struct _ec_fd ec_fd_t; - -struct _ec_inode; -typedef struct _ec_inode ec_inode_t; - -union _ec_cbk; -typedef union _ec_cbk ec_cbk_t; - -struct _ec_lock; -typedef struct _ec_lock ec_lock_t; - -struct _ec_lock_link; -typedef struct _ec_lock_link ec_lock_link_t; - -struct _ec_fop_data; -typedef struct _ec_fop_data ec_fop_data_t; - -struct _ec_cbk_data; -typedef struct _ec_cbk_data ec_cbk_data_t; - -struct _ec_heal; -typedef struct _ec_heal ec_heal_t; - -typedef void (* ec_wind_f)(ec_t *, ec_fop_data_t *, int32_t); -typedef int32_t (* ec_handler_f)(ec_fop_data_t *, int32_t); -typedef void (* ec_resume_f)(ec_fop_data_t *, int32_t); - -struct _ec_config -{ - uint32_t version; - uint8_t algorithm; - uint8_t gf_word_size; - uint8_t bricks; - uint8_t redundancy; - uint32_t chunk_size; -}; - -struct _ec_fd -{ - loc_t loc; - uintptr_t open; - int32_t flags; -}; - -struct _ec_inode -{ - ec_lock_t *inode_lock; - gf_boolean_t have_info; - gf_boolean_t have_config; - gf_boolean_t have_version; - gf_boolean_t have_size; - ec_config_t config; - uint64_t pre_version[2]; - uint64_t post_version[2]; - uint64_t pre_size; - uint64_t post_size; - uint64_t dirty[2]; - struct list_head heal; -}; - -typedef int32_t (* fop_heal_cbk_t)(call_frame_t *, void * cookie, xlator_t *, - int32_t, int32_t, uintptr_t, uintptr_t, - uintptr_t, dict_t *); -typedef int32_t (* fop_fheal_cbk_t)(call_frame_t *, void * cookie, xlator_t *, - int32_t, int32_t, uintptr_t, uintptr_t, - uintptr_t, dict_t *); - -union _ec_cbk -{ - fop_access_cbk_t access; - fop_create_cbk_t create; - fop_discard_cbk_t discard; - fop_entrylk_cbk_t entrylk; - fop_fentrylk_cbk_t fentrylk; - fop_fallocate_cbk_t fallocate; - fop_flush_cbk_t flush; - fop_fsync_cbk_t fsync; - fop_fsyncdir_cbk_t fsyncdir; - 
fop_getxattr_cbk_t getxattr; - fop_fgetxattr_cbk_t fgetxattr; - fop_heal_cbk_t heal; - fop_fheal_cbk_t fheal; - fop_inodelk_cbk_t inodelk; - fop_finodelk_cbk_t finodelk; - fop_link_cbk_t link; - fop_lk_cbk_t lk; - fop_lookup_cbk_t lookup; - fop_mkdir_cbk_t mkdir; - fop_mknod_cbk_t mknod; - fop_open_cbk_t open; - fop_opendir_cbk_t opendir; - fop_readdir_cbk_t readdir; - fop_readdirp_cbk_t readdirp; - fop_readlink_cbk_t readlink; - fop_readv_cbk_t readv; - fop_removexattr_cbk_t removexattr; - fop_fremovexattr_cbk_t fremovexattr; - fop_rename_cbk_t rename; - fop_rmdir_cbk_t rmdir; - fop_setattr_cbk_t setattr; - fop_fsetattr_cbk_t fsetattr; - fop_setxattr_cbk_t setxattr; - fop_fsetxattr_cbk_t fsetxattr; - fop_stat_cbk_t stat; - fop_fstat_cbk_t fstat; - fop_statfs_cbk_t statfs; - fop_symlink_cbk_t symlink; - fop_truncate_cbk_t truncate; - fop_ftruncate_cbk_t ftruncate; - fop_unlink_cbk_t unlink; - fop_writev_cbk_t writev; - fop_xattrop_cbk_t xattrop; - fop_fxattrop_cbk_t fxattrop; - fop_zerofill_cbk_t zerofill; - fop_seek_cbk_t seek; -}; - -struct _ec_lock -{ - ec_inode_t *ctx; - gf_timer_t *timer; - - /* List of owners of this lock. All fops added to this list are running - * concurrently. */ - struct list_head owners; - - /* List of fops waiting to be an owner of the lock. Fops are added to this - * list when the current owner has an incompatible access (shared vs - * exclusive) or the lock is not acquired yet. */ - struct list_head waiting; - - /* List of fops that will wait until the next unlock/lock cycle. This - * happens when the currently acquired lock is decided to be released as - * soon as possible. In this case, all frozen fops will be continued only - * after the lock is reacquired. 
*/ - struct list_head frozen; - - int32_t exclusive; - uintptr_t mask; - uintptr_t good_mask; - uintptr_t healing; - uint32_t refs_owners; /* Refs for fops owning the lock */ - uint32_t refs_pending; /* Refs assigned to fops being prepared */ - gf_boolean_t acquired; - gf_boolean_t getting_size; - gf_boolean_t release; - gf_boolean_t query; - fd_t *fd; - loc_t loc; - union - { - entrylk_type type; - struct gf_flock flock; - }; -}; - -struct _ec_lock_link -{ - ec_lock_t *lock; - ec_fop_data_t *fop; - struct list_head owner_list; - struct list_head wait_list; - gf_boolean_t update[2]; - loc_t *base; - uint64_t size; -}; - -struct _ec_fop_data -{ - int32_t id; - int32_t refs; - int32_t state; - int32_t minimum; - int32_t expected; - int32_t winds; - int32_t jobs; - int32_t error; - ec_fop_data_t *parent; - xlator_t *xl; - call_frame_t *req_frame; /* frame of the calling xlator */ - call_frame_t *frame; /* frame used by this fop */ - struct list_head cbk_list; /* sorted list of groups of answers */ - struct list_head answer_list; /* list of answers */ - struct list_head pending_list; /* member of ec_t.pending_fops */ - ec_cbk_data_t *answer; /* accepted answer */ - int32_t lock_count; - int32_t locked; - ec_lock_link_t locks[2]; - int32_t first_lock; - gf_lock_t lock; - - uint32_t flags; - uint32_t first; - uintptr_t mask; - uintptr_t healing; /*Dispatch is done but call is successful only - if fop->minimum number of subvolumes succeed - which are not healing*/ - uintptr_t remaining; - uintptr_t received; /* Mask of responses */ - uintptr_t good; - - uid_t uid; - gid_t gid; - - ec_wind_f wind; - ec_handler_f handler; - ec_resume_f resume; - ec_cbk_t cbks; - void *data; - ec_heal_t *heal; - struct list_head healer; - - uint64_t user_size; - uint32_t head; - - int32_t use_fd; - - dict_t *xdata; - dict_t *dict; - int32_t int32; - uint32_t uint32; - uint64_t size; - off_t offset; - mode_t mode[2]; - entrylk_cmd entrylk_cmd; - entrylk_type entrylk_type; - gf_xattrop_flags_t 
xattrop_flags; - dev_t dev; - inode_t *inode; - fd_t *fd; - struct iatt iatt; - char *str[2]; - loc_t loc[2]; - struct gf_flock flock; - struct iovec *vector; - struct iobref *buffers; - gf_seek_what_t seek; -}; - -struct _ec_cbk_data -{ - struct list_head list; // item in the sorted list of groups - struct list_head answer_list; // item in the list of answers - ec_fop_data_t * fop; - ec_cbk_data_t * next; // next answer in the same group - int32_t idx; - int32_t op_ret; - int32_t op_errno; - int32_t count; - uintptr_t mask; - uint64_t dirty[2]; - - dict_t * xdata; - dict_t * dict; - int32_t int32; - uintptr_t uintptr[3]; - uint64_t size; - uint64_t version[2]; - inode_t * inode; - fd_t * fd; - struct statvfs statvfs; - struct iatt iatt[5]; - struct gf_flock flock; - struct iovec * vector; - struct iobref * buffers; - char *str; - gf_dirent_t entries; - off_t offset; - gf_seek_what_t what; -}; - -struct _ec_heal -{ - struct list_head list; - gf_lock_t lock; - xlator_t *xl; - ec_fop_data_t *fop; - void *data; - ec_fop_data_t *lookup; - loc_t loc; - struct iatt iatt; - char *symlink; - fd_t *fd; - int32_t partial; - int32_t done; - int32_t error; - gf_boolean_t nameheal; - uintptr_t available; - uintptr_t good; - uintptr_t bad; - uintptr_t open; - uintptr_t fixed; - uint64_t offset; - uint64_t size; - uint64_t total_size; - uint64_t version[2]; - uint64_t raw_size; -}; +#include "ec-types.h" ec_cbk_data_t * ec_cbk_data_allocate(call_frame_t * frame, xlator_t * this, ec_fop_data_t * fop, int32_t id, @@ -332,4 +27,6 @@ void ec_fop_data_release(ec_fop_data_t * fop); void ec_fop_cleanup(ec_fop_data_t *fop); +void ec_pending_fops_completed(ec_t *ec); + #endif /* __EC_DATA_H__ */ diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c index ed53a0416e0..4fe82e3c0b6 100644 --- a/xlators/cluster/ec/src/ec-dir-read.c +++ b/xlators/cluster/ec/src/ec-dir-read.c @@ -11,12 +11,13 @@ #include "xlator.h" #include "defaults.h" +#include "ec.h" 
+#include "ec-messages.h" #include "ec-helpers.h" #include "ec-common.h" #include "ec-combine.h" #include "ec-method.h" #include "ec-fops.h" -#include "ec-messages.h" /* FOP: opendir */ diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index e181170650d..1272e3dfe0d 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -11,12 +11,13 @@ #include "xlator.h" #include "defaults.h" +#include "ec.h" +#include "ec-messages.h" #include "ec-helpers.h" #include "ec-common.h" #include "ec-combine.h" #include "ec-method.h" #include "ec-fops.h" -#include "ec-messages.h" int ec_dir_write_cbk (call_frame_t *frame, xlator_t *this, diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h index 8d938427a18..13f419b5a30 100644 --- a/xlators/cluster/ec/src/ec-fops.h +++ b/xlators/cluster/ec/src/ec-fops.h @@ -13,7 +13,7 @@ #include "xlator.h" -#include "ec-data.h" +#include "ec-types.h" #include "ec-common.h" void ec_access(call_frame_t * frame, xlator_t * this, uintptr_t target, diff --git a/xlators/cluster/ec/src/ec-galois.c b/xlators/cluster/ec/src/ec-galois.c new file mode 100644 index 00000000000..7dbbac09713 --- /dev/null +++ b/xlators/cluster/ec/src/ec-galois.c @@ -0,0 +1,185 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <string.h> + +#include "mem-pool.h" +#include "list.h" + +#include "ec-mem-types.h" +#include "ec-gf8.h" + +static ec_gf_t * +ec_gf_alloc(uint32_t bits, uint32_t mod) +{ + ec_gf_t *gf; + + gf = GF_MALLOC(sizeof(ec_gf_t), ec_mt_ec_gf_t); + if (gf == NULL) { + goto failed; + } + + gf->bits = bits; + gf->size = 1 << bits; + gf->mod = mod; + + gf->log = GF_MALLOC(sizeof(uint32_t) * (gf->size * 2 - 1), + gf_common_mt_int); + if (gf->log == NULL) { + goto failed_gf; + } + gf->pow = GF_MALLOC(sizeof(uint32_t) * (gf->size * 2 - 1), + gf_common_mt_int); + if (gf->pow == NULL) { + goto failed_log; + } + + return gf; + +failed_log: + GF_FREE(gf->log); +failed_gf: + GF_FREE(gf); +failed: + return NULL; +} + +static void +ec_gf_init_tables(ec_gf_t *gf) +{ + uint32_t i, tmp; + + memset(gf->log, -1, sizeof(uint32_t) * gf->size); + + gf->pow[0] = 1; + gf->log[0] = gf->size; + gf->log[1] = 0; + for (i = 1; i < gf->size; i++) { + tmp = gf->pow[i - 1] << 1; + if (tmp >= gf->size) { + tmp ^= gf->mod; + } + gf->pow[i + gf->size - 1] = gf->pow[i] = tmp; + gf->log[tmp + gf->size - 1] = gf->log[tmp] = i; + } +} + +ec_gf_t * +ec_gf_prepare(uint32_t bits, uint32_t mod) +{ + ec_gf_mul_t **tbl; + ec_gf_t *gf; + uint32_t i, j; + + if (bits != 8) { + return NULL; + } + + tbl = ec_gf8_mul; + if (mod == 0) { + mod = 0x11d; + } + + gf = ec_gf_alloc(bits, mod); + if (gf == NULL) { + return NULL; + } + ec_gf_init_tables(gf); + + gf->table = tbl; + gf->min_ops = bits * bits; + gf->max_ops = 0; + gf->avg_ops = 0; + for (i = 1; i < gf->size; i++) { + for (j = 0; tbl[i]->ops[j].op != EC_GF_OP_END; j++) { + } + if (gf->max_ops < j) { + gf->max_ops = j; + } + if (gf->min_ops > j) { + gf->min_ops = j; + } + gf->avg_ops += j; + } + gf->avg_ops /= gf->size; + + return gf; +} + +void +ec_gf_destroy(ec_gf_t *gf) +{ + GF_FREE(gf->pow); + GF_FREE(gf->log); + GF_FREE(gf); +} + +uint32_t +ec_gf_add(ec_gf_t *gf, uint32_t a, uint32_t b) +{ + if ((a >= gf->size) || (b >= gf->size)) { + return 
gf->size; + } + + return a ^ b; +} + +uint32_t +ec_gf_mul(ec_gf_t *gf, uint32_t a, uint32_t b) +{ + if ((a >= gf->size) || (b >= gf->size)) { + return gf->size; + } + + if ((a != 0) && (b != 0)) { + return gf->pow[gf->log[a] + gf->log[b]]; + } + + return 0; +} + +uint32_t +ec_gf_div(ec_gf_t *gf, uint32_t a, uint32_t b) +{ + if ((a >= gf->size) || (b >= gf->size)) { + return gf->size; + } + + if (b != 0) { + if (a != 0) { + return gf->pow[gf->size - 1 + gf->log[a] - gf->log[b]]; + } + + return 0; + } + + return gf->size; +} + +uint32_t +ec_gf_exp(ec_gf_t *gf, uint32_t a, uint32_t b) +{ + uint32_t r; + + if ((a >= gf->size) || ((a == 0) && (b == 0))) { + return gf->size; + } + + r = 1; + while (b != 0) { + if ((b & 1) != 0) { + r = ec_gf_mul(gf, r, a); + } + a = ec_gf_mul(gf, a, a); + b >>= 1; + } + + return r; +} diff --git a/xlators/cluster/ec/src/ec-galois.h b/xlators/cluster/ec/src/ec-galois.h new file mode 100644 index 00000000000..02e6b6c1bc2 --- /dev/null +++ b/xlators/cluster/ec/src/ec-galois.h @@ -0,0 +1,26 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __EC_GALOIS_H__ +#define __EC_GALOIS_H__ + +#include <inttypes.h> + +#include "ec-types.h" + +ec_gf_t *ec_gf_prepare(uint32_t bits, uint32_t mod); +void ec_gf_destroy(ec_gf_t *gf); + +uint32_t ec_gf_add(ec_gf_t *gf, uint32_t a, uint32_t b); +uint32_t ec_gf_mul(ec_gf_t *gf, uint32_t a, uint32_t b); +uint32_t ec_gf_div(ec_gf_t *gf, uint32_t a, uint32_t b); +uint32_t ec_gf_exp(ec_gf_t *gf, uint32_t a, uint32_t b); + +#endif /* __EC_GALOIS_H__ */ diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c index 0ad514908aa..d67420469a8 100644 --- a/xlators/cluster/ec/src/ec-generic.c +++ b/xlators/cluster/ec/src/ec-generic.c @@ -10,14 +10,15 @@ #include "xlator.h" #include "defaults.h" +#include "byte-order.h" +#include "ec.h" +#include "ec-messages.h" #include "ec-helpers.h" #include "ec-common.h" #include "ec-combine.h" #include "ec-method.h" #include "ec-fops.h" -#include "ec-messages.h" -#include "byte-order.h" /* FOP: flush */ diff --git a/xlators/cluster/ec/src/ec-gf.c b/xlators/cluster/ec/src/ec-gf.c deleted file mode 100644 index 1ae8928f20b..00000000000 --- a/xlators/cluster/ec/src/ec-gf.c +++ /dev/null @@ -1,11635 +0,0 @@ -/* - Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - -#include <inttypes.h> -#include <string.h> - -#include "ec-gf.h" - -static void gf8_muladd_00(uint8_t * out, uint8_t * in, unsigned int width) -{ - memcpy(out, in, sizeof(uint64_t) * 8 * width); -} - -static void gf8_muladd_01(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - out_ptr[0] ^= in_ptr[0]; - out_ptr[width] ^= in_ptr[width]; - out_ptr[width * 2] ^= in_ptr[width * 2]; - out_ptr[width * 3] ^= in_ptr[width * 3]; - out_ptr[width * 4] ^= in_ptr[width * 4]; - out_ptr[width * 5] ^= in_ptr[width * 5]; - out_ptr[width * 6] ^= in_ptr[width * 6]; - out_ptr[width * 7] ^= in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_02(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in7; - out1 = in0; - out7 = in6; - out5 = in4; - out6 = in5; - out3 = in2 ^ in7; - out4 = in3 ^ in7; - out2 = in1 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_03(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t 
*)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in0 ^ in7; - tmp0 = in2 ^ in7; - out1 = in0 ^ in1; - out7 = in6 ^ in7; - out5 = in4 ^ in5; - out6 = in5 ^ in6; - out4 = in3 ^ in4 ^ in7; - out2 = tmp0 ^ in1; - out3 = tmp0 ^ in3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_04(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in6; - out1 = in7; - out7 = in5; - out6 = in4; - tmp0 = in6 ^ in7; - out2 = in0 ^ in6; - out5 = in3 ^ in7; - out3 = tmp0 ^ in1; - out4 = tmp0 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - 
out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_05(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in0 ^ in6; - out1 = in1 ^ in7; - out7 = in5 ^ in7; - out6 = in4 ^ in6; - out2 = out0 ^ in2; - out3 = out1 ^ in3 ^ in6; - out5 = out7 ^ in3; - out4 = out6 ^ in2 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_06(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in6 ^ in7; - tmp0 = in1 ^ in6; - out1 = in0 ^ in7; - out7 = in5 ^ in6; - out6 = in4 ^ in5; - 
out4 = in2 ^ in3 ^ in6; - out5 = in3 ^ in4 ^ in7; - out3 = tmp0 ^ in2; - out2 = tmp0 ^ out1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_07(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in6; - tmp1 = in5 ^ in6; - tmp2 = in0 ^ in7; - tmp3 = tmp0 ^ in3; - out6 = tmp1 ^ in4; - out7 = tmp1 ^ in7; - out0 = tmp2 ^ in6; - out1 = tmp2 ^ in1; - out3 = tmp3 ^ in1; - out4 = tmp3 ^ in4; - out5 = out4 ^ out7 ^ in2; - out2 = tmp0 ^ out1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_08(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, 
out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in5; - out1 = in6; - out7 = in4; - out6 = in3 ^ in7; - out3 = in0 ^ in5 ^ in6; - out5 = in2 ^ in6 ^ in7; - out2 = in5 ^ in7; - out4 = out2 ^ in1 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_09(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in0 ^ in5; - tmp0 = in3 ^ in6; - out1 = in1 ^ in6; - out7 = in4 ^ in7; - out2 = in2 ^ in5 ^ in7; - out3 = tmp0 ^ out0; - out6 = tmp0 ^ in7; - out4 = out1 ^ out7 ^ in5; - out5 = out2 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] 
= out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_0A(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in5 ^ in7; - out1 = in0 ^ in6; - out7 = in4 ^ in6; - out2 = in1 ^ in5; - out6 = out0 ^ in3; - out3 = out0 ^ out1 ^ in2; - out5 = out7 ^ in2 ^ in7; - out4 = out2 ^ in3 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_0B(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in5; - tmp1 = in0 ^ in6; - tmp2 = in4 ^ in7; - out0 = in0 ^ in5 ^ in7; - out2 = tmp0 ^ in1; - out1 = tmp1 ^ in1; - out6 = tmp1 ^ out0 ^ in3; - out7 = tmp2 ^ in6; - out4 = tmp2 ^ out6 ^ 
in1; - out3 = out6 ^ in0 ^ in2; - out5 = tmp0 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_0C(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in5 ^ in6; - out1 = in6 ^ in7; - out7 = in4 ^ in5; - tmp0 = in1 ^ in5; - tmp1 = in0 ^ in7; - out5 = in2 ^ in3 ^ in6; - out6 = in3 ^ in4 ^ in7; - out2 = tmp1 ^ out0; - out4 = tmp0 ^ in2; - out3 = tmp0 ^ tmp1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_0D(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - 
uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in4 ^ in5; - tmp1 = in5 ^ in6; - out1 = in1 ^ in6 ^ in7; - out7 = tmp0 ^ in7; - out4 = tmp0 ^ in1 ^ in2; - out0 = tmp1 ^ in0; - tmp2 = tmp1 ^ in3; - out6 = tmp2 ^ out7; - out2 = out0 ^ in2 ^ in7; - out3 = out0 ^ out1 ^ in3; - out5 = tmp2 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_0E(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in1; - tmp1 = in2 ^ in5; - tmp2 = in5 ^ in6; - out1 = in0 ^ in6 ^ in7; - out3 = tmp0 ^ tmp1; - out2 = tmp0 ^ tmp2; - tmp3 = tmp1 ^ in3; - out7 = tmp2 ^ in4; - out0 = tmp2 ^ in7; - out4 = tmp3 ^ in1 ^ in7; - out5 = tmp3 ^ out7; - out6 = out0 ^ out5 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 
4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_0F(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in6 ^ in7; - tmp1 = tmp0 ^ in1; - tmp2 = tmp0 ^ in5; - out1 = tmp1 ^ in0; - out7 = tmp2 ^ in4; - out0 = tmp2 ^ in0; - out6 = out7 ^ in3; - out5 = out6 ^ in2 ^ in7; - tmp3 = tmp1 ^ out0 ^ in2; - out4 = tmp1 ^ out5; - out2 = tmp3 ^ in6; - out3 = tmp3 ^ in3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_10(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - 
uint64_t in7 = out_ptr[width * 7]; - - out0 = in4; - out1 = in5; - out7 = in3 ^ in7; - tmp0 = in6 ^ in7; - out2 = in4 ^ in6; - tmp1 = out2 ^ in5; - out6 = tmp0 ^ in2; - out3 = tmp0 ^ tmp1; - out5 = out2 ^ out3 ^ in1; - out4 = tmp1 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_11(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out7 = in3; - out0 = in0 ^ in4; - out1 = in1 ^ in5; - out6 = in2 ^ in7; - out4 = in0 ^ in5 ^ in6; - out5 = in1 ^ in6 ^ in7; - out2 = in2 ^ in4 ^ in6; - out3 = in3 ^ in4 ^ in5 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_12(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) 
- { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in4 ^ in7; - out1 = in0 ^ in5; - out3 = in2 ^ in4 ^ in5; - tmp0 = out0 ^ in6; - out2 = tmp0 ^ in1; - tmp1 = tmp0 ^ in3; - out6 = tmp0 ^ out3; - out5 = out2 ^ in5; - out7 = tmp1 ^ in4; - out4 = tmp1 ^ out1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_13(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out7 = in3 ^ in6; - tmp0 = in0 ^ in5; - tmp1 = in4 ^ in7; - out6 = in2 ^ in5 ^ in7; - out4 = tmp0 ^ out7 ^ in7; - out1 = tmp0 ^ in1; - out0 = tmp1 ^ in0; - out5 = tmp1 ^ in1 ^ in6; - out3 = tmp1 ^ out6 ^ in3; - out2 = out5 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 
4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_14(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in4 ^ in6; - out1 = in5 ^ in7; - out2 = in0 ^ in4; - tmp0 = out0 ^ in5; - out7 = out1 ^ in3; - tmp1 = out1 ^ in2; - out3 = tmp0 ^ in1; - out6 = tmp0 ^ tmp1; - out4 = tmp1 ^ out2; - out5 = out3 ^ in3 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_15(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out7 = 
in3 ^ in5; - tmp0 = in0 ^ in4; - out1 = in1 ^ in5 ^ in7; - out5 = in1 ^ in3 ^ in6; - out0 = tmp0 ^ in6; - out2 = tmp0 ^ in2; - out3 = out5 ^ in4 ^ in5; - out6 = out2 ^ in0 ^ in7; - out4 = tmp0 ^ out6 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_16(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in5; - tmp1 = in4 ^ in7; - tmp2 = in2 ^ in3 ^ in4; - out1 = tmp0 ^ in7; - out4 = tmp0 ^ tmp2; - out0 = tmp1 ^ in6; - tmp3 = tmp1 ^ in1; - out6 = out0 ^ in2 ^ in5; - out2 = tmp3 ^ in0; - out3 = out6 ^ in1; - out7 = tmp2 ^ out6; - out5 = tmp3 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_17(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = 
(uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in5; - tmp1 = in3 ^ in6; - tmp2 = tmp0 ^ in4; - out4 = tmp0 ^ in0 ^ in3; - out7 = tmp1 ^ in5; - tmp3 = tmp1 ^ in1; - out6 = tmp2 ^ in7; - out5 = tmp3 ^ in4; - out3 = tmp3 ^ out6; - out0 = out3 ^ out4 ^ in1; - out2 = out3 ^ out7 ^ in0; - out1 = tmp2 ^ out2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_18(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in4 ^ in5; - out1 = in5 ^ in6; - tmp0 = in4 ^ in7; - out5 = in1 ^ in2 ^ in5; - out6 = in2 ^ in3 ^ in6; - out2 = tmp0 ^ out1; - out7 = tmp0 ^ in3; - tmp1 = tmp0 ^ in0; - out3 = tmp1 ^ in6; - out4 = tmp1 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - 
out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_19(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out5 = in1 ^ in2; - out7 = in3 ^ in4; - tmp0 = in0 ^ in7; - out6 = in2 ^ in3; - out1 = in1 ^ in5 ^ in6; - out0 = in0 ^ in4 ^ in5; - out4 = tmp0 ^ in1; - tmp1 = tmp0 ^ in6; - out2 = tmp1 ^ out0 ^ in2; - out3 = tmp1 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_1A(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t 
in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in4 ^ in5; - tmp1 = in5 ^ in6; - tmp2 = tmp0 ^ in1; - out0 = tmp0 ^ in7; - out1 = tmp1 ^ in0; - tmp3 = tmp1 ^ in3; - out5 = tmp2 ^ in2; - out2 = tmp2 ^ in6; - out7 = tmp3 ^ out0; - out6 = tmp3 ^ in2; - out4 = tmp3 ^ out2 ^ in0; - out3 = tmp0 ^ out1 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_1B(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in4; - tmp1 = in2 ^ in5; - tmp2 = in3 ^ in6; - out5 = tmp0 ^ in1; - tmp3 = tmp0 ^ in0; - out6 = tmp1 ^ in3; - out0 = tmp1 ^ tmp3 ^ in7; - out7 = tmp2 ^ in4; - tmp4 = out5 ^ in6; - out3 = tmp2 ^ tmp3; - out2 = tmp4 ^ in5; - out4 = tmp4 ^ out3; - out1 = tmp3 ^ out2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - 
out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_1C(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in3; - tmp1 = in4 ^ in6; - tmp2 = in5 ^ in7; - out6 = tmp0 ^ tmp1; - out0 = tmp1 ^ in5; - out1 = tmp2 ^ in6; - tmp3 = tmp2 ^ in1; - tmp4 = tmp2 ^ in4; - out2 = tmp4 ^ in0; - out7 = tmp4 ^ in3; - out5 = tmp0 ^ tmp3; - out3 = tmp3 ^ out2; - out4 = out3 ^ in2 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_1D(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = 
out_ptr[width * 7]; - - tmp0 = in1 ^ in3; - tmp1 = in0 ^ in4; - tmp2 = in3 ^ in4; - tmp3 = in2 ^ in7; - out3 = tmp0 ^ tmp1; - out5 = tmp0 ^ tmp3; - tmp4 = tmp1 ^ in5; - out6 = tmp2 ^ in2; - out7 = tmp2 ^ in5; - out2 = tmp3 ^ tmp4; - out4 = out3 ^ out6 ^ in6; - out0 = tmp4 ^ in6; - out1 = out2 ^ out4 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_1E(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in4; - tmp1 = in2 ^ in7; - tmp2 = tmp0 ^ in1; - out3 = tmp1 ^ tmp2; - out2 = tmp2 ^ in5; - out4 = out3 ^ in3 ^ in6; - tmp3 = out4 ^ in7; - out6 = tmp3 ^ out2 ^ in4; - out7 = tmp1 ^ out6; - out0 = out7 ^ in3; - out1 = tmp0 ^ out0; - out5 = tmp3 ^ out1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void 
gf8_muladd_1F(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in4 ^ in6; - tmp1 = tmp0 ^ in5; - out7 = tmp1 ^ in3; - out0 = tmp1 ^ in0 ^ in7; - out6 = out7 ^ in2 ^ in6; - out1 = out0 ^ in1 ^ in4; - out4 = out0 ^ out6 ^ in1; - out3 = tmp0 ^ out4; - out2 = out4 ^ out7 ^ in7; - out5 = out3 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_20(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in4; - out0 = in3 ^ in7; - tmp0 = in3 ^ in4; - tmp1 = in6 ^ in7; - out2 = out0 ^ in5; - out4 = tmp0 ^ in5; - out3 = tmp0 ^ tmp1; - out7 = tmp1 ^ in2; - out6 = tmp1 ^ in1 ^ in5; - out5 = out2 ^ 
out3 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_21(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in1 ^ in4; - tmp0 = in4 ^ in6; - out4 = in3 ^ in5; - out7 = in2 ^ in6; - out0 = in0 ^ in3 ^ in7; - out6 = in1 ^ in5 ^ in7; - out3 = tmp0 ^ in7; - out5 = tmp0 ^ in0; - out2 = out4 ^ in2 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_22(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = 
out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in3; - out1 = in0 ^ in4; - out7 = in2 ^ in7; - out4 = in4 ^ in5 ^ in7; - out5 = in0 ^ in5 ^ in6; - out6 = in1 ^ in6 ^ in7; - out3 = in2 ^ in3 ^ in4 ^ in6; - out2 = in1 ^ in3 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_23(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out7 = in2; - out0 = in0 ^ in3; - out4 = in5 ^ in7; - out5 = in0 ^ in6; - out6 = in1 ^ in7; - out3 = in2 ^ in4 ^ in6; - out1 = in0 ^ in1 ^ in4; - out2 = out4 ^ out6 ^ in2 ^ in3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_24(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned 
int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in4 ^ in7; - tmp0 = in3 ^ in4; - out0 = in3 ^ in6 ^ in7; - out3 = tmp0 ^ in1; - tmp1 = out0 ^ in5; - out6 = tmp1 ^ out3; - out2 = tmp1 ^ in0; - out7 = tmp1 ^ in2 ^ in3; - out5 = out2 ^ in4; - out4 = tmp0 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_25(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in1 ^ in4; - tmp0 = in2 ^ in5; - out1 = out3 ^ in7; - out7 = tmp0 ^ in6; - out6 = out1 ^ in5; - out4 = out7 ^ in3 ^ in7; - out2 = out4 ^ in0; - out0 = tmp0 ^ out2; - out5 = out0 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ 
in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_26(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in3 ^ in6; - tmp0 = in4 ^ in7; - out7 = in2 ^ in5 ^ in7; - tmp1 = out0 ^ in0 ^ in5; - out1 = tmp0 ^ in0; - tmp2 = tmp0 ^ in6; - out2 = tmp1 ^ in1; - out5 = tmp1 ^ in7; - out6 = tmp2 ^ in1; - out4 = tmp2 ^ out7; - out3 = out0 ^ out6 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_27(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t 
in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out7 = in2 ^ in5; - out0 = in0 ^ in3 ^ in6; - out6 = in1 ^ in4 ^ in7; - out4 = out7 ^ in6; - out2 = out0 ^ out7 ^ in1; - out5 = out0 ^ in7; - out1 = out6 ^ in0; - out3 = out6 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_28(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in3; - out1 = in4 ^ in6; - out0 = in3 ^ in5 ^ in7; - tmp0 = out1 ^ in7; - tmp1 = out0 ^ in4; - out7 = tmp0 ^ in2; - tmp2 = tmp0 ^ in1; - out3 = tmp1 ^ in0; - out6 = tmp1 ^ tmp2; - out4 = tmp2 ^ in3; - out5 = out3 ^ in2 ^ in3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_29(uint8_t * out, uint8_t * in, unsigned int width) -{ 
- unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in2 ^ in3; - tmp0 = in1 ^ in3; - tmp1 = in4 ^ in6; - tmp2 = in0 ^ in4 ^ in7; - out6 = tmp0 ^ in5; - out4 = tmp0 ^ in6 ^ in7; - out1 = tmp1 ^ in1; - out7 = tmp1 ^ in2; - out3 = tmp2 ^ in5; - out5 = tmp2 ^ in2; - out0 = out3 ^ in3 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_2A(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in3 ^ in5; - tmp0 = in1 ^ in3; - tmp1 = in0 ^ in4; - out7 = in2 ^ in4 ^ in7; - out3 = tmp1 ^ out0 ^ in2; - out2 = tmp0 ^ in7; - out6 = tmp0 ^ in6; - out1 = tmp1 ^ in6; - out5 = tmp1 ^ out7 ^ in5; - out4 = out1 ^ in0 ^ in1; - - out_ptr[0] = out0 ^ 
in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_2B(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in1 ^ in6; - out7 = in2 ^ in4; - tmp0 = in0 ^ in5; - tmp1 = in2 ^ in7; - out6 = in1 ^ in3; - out1 = out4 ^ in0 ^ in4; - out3 = tmp0 ^ out7; - out0 = tmp0 ^ in3; - out5 = tmp1 ^ in0; - out2 = tmp1 ^ out6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_2C(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t 
in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in5; - tmp1 = in2 ^ in3 ^ in4; - tmp2 = tmp0 ^ in6; - out4 = tmp1 ^ in1; - out5 = tmp1 ^ in0 ^ in5; - tmp3 = tmp2 ^ in4; - out6 = tmp2 ^ out4; - out7 = tmp3 ^ in7; - out2 = tmp3 ^ out5; - out3 = out6 ^ in0; - out0 = tmp1 ^ out7; - out1 = tmp0 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_2D(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in3; - out4 = tmp0 ^ in1; - tmp1 = tmp0 ^ in0; - out2 = tmp1 ^ in6; - out5 = tmp1 ^ in4; - tmp2 = out2 ^ in2; - tmp3 = tmp2 ^ in5; - out0 = tmp3 ^ in7; - out7 = tmp3 ^ out5; - out6 = out4 ^ out7 ^ in6; - out3 = tmp2 ^ out6; - out1 = out0 ^ out6 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 
6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_2E(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in4 ^ in7; - out0 = in3 ^ in5 ^ in6; - tmp1 = tmp0 ^ in0; - tmp2 = tmp0 ^ in2; - out1 = tmp1 ^ in6; - out4 = tmp2 ^ in1; - out7 = tmp2 ^ in5; - out3 = out0 ^ out4 ^ in0; - out2 = out3 ^ out7 ^ in7; - out6 = tmp1 ^ out2; - out5 = tmp1 ^ out7 ^ in3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_2F(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in3; - tmp1 = in2 ^ in5; - out4 
= in1 ^ in2 ^ in7; - out6 = in1 ^ in3 ^ in4; - out5 = tmp0 ^ in2; - tmp2 = tmp0 ^ in6; - out7 = tmp1 ^ in4; - out0 = tmp2 ^ in5; - out2 = tmp2 ^ out4; - out1 = tmp2 ^ out6 ^ in7; - out3 = tmp1 ^ out1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_30(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in4 ^ in5; - tmp0 = in3 ^ in6; - tmp1 = in4 ^ in7; - out6 = in1 ^ in2 ^ in5; - out3 = tmp0 ^ in5; - out4 = tmp0 ^ in0; - out7 = tmp0 ^ in2; - out0 = tmp1 ^ in3; - out2 = tmp1 ^ out3; - out5 = tmp1 ^ in0 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_31(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i 
= 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in5 ^ in6; - tmp0 = in4 ^ in5; - tmp1 = in0 ^ in3 ^ in4; - tmp2 = out3 ^ in2; - out1 = tmp0 ^ in1; - out0 = tmp1 ^ in7; - out4 = tmp1 ^ in6; - out6 = tmp2 ^ in1; - out2 = tmp2 ^ out0 ^ in0; - out5 = out1 ^ in0 ^ in7; - out7 = tmp0 ^ out2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_32(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in3 ^ in4; - out7 = in2 ^ in3; - tmp0 = in5 ^ in6; - tmp1 = in0 ^ in7; - out6 = in1 ^ in2; - out1 = in0 ^ in4 ^ in5; - out2 = tmp0 ^ out0 ^ in1; - out3 = tmp0 ^ out7 ^ in7; - out4 = tmp1 ^ in6; - out5 = tmp1 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - 
out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_33(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in3; - tmp1 = in0 ^ in4; - tmp2 = in1 ^ in5; - out6 = in1 ^ in2 ^ in6; - out7 = tmp0 ^ in7; - out0 = tmp1 ^ in3; - out1 = tmp1 ^ tmp2; - tmp3 = tmp2 ^ in7; - tmp4 = tmp2 ^ in4 ^ in6; - out5 = tmp3 ^ in0; - out3 = tmp3 ^ out6; - out4 = tmp4 ^ out5; - out2 = tmp0 ^ tmp4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_34(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = 
out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in4; - tmp1 = in4 ^ in5; - tmp2 = tmp0 ^ in1; - tmp3 = tmp0 ^ in6; - out1 = tmp1 ^ in7; - tmp4 = tmp1 ^ in2; - out5 = tmp2 ^ in0; - out3 = tmp2 ^ out1; - out0 = tmp3 ^ in7; - out7 = tmp3 ^ tmp4; - out6 = tmp4 ^ in1; - out2 = out3 ^ out5 ^ in3; - out4 = tmp4 ^ out2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_35(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in6; - tmp1 = in5 ^ in7; - out7 = tmp0 ^ tmp1 ^ in3; - out3 = tmp1 ^ in1; - out1 = out3 ^ in4; - tmp2 = out1 ^ in7; - out5 = tmp2 ^ in0 ^ in3; - out6 = tmp0 ^ tmp2; - out0 = out3 ^ out5 ^ in6; - out4 = tmp0 ^ out0; - out2 = out4 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ 
in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_36(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in0 ^ in2; - tmp0 = in1 ^ in3; - out0 = in3 ^ in4 ^ in6; - out6 = in1 ^ in2 ^ in4; - out5 = tmp0 ^ in0; - tmp1 = out5 ^ in5; - out2 = tmp1 ^ in4; - out3 = tmp1 ^ out4; - out1 = tmp0 ^ out2 ^ in7; - out7 = out3 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_37(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in2; - tmp1 = in2 ^ in4; - tmp2 = tmp0 ^ in6; - out3 = tmp0 ^ in5; - out4 
= tmp1 ^ in0; - out6 = tmp2 ^ in4; - out1 = out3 ^ out4 ^ in7; - tmp3 = out4 ^ in1 ^ in3; - out7 = tmp3 ^ out1; - out2 = tmp3 ^ in5; - out5 = tmp1 ^ out2; - out0 = tmp2 ^ tmp3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_38(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in0 ^ in3; - tmp0 = in3 ^ in4; - tmp1 = in5 ^ in7; - tmp2 = out3 ^ in1; - out2 = tmp0 ^ in6; - out0 = tmp0 ^ tmp1; - out4 = tmp1 ^ tmp2; - out7 = out2 ^ in2; - out1 = out2 ^ in3 ^ in5; - out6 = out4 ^ in0 ^ in2; - out5 = tmp2 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_39(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - 
for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in0; - tmp0 = in1 ^ in5; - tmp1 = tmp0 ^ in4; - out1 = tmp1 ^ in6; - out5 = out1 ^ in0 ^ in2; - tmp2 = tmp0 ^ out5; - out2 = tmp2 ^ in0 ^ in3; - out7 = out2 ^ in7; - out6 = tmp1 ^ out7; - out4 = tmp2 ^ out6; - out0 = out4 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_3A(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in1; - tmp1 = in0 ^ in2; - tmp2 = in3 ^ in4; - tmp3 = in1 ^ in6; - tmp4 = in3 ^ in7; - out4 = tmp0 ^ in5; - out5 = tmp1 ^ tmp3; - out3 = tmp1 ^ tmp4; - out0 = tmp2 ^ in5; - out7 = tmp2 ^ in2; - tmp5 = tmp3 ^ in4; - out2 = tmp4 ^ tmp5; - out1 = tmp5 ^ out4; - out6 = tmp0 ^ out3; - - out_ptr[0] = out0 ^ in_ptr[0]; - 
out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_3B(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in6; - tmp1 = in2 ^ in7; - tmp2 = tmp0 ^ in3; - out3 = tmp1 ^ in0; - out6 = tmp1 ^ tmp2; - out2 = out6 ^ in4; - out7 = tmp0 ^ out2; - out0 = out3 ^ out7 ^ in5; - out5 = out0 ^ out2 ^ in7; - out1 = tmp2 ^ out0; - out4 = out1 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_3C(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = 
out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in3; - tmp1 = in2 ^ in7; - tmp2 = in1 ^ in6 ^ in7; - out2 = tmp0 ^ in4; - out3 = tmp0 ^ tmp2; - out4 = tmp1 ^ out3 ^ in5; - out5 = tmp2 ^ out2 ^ in2; - out1 = out4 ^ out5 ^ in6; - out0 = out1 ^ in3; - out7 = tmp1 ^ out0; - out6 = tmp2 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_3D(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in2; - tmp1 = tmp0 ^ in3; - out2 = tmp1 ^ in4; - tmp2 = out2 ^ in5; - out4 = tmp2 ^ in1 ^ in6; - out5 = out4 ^ in7; - out6 = out5 ^ in0; - out7 = out6 ^ in1; - out0 = tmp0 ^ out7; - out1 = tmp1 ^ out5; - out3 = tmp2 ^ out6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ 
in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_3E(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in5; - tmp1 = tmp0 ^ in4; - out0 = tmp1 ^ in6; - out7 = tmp1 ^ in2; - out6 = out7 ^ in1 ^ in5 ^ in7; - out2 = out6 ^ in0 ^ in2; - out4 = out0 ^ out6 ^ in0; - out5 = tmp0 ^ out4; - out3 = out5 ^ in7; - out1 = out3 ^ out6 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_3F(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in1; - out3 = tmp0 ^ in2 ^ in6; - tmp1 = out3 ^ in5 ^ in7; - 
out4 = tmp1 ^ in4; - out5 = tmp1 ^ in3; - out1 = out4 ^ in2; - out7 = out1 ^ out3 ^ in3; - out2 = tmp0 ^ out7 ^ in5; - tmp2 = out2 ^ in0; - out6 = tmp2 ^ in6; - out0 = tmp1 ^ tmp2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_40(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in3 ^ in7; - tmp0 = in3 ^ in4; - tmp1 = in6 ^ in7; - out4 = tmp0 ^ in2; - out5 = tmp0 ^ in5; - out0 = tmp1 ^ in2; - out7 = tmp1 ^ in1 ^ in5; - out2 = out0 ^ in4; - out3 = out2 ^ out5 ^ in7; - out6 = out3 ^ out4 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_41(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < 
width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in2 ^ in3; - tmp0 = in5 ^ in6; - tmp1 = in6 ^ in7; - out5 = in3 ^ in4; - out1 = in1 ^ in3 ^ in7; - out6 = in0 ^ in4 ^ in5; - out3 = tmp0 ^ in2; - out7 = tmp0 ^ in1; - out2 = tmp1 ^ in4; - out0 = tmp1 ^ in0 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_42(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in2 ^ in6; - out5 = in3 ^ in5; - out1 = in0 ^ in3 ^ in7; - out7 = in1 ^ in5 ^ in7; - out4 = in2 ^ in4 ^ in7; - out6 = in0 ^ in4 ^ in6; - out2 = out0 ^ in1 ^ in4; - out3 = out5 ^ in6 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - 
out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_43(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out5 = in3; - out7 = in1 ^ in5; - out4 = in2 ^ in7; - out6 = in0 ^ in4; - out0 = in0 ^ in2 ^ in6; - out3 = in5 ^ in6 ^ in7; - out2 = in1 ^ in4 ^ in6; - out1 = in0 ^ in1 ^ in3 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_44(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in3; - out0 = in2 ^ in7; - tmp0 = in4 ^ in7; - out7 = in1 ^ in6 ^ in7; - out6 = in0 ^ 
in5 ^ in6; - out4 = tmp0 ^ in3 ^ in6; - out3 = out0 ^ in1 ^ in3 ^ in5; - out2 = out0 ^ in0 ^ in4; - out5 = tmp0 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_45(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in1 ^ in3; - out7 = in1 ^ in6; - out5 = in4 ^ in7; - out6 = in0 ^ in5; - out0 = in0 ^ in2 ^ in7; - out4 = in3 ^ in6 ^ in7; - out2 = out5 ^ in0; - out3 = out0 ^ out6 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_46(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = 
out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in2; - out1 = in0 ^ in3; - out7 = in1 ^ in7; - out4 = in4 ^ in6; - out5 = in5 ^ in7; - out6 = in0 ^ in6; - out3 = in1 ^ in3 ^ in5; - out2 = out4 ^ out6 ^ in1 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_47(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in6; - out7 = in1; - out5 = in7; - out6 = in0; - tmp0 = in0 ^ in1; - out3 = in1 ^ in5; - out0 = in0 ^ in2; - out1 = tmp0 ^ in3; - out2 = tmp0 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_48(uint8_t * 
out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in3; - out1 = in3 ^ in6 ^ in7; - out3 = tmp0 ^ in0; - out0 = tmp0 ^ out1 ^ in5; - tmp1 = out0 ^ in4; - out2 = tmp1 ^ in7; - out5 = tmp1 ^ in3; - out4 = out5 ^ in1; - out7 = tmp0 ^ out4; - out6 = tmp1 ^ out3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_49(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in0 ^ in2; - tmp0 = in2 ^ in5; - out2 = in4 ^ in5 ^ in6; - tmp1 = tmp0 ^ out2 ^ in3; - out7 = out2 ^ in1; - out5 = tmp1 ^ in7; - out4 = out5 ^ out7 ^ in6; - out1 = tmp0 ^ out4; - out6 = out1 ^ out7 ^ in0; - out0 = tmp1 ^ out6; - - 
out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_4A(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in6; - tmp1 = in3 ^ in7; - out0 = tmp0 ^ in5; - out3 = tmp1 ^ in0; - out5 = tmp1 ^ out0; - out4 = out0 ^ in1 ^ in4; - out1 = out3 ^ in6; - out2 = out4 ^ in7; - out6 = out1 ^ in4; - out7 = tmp0 ^ out2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_4B(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = 
out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in0 ^ in7; - tmp0 = in1 ^ in5; - tmp1 = in2 ^ in6; - tmp2 = out3 ^ in3; - out7 = tmp0 ^ in4; - out4 = tmp0 ^ tmp1; - tmp3 = tmp1 ^ in0; - out6 = tmp2 ^ in4; - out5 = tmp2 ^ tmp3; - out1 = tmp2 ^ in1 ^ in6; - out2 = out7 ^ in6 ^ in7; - out0 = tmp3 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_4C(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in3 ^ in6; - tmp0 = in2 ^ in5; - tmp1 = out1 ^ in5 ^ in7; - out0 = tmp0 ^ in7; - tmp2 = tmp0 ^ in4; - out6 = tmp1 ^ in0; - out2 = tmp2 ^ in0; - out5 = tmp2 ^ in6; - out3 = tmp0 ^ out6 ^ in1; - out7 = out0 ^ out5 ^ in1; - out4 = tmp1 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - 
out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_4D(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in5; - tmp1 = in1 ^ in6; - out4 = in1 ^ in3 ^ in5; - tmp2 = tmp0 ^ in7; - out2 = tmp0 ^ in4; - out1 = tmp1 ^ in3; - out7 = tmp1 ^ in4; - out0 = tmp2 ^ in2; - out6 = tmp2 ^ in3; - out5 = out7 ^ in1 ^ in2; - out3 = tmp1 ^ out0 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_4E(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in2 ^ in5; - out7 = in1 ^ in4 ^ in7; - out1 = in0 ^ in3 ^ 
in6; - out5 = out0 ^ in6; - out4 = out7 ^ in5; - out3 = out1 ^ in1; - out6 = out1 ^ in7; - out2 = out4 ^ in0 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_4F(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out5 = in2 ^ in6; - out7 = in1 ^ in4; - out3 = in0 ^ in1 ^ in6; - out4 = in1 ^ in5 ^ in7; - out0 = in0 ^ in2 ^ in5; - out6 = in0 ^ in3 ^ in7; - out1 = out3 ^ in3; - out2 = out4 ^ in0 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_50(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = 
out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in2 ^ in7; - tmp0 = in3 ^ in5; - out0 = out2 ^ in4 ^ in6; - out1 = tmp0 ^ in7; - tmp1 = tmp0 ^ in6; - out3 = out0 ^ in3; - out7 = tmp1 ^ in1; - tmp2 = tmp1 ^ in0; - out5 = out3 ^ in1 ^ in2; - out4 = tmp2 ^ in2; - out6 = tmp2 ^ out3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_51(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in7; - out3 = in2 ^ in4 ^ in6 ^ in7; - out0 = out3 ^ in0; - out6 = out0 ^ in5; - out4 = out6 ^ in3 ^ in7; - out1 = out0 ^ out4 ^ in1; - out7 = out1 ^ in6; - out5 = out7 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 
7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_52(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in1 ^ in2; - tmp0 = in2 ^ in4; - tmp1 = in3 ^ in5; - tmp2 = in3 ^ in6; - tmp3 = in0 ^ in7; - out0 = tmp0 ^ in6; - out6 = tmp0 ^ tmp3; - out7 = tmp1 ^ in1; - out1 = tmp1 ^ tmp3; - out3 = tmp2 ^ in4; - out5 = tmp2 ^ in1 ^ in7; - out4 = tmp2 ^ out1 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_53(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in1; - out3 = in4 ^ in6; - out0 = out3 ^ in0 ^ in2; - out6 = out0 ^ in7; - out4 = out6 ^ in5; - out7 = 
out0 ^ out4 ^ in1 ^ in3; - out1 = out7 ^ in0; - out5 = out7 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_54(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in3 ^ in5; - tmp0 = in1 ^ in3; - tmp1 = in2 ^ in4; - tmp2 = in0 ^ in7; - out5 = in1 ^ in4 ^ in6; - out4 = tmp2 ^ out1; - out7 = tmp0 ^ in6; - out3 = tmp0 ^ tmp1; - out0 = tmp1 ^ in7; - tmp3 = tmp2 ^ in2; - out2 = tmp3 ^ in6; - out6 = tmp3 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_55(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - 
uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in3; - tmp1 = in1 ^ in4; - tmp2 = in6 ^ in7; - out7 = tmp0 ^ tmp2; - out1 = tmp0 ^ in5; - out3 = tmp1 ^ in2; - out5 = tmp1 ^ in5 ^ in6; - out2 = tmp2 ^ in0; - out4 = out5 ^ out7 ^ in0; - out6 = out2 ^ in2 ^ in5; - out0 = out5 ^ out6 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_56(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in2 ^ in4; - tmp0 = in0 ^ in2; - out4 = in0 ^ in5; - out7 = in1 ^ in3; - out5 = in1 ^ in6; - out6 = tmp0 ^ in7; - out2 = tmp0 ^ out5; - out1 = out4 ^ in3; - out3 = out7 ^ in4 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ 
in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_57(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in5; - tmp1 = in1 ^ in7; - out0 = in0 ^ in2 ^ in4; - out5 = in1 ^ in5 ^ in6; - out4 = tmp0 ^ in4; - out1 = tmp0 ^ in1 ^ in3; - out2 = tmp0 ^ out5; - out3 = tmp1 ^ in4; - out7 = tmp1 ^ in3; - out6 = tmp1 ^ out2 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_58(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in2 ^ in5; - tmp0 = in2 ^ in3 ^ in4; 
- out5 = tmp0 ^ in1; - out6 = tmp0 ^ in0 ^ in5; - out3 = out6 ^ in7; - tmp1 = out2 ^ out5; - out7 = tmp1 ^ in6; - out4 = tmp1 ^ out3 ^ in3; - out0 = out4 ^ out7 ^ in0; - out1 = tmp0 ^ out0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_59(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in5; - tmp0 = in0 ^ in5 ^ in7; - out3 = tmp0 ^ in2 ^ in4; - out0 = out3 ^ in6; - tmp1 = out0 ^ in7; - out6 = tmp1 ^ in3; - out5 = out6 ^ in0 ^ in1 ^ in6; - out4 = tmp0 ^ out5; - out1 = tmp1 ^ out4; - out7 = out1 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_5A(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 
0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in2; - tmp1 = in2 ^ in5; - out5 = tmp0 ^ in3; - out4 = tmp0 ^ in0; - tmp2 = tmp1 ^ in4; - out2 = tmp1 ^ in1 ^ in7; - out7 = tmp2 ^ out5; - out6 = out4 ^ out7 ^ in5; - out0 = tmp2 ^ in6; - out1 = out0 ^ out6 ^ in7; - out3 = tmp1 ^ out6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_5B(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in3; - tmp1 = in0 ^ in4; - tmp2 = in1 ^ in5; - out5 = tmp0 ^ tmp2; - tmp3 = tmp1 ^ in6; - out3 = tmp1 ^ in5; - out2 = tmp2 ^ in7; - tmp4 = out3 ^ in2; - out7 = out2 ^ in3 ^ in4; - out0 = tmp4 ^ in6; - out6 = tmp0 ^ tmp3; - out4 = tmp2 ^ tmp4; - out1 = tmp3 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ 
in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_5C(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in6; - tmp1 = in0 ^ in2 ^ in5; - out1 = tmp0 ^ in5; - tmp2 = tmp0 ^ in1; - out2 = tmp1 ^ in6; - out6 = tmp1 ^ in3; - out4 = tmp2 ^ in0; - out7 = tmp2 ^ in4; - out3 = tmp1 ^ out7; - out0 = out3 ^ out4 ^ in7; - out5 = out0 ^ in1 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_5D(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; 
- uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in1; - tmp1 = in0 ^ in6; - out2 = tmp1 ^ in5; - tmp2 = out2 ^ in3; - out6 = tmp2 ^ in2; - out1 = tmp0 ^ tmp2; - tmp3 = out1 ^ in4 ^ in5; - out4 = tmp3 ^ in0; - out7 = tmp3 ^ in7; - tmp4 = out4 ^ out6; - out5 = tmp4 ^ in7; - out0 = tmp0 ^ out5; - out3 = tmp1 ^ tmp4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_5E(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in5; - tmp1 = in3 ^ in5; - tmp2 = in1 ^ in7; - out7 = in1 ^ in3 ^ in4; - out0 = tmp0 ^ in4; - tmp3 = tmp1 ^ in0; - out5 = tmp2 ^ in2; - out1 = tmp3 ^ in6; - out6 = tmp0 ^ tmp3; - tmp4 = tmp2 ^ out1; - out3 = tmp4 ^ in4; - out4 = tmp1 ^ tmp4; - out2 = tmp0 ^ out4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = 
out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_5F(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in5; - tmp1 = in0 ^ in6; - tmp2 = tmp0 ^ in7; - tmp3 = tmp1 ^ in3; - out2 = tmp1 ^ tmp2; - out5 = tmp2 ^ in2; - out6 = tmp3 ^ in2; - out3 = out2 ^ in4; - out4 = out3 ^ in5; - out1 = tmp0 ^ tmp3; - out7 = tmp3 ^ out4; - out0 = out4 ^ out5 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_60(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - 
- out4 = in2 ^ in5; - tmp0 = in3 ^ in6; - out1 = in3 ^ in4 ^ in7; - out7 = out4 ^ in1; - tmp1 = out4 ^ in4; - out0 = tmp0 ^ in2; - out5 = tmp0 ^ in0; - out2 = tmp0 ^ tmp1; - out3 = tmp1 ^ in7; - out6 = out3 ^ out7 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_61(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in5; - out4 = tmp0 ^ in4; - tmp1 = out4 ^ in3; - out3 = tmp1 ^ in7; - out2 = tmp1 ^ in2 ^ in6; - out1 = tmp0 ^ out3 ^ in1; - out0 = out2 ^ out4 ^ in0; - out7 = tmp1 ^ out1; - out6 = out0 ^ out1 ^ in2; - out5 = tmp0 ^ out0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_62(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * 
out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in4 ^ in5; - tmp0 = in0 ^ in3 ^ in4; - out1 = tmp0 ^ in7; - out5 = tmp0 ^ in6; - tmp1 = out1 ^ in0; - tmp2 = tmp1 ^ out3; - out4 = tmp2 ^ in2; - tmp3 = tmp2 ^ in1; - out0 = out4 ^ in5 ^ in6; - out7 = tmp3 ^ out0; - out6 = tmp0 ^ tmp3; - out2 = tmp1 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_63(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in4; - tmp1 = in1 ^ in7; - out3 = tmp0 ^ in5; - tmp2 = out3 ^ in6; - out4 = out3 ^ in2 ^ in7; - out5 = tmp2 ^ in0; - tmp3 = out5 ^ in3; - out0 = tmp3 ^ out4; - out2 = tmp1 ^ tmp2; - out6 = tmp1 ^ tmp3; - tmp4 = tmp0 ^ out2; - out1 = tmp4 ^ out5; - out7 = tmp4 ^ in2; - 
- out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_64(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in2 ^ in3; - out1 = in3 ^ in4; - out7 = in1 ^ in2; - tmp0 = in4 ^ in5; - tmp1 = in0 ^ in7; - out4 = in5 ^ in6 ^ in7; - out2 = tmp0 ^ out0 ^ in0; - out3 = tmp0 ^ out7 ^ in6; - out5 = tmp1 ^ in6; - out6 = tmp1 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_65(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 
= out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in3; - tmp1 = in4 ^ in5; - tmp2 = in6 ^ in7; - out7 = in1 ^ in2 ^ in7; - out1 = in1 ^ in3 ^ in4; - out0 = tmp0 ^ in2; - out2 = tmp0 ^ tmp1; - out4 = tmp1 ^ tmp2; - tmp3 = tmp2 ^ in0; - out3 = out4 ^ out7 ^ in3; - out5 = tmp3 ^ in5; - out6 = tmp3 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_66(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in2; - tmp1 = in2 ^ in3; - tmp2 = in0 ^ in4; - out7 = tmp0 ^ in6; - out0 = tmp1 ^ in7; - out1 = tmp2 ^ in3; - tmp3 = tmp2 ^ in6; - tmp4 = out1 ^ in5; - out5 = tmp3 ^ in7; - out4 = tmp3 ^ tmp4; - out2 = tmp0 ^ tmp4 ^ in7; - out6 = tmp1 ^ out2 ^ in4; - out3 = tmp3 ^ out6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - 
out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_67(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in3; - tmp1 = tmp0 ^ in1; - tmp2 = tmp0 ^ in7; - out1 = tmp1 ^ in4; - out0 = tmp2 ^ in2; - tmp3 = out1 ^ in7; - out2 = tmp3 ^ in5; - out3 = out2 ^ in0 ^ in6; - out7 = tmp1 ^ out0 ^ in6; - out5 = tmp1 ^ out3; - out4 = tmp2 ^ out5; - out6 = tmp3 ^ out4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_68(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 
6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in4; - tmp1 = in2 ^ in3 ^ in5; - tmp2 = tmp0 ^ in1; - tmp3 = tmp0 ^ in6; - out0 = tmp1 ^ in6; - out6 = tmp2 ^ in0; - out7 = tmp1 ^ tmp2; - out1 = tmp3 ^ in7; - out2 = out1 ^ in2; - out4 = tmp2 ^ out2; - out3 = out4 ^ out6 ^ in3; - out5 = tmp3 ^ out3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_69(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in6 ^ in7; - out2 = tmp0 ^ in3 ^ in4; - out1 = out2 ^ in1; - out3 = out2 ^ in0 ^ in2; - out4 = out1 ^ in2 ^ in3; - out6 = out1 ^ in0 ^ in7; - out7 = out4 ^ in5 ^ in6; - out5 = out4 ^ out6 ^ in5; - out0 = tmp0 ^ out5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_6A(uint8_t * out, uint8_t * in, unsigned int width) 
-{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in6; - out3 = in0 ^ in4 ^ in6; - tmp1 = tmp0 ^ in3; - out4 = tmp1 ^ in1; - tmp2 = tmp1 ^ in7; - out2 = out4 ^ in4; - out0 = tmp2 ^ in5; - out5 = tmp2 ^ out3; - out7 = out2 ^ in3 ^ in5; - out1 = tmp0 ^ out5; - out6 = tmp1 ^ out7 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_6B(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in4 ^ in6; - out2 = tmp0 ^ in1 ^ in3; - out4 = out2 ^ in2; - tmp1 = out2 ^ in0; - out7 = out4 ^ in3 ^ in5 ^ in7; - out1 = tmp1 ^ in7; - out3 = tmp1 ^ in1; - out6 = tmp1 ^ in5; - out0 = tmp1 ^ out7 ^ in6; - out5 = tmp0 ^ out0; - - out_ptr[0] = 
out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_6C(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in1; - tmp0 = in2 ^ in3; - out5 = in0 ^ in2; - out1 = in3 ^ in4 ^ in6; - tmp1 = out5 ^ in1; - out0 = tmp0 ^ in5; - out6 = tmp0 ^ tmp1; - out3 = tmp1 ^ in4; - out7 = out3 ^ in0; - out2 = out6 ^ out7 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_6D(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 
= out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in1 ^ in4; - tmp0 = in0 ^ in2; - tmp1 = out4 ^ in3; - out7 = out4 ^ in2 ^ in7; - out5 = tmp0 ^ in5; - out3 = tmp0 ^ tmp1; - out1 = tmp1 ^ in6; - out0 = out5 ^ in3; - out2 = out3 ^ out7 ^ in4; - out6 = out1 ^ in0 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_6E(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in3; - tmp1 = in0 ^ in4; - out4 = tmp0 ^ in7; - out6 = tmp0 ^ in0 ^ in5; - out5 = tmp1 ^ in2; - tmp2 = tmp1 ^ in3; - out3 = tmp2 ^ out4; - out1 = tmp2 ^ in6; - out2 = tmp0 ^ out5; - out0 = out2 ^ out3 ^ in5; - out7 = out1 ^ out2 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ 
in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_6F(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in7; - tmp1 = tmp0 ^ in4; - tmp2 = tmp0 ^ in0 ^ in2; - out4 = tmp1 ^ in1; - out0 = tmp2 ^ in5; - out3 = out4 ^ in0; - out2 = out3 ^ in7; - out1 = out2 ^ in6; - out6 = out1 ^ in4 ^ in5; - out7 = tmp2 ^ out1; - out5 = tmp1 ^ out0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_70(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in2; - tmp0 = in2 ^ in4; - out2 = in2 ^ in3 ^ in5; - tmp1 = tmp0 ^ in6; - tmp2 = out2 ^ in7; - out0 = tmp1 ^ 
in3; - out4 = tmp1 ^ in0; - out7 = tmp2 ^ in1; - out6 = out4 ^ in1; - out5 = out7 ^ in0 ^ in2; - out1 = tmp0 ^ tmp2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_71(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in3 ^ in5; - out3 = in2 ^ in3; - tmp0 = in0 ^ in2; - tmp1 = out2 ^ in1; - out4 = tmp0 ^ in6; - tmp2 = tmp0 ^ in1; - out7 = tmp1 ^ in2; - out1 = tmp1 ^ in4 ^ in7; - out0 = out4 ^ in3 ^ in4; - out6 = tmp2 ^ in4; - out5 = tmp2 ^ out3 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_72(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, 
out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in7; - tmp0 = in0 ^ in4; - tmp1 = tmp0 ^ in3 ^ in7; - out1 = tmp1 ^ in5; - out5 = out1 ^ in1; - tmp2 = tmp0 ^ out5; - out2 = tmp2 ^ in2; - out7 = out2 ^ in6; - out6 = tmp1 ^ out7; - out4 = tmp2 ^ out6; - out0 = out4 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_73(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in3 ^ in7; - out2 = out3 ^ in1 ^ in5; - out1 = out2 ^ in0 ^ in4; - out5 = out1 ^ in5; - out6 = out1 ^ out3 ^ in2; - out0 = out2 ^ out6 ^ in6; - out7 = out0 ^ out1 ^ in3; - out4 = out0 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 
^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_74(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in4; - tmp1 = in1 ^ in2 ^ in6; - out4 = in0 ^ in4 ^ in7; - out5 = in0 ^ in1 ^ in5; - out0 = tmp0 ^ in2; - out1 = tmp0 ^ in5; - out3 = tmp1 ^ in7; - out6 = tmp1 ^ in0; - out2 = tmp1 ^ out5 ^ in3; - out7 = out3 ^ in3 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_75(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in0 ^ in7; - tmp0 = in1 
^ in3; - out5 = in0 ^ in1; - out7 = tmp0 ^ in2; - tmp1 = tmp0 ^ in4; - out6 = out5 ^ in2; - tmp2 = out7 ^ in6; - out1 = tmp1 ^ in5; - out0 = tmp1 ^ out6; - out3 = tmp2 ^ in7; - out2 = tmp2 ^ out6 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_76(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in1 ^ in6; - tmp0 = in0 ^ in5; - tmp1 = in3 ^ in7; - tmp2 = tmp0 ^ in4; - tmp3 = tmp1 ^ in2; - out5 = tmp2 ^ in1; - out1 = tmp2 ^ in3; - out0 = tmp3 ^ in4; - out4 = out1 ^ in5; - out7 = tmp3 ^ out3; - out2 = tmp0 ^ out7; - out6 = tmp1 ^ out2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_77(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - 
uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in0 ^ in3; - tmp0 = in1 ^ in4; - tmp1 = in1 ^ in6; - tmp2 = out4 ^ in5; - out5 = tmp0 ^ in0; - out1 = tmp0 ^ tmp2; - out3 = tmp1 ^ in3; - out2 = tmp1 ^ tmp2 ^ in7; - out7 = out3 ^ in2; - tmp3 = out7 ^ in6; - out6 = tmp2 ^ tmp3; - out0 = tmp3 ^ out5 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_78(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in3; - tmp1 = in2 ^ in7; - tmp2 = in0 ^ in5 ^ in6; - out2 = tmp1 ^ in3; - out3 = tmp2 ^ in2; - out5 = out3 ^ in1 ^ in3; - out0 = tmp0 ^ out3 ^ in4; - out1 = tmp1 ^ out0; - out4 = out1 ^ out5 ^ in5; - out7 = tmp0 ^ out4; - out6 = tmp2 ^ out7; - - out_ptr[0] = out0 ^ 
in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_79(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in3 ^ in7; - tmp0 = in3 ^ in4; - tmp1 = in1 ^ in5; - tmp2 = tmp1 ^ in2; - out4 = tmp2 ^ in0 ^ in7; - tmp3 = out4 ^ in5; - out5 = tmp3 ^ out2 ^ in6; - out7 = tmp0 ^ tmp2; - out6 = tmp0 ^ tmp3; - out3 = tmp1 ^ out5; - out0 = out3 ^ in4; - out1 = tmp3 ^ out0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_7A(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = 
out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in2; - out2 = tmp0 ^ in3; - tmp1 = out2 ^ in4; - out4 = tmp1 ^ in0 ^ in5; - out5 = out4 ^ in6; - out6 = out5 ^ in7; - out7 = out6 ^ in0; - out0 = out7 ^ in1; - out1 = tmp0 ^ out6; - out3 = tmp1 ^ out6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_7B(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in1 ^ in3; - tmp0 = in0 ^ in5; - out4 = tmp0 ^ out2 ^ in2; - tmp1 = out4 ^ in4; - out6 = tmp1 ^ in7; - out5 = tmp1 ^ in5 ^ in6; - out0 = out6 ^ in1 ^ in6; - tmp2 = out0 ^ in2; - out1 = tmp2 ^ in1; - out3 = tmp2 ^ in4; - out7 = tmp0 ^ out5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 
^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_7C(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in5; - tmp1 = tmp0 ^ in4; - out0 = tmp1 ^ in2; - out1 = tmp1 ^ in6; - out7 = out0 ^ in1 ^ in5 ^ in7; - out5 = out1 ^ out7 ^ in0; - out3 = out5 ^ in6; - out6 = tmp0 ^ out5; - out2 = out6 ^ in1; - out4 = out2 ^ out7 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_7D(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in2; - tmp1 = tmp0 ^ in3; - tmp2 = tmp0 ^ in6; - out7 = tmp1 ^ 
in4; - tmp3 = tmp2 ^ in0; - out5 = tmp3 ^ in7; - out4 = tmp3 ^ in2 ^ in5; - out2 = tmp1 ^ out5; - out6 = tmp2 ^ out2; - out0 = out4 ^ out7 ^ in6; - out1 = tmp3 ^ out0; - out3 = out6 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_7E(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in4; - tmp1 = in0 ^ in5; - out1 = tmp0 ^ tmp1 ^ in6; - out3 = tmp1 ^ in1; - out4 = out1 ^ in1 ^ in7; - tmp2 = out4 ^ in3; - out5 = tmp2 ^ in2; - out6 = tmp0 ^ out5; - out7 = tmp1 ^ out4 ^ in2; - out2 = out6 ^ in5 ^ in7; - out0 = tmp2 ^ out2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_7F(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * 
out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in7; - tmp1 = tmp0 ^ in3 ^ in5; - tmp2 = tmp1 ^ in0; - out0 = tmp2 ^ in4; - out6 = tmp2 ^ in1; - out3 = tmp0 ^ out6; - tmp3 = out3 ^ in6; - out1 = tmp3 ^ in4; - out2 = tmp3 ^ in5; - out4 = tmp3 ^ in7; - out5 = tmp1 ^ out1; - out7 = out0 ^ out4 ^ in3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_80(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in3; - tmp1 = in4 ^ in5; - out1 = in2 ^ in6 ^ in7; - out5 = tmp0 ^ in4; - tmp2 = tmp0 ^ in1; - out6 = tmp1 ^ in3; - out7 = tmp1 ^ in0 ^ in6; - out4 = tmp2 ^ in7; - out3 = tmp2 ^ out6; - out2 = out3 ^ out5 ^ in6; - out0 = out2 ^ in3 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - 
out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_81(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in4 ^ in6; - tmp1 = tmp0 ^ in3; - out6 = tmp1 ^ in5; - out5 = out6 ^ in2 ^ in6; - out3 = out5 ^ in1; - out2 = tmp0 ^ out3; - out1 = out3 ^ out6 ^ in7; - out4 = tmp1 ^ out1; - out7 = out2 ^ out4 ^ in0; - out0 = out7 ^ in1 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_82(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = 
out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in1 ^ in2; - tmp0 = in6 ^ in7; - out5 = in2 ^ in3; - out6 = in3 ^ in4; - out7 = in0 ^ in4 ^ in5; - out0 = in1 ^ in5 ^ in6; - out1 = tmp0 ^ in0 ^ in2; - out2 = tmp0 ^ in3 ^ in5; - out3 = tmp0 ^ out0 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_83(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in1; - tmp1 = in2 ^ in5; - tmp2 = in3 ^ in6; - out4 = in1 ^ in2 ^ in4; - out0 = tmp0 ^ in5 ^ in6; - out5 = tmp1 ^ in3; - tmp3 = tmp1 ^ in7; - out6 = tmp2 ^ in4; - out2 = tmp2 ^ tmp3; - tmp4 = tmp3 ^ out4; - out1 = tmp3 ^ out0; - out3 = tmp4 ^ in3; - out7 = tmp0 ^ tmp4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - 
out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_84(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in2 ^ in6; - out6 = in3 ^ in5; - out0 = in1 ^ in5 ^ in7; - out7 = in0 ^ in4 ^ in6; - out4 = in1 ^ in3 ^ in6; - out5 = in2 ^ in4 ^ in7; - out2 = out6 ^ in0 ^ in1; - out3 = out5 ^ in5 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_85(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in6; - tmp1 = in3 ^ in6; - tmp2 = tmp0 ^ in4; - out1 = tmp0 ^ in2; - out6 = tmp1 ^ in5; - out4 = tmp2 ^ in3; - tmp3 = out1 ^ out6; - out2 = tmp3 ^ 
in0; - out3 = tmp2 ^ tmp3 ^ in7; - out7 = out2 ^ out3 ^ in1; - out5 = tmp1 ^ out3; - out0 = tmp2 ^ out7 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_86(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out6 = in3; - out7 = in0 ^ in4; - out0 = in1 ^ in5; - out5 = in2 ^ in7; - out3 = in4 ^ in5 ^ in6; - out1 = in0 ^ in2 ^ in6; - out4 = in1 ^ in6 ^ in7; - out2 = in0 ^ in3 ^ in5 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_87(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t 
in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out6 = in3 ^ in6; - tmp0 = in0 ^ in1; - out7 = in0 ^ in4 ^ in7; - out5 = in2 ^ in5 ^ in7; - out3 = out6 ^ in4 ^ in5; - out0 = tmp0 ^ in5; - tmp1 = tmp0 ^ in6; - out2 = out5 ^ in0 ^ in3; - out1 = tmp1 ^ in2; - out4 = tmp1 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_88(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in2 ^ in7; - tmp0 = in5 ^ in6; - out0 = in1 ^ in6 ^ in7; - out6 = in4 ^ in5 ^ in7; - out3 = out0 ^ out1 ^ in0 ^ in4; - out7 = tmp0 ^ in0; - tmp1 = tmp0 ^ in3; - out2 = out0 ^ in3; - out4 = tmp1 ^ in2; - out5 = tmp1 ^ out6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ 
in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_89(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in7; - tmp1 = in2 ^ in7; - tmp2 = tmp0 ^ in6; - out1 = tmp1 ^ in1; - out7 = tmp2 ^ in5; - out0 = tmp2 ^ in1; - out2 = out1 ^ in3 ^ in6; - out6 = out7 ^ in0 ^ in4; - out5 = out6 ^ in3; - out3 = tmp0 ^ out2 ^ in4; - out4 = tmp1 ^ out5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_8A(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in1 ^ in6; - out7 = in0 ^ in5; - out2 = in3 ^ in6; - out6 = in4 ^ in7; - out1 = in0 ^ 
in2 ^ in7; - out3 = out0 ^ out6 ^ in0; - out4 = out1 ^ out7 ^ in6; - out5 = out2 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_8B(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in1; - tmp1 = in3 ^ in6; - tmp2 = in5 ^ in7; - tmp3 = tmp0 ^ in7; - out0 = tmp0 ^ in6; - out2 = tmp1 ^ in2; - out5 = tmp1 ^ tmp2; - out7 = tmp2 ^ in0; - tmp4 = tmp3 ^ in4; - out1 = tmp3 ^ in2; - out6 = tmp4 ^ out0; - out4 = out6 ^ in2 ^ in5; - out3 = tmp1 ^ tmp4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_8C(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t 
out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in2; - out0 = in1 ^ in7; - out7 = in0 ^ in6; - out5 = in4 ^ in6; - out6 = in5 ^ in7; - out2 = out0 ^ in0 ^ in3; - out3 = out5 ^ out7 ^ in2 ^ in7; - out4 = out6 ^ in3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_8D(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in1 ^ in2; - tmp0 = in6 ^ in7; - out0 = in0 ^ in1 ^ in7; - out5 = in4 ^ in5 ^ in6; - out6 = tmp0 ^ in5; - out7 = tmp0 ^ in0; - out4 = tmp0 ^ out5 ^ in3; - out2 = out0 ^ in2 ^ in3; - out3 = out2 ^ in1 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 
6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_8E(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in1; - out4 = in5; - out7 = in0; - out5 = in6; - out6 = in7; - out3 = in0 ^ in4; - out1 = in0 ^ in2; - out2 = in0 ^ in3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_8F(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in0 ^ in1; - tmp0 = in0 ^ in3; - out4 = in4 ^ in5; - out7 = in0 ^ in7; - out5 = in5 ^ in6; - out6 = in6 ^ in7; - out1 = out0 ^ in2; - out2 = tmp0 ^ in2; - out3 = tmp0 ^ in4; - - out_ptr[0] = out0 ^ 
in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_90(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in2; - tmp1 = in2 ^ in6 ^ in7; - out3 = tmp0 ^ in7; - out1 = tmp1 ^ in5; - tmp2 = out1 ^ in4; - out6 = tmp2 ^ in3; - out5 = out6 ^ in1; - out4 = out5 ^ in0; - out0 = tmp0 ^ tmp2; - out7 = tmp0 ^ out4; - out2 = tmp1 ^ out5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_91(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = 
out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in4; - tmp1 = tmp0 ^ in3 ^ in5; - out2 = tmp1 ^ in1; - out6 = tmp1 ^ in7; - tmp2 = out2 ^ in5 ^ in7; - out3 = tmp2 ^ in4; - out5 = tmp2 ^ in6; - out1 = tmp1 ^ out5 ^ in2; - tmp3 = out1 ^ in0; - out4 = tmp3 ^ in3; - out0 = tmp0 ^ tmp3; - out7 = tmp2 ^ tmp3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_92(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in1; - tmp0 = in4 ^ in5; - tmp1 = tmp0 ^ in1; - out2 = tmp0 ^ in3 ^ in7; - out0 = tmp1 ^ in6; - out7 = out2 ^ in0; - out4 = out0 ^ in0 ^ in2; - out5 = out4 ^ out7 ^ in5; - out6 = tmp1 ^ out5; - out1 = out6 ^ out7 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ 
in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_93(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in1 ^ in3; - tmp0 = in2 ^ in7; - tmp1 = out3 ^ in6; - tmp2 = tmp0 ^ in4; - out5 = tmp0 ^ tmp1; - out6 = tmp2 ^ in3; - out2 = out6 ^ in5; - out0 = out2 ^ out5 ^ in0; - out7 = tmp1 ^ out0; - out1 = tmp2 ^ out0; - out4 = out1 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_94(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in2 ^ in6; - tmp0 = in1 ^ in4 ^ in5; - out1 = out3 ^ in5; - out5 = tmp0 ^ out3; 
- out0 = tmp0 ^ in7; - out4 = tmp0 ^ in0 ^ in3; - out6 = out1 ^ in3 ^ in7; - out2 = out4 ^ in6; - out7 = out0 ^ out2 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_95(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in3; - out3 = tmp0 ^ in6; - tmp1 = tmp0 ^ in7; - tmp2 = out3 ^ in0; - out6 = tmp1 ^ in5; - tmp3 = tmp2 ^ in4; - out7 = tmp3 ^ in2; - tmp4 = tmp3 ^ in5; - out2 = tmp4 ^ in1; - tmp5 = out2 ^ in6; - out0 = tmp1 ^ tmp5; - out1 = tmp5 ^ out7; - out4 = tmp2 ^ out1; - out5 = tmp4 ^ out4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_96(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = 
(uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in6 ^ in7; - tmp0 = in1 ^ in5; - tmp1 = in5 ^ in6; - out6 = out3 ^ in2 ^ in3; - out0 = tmp0 ^ in4; - tmp2 = tmp1 ^ in2; - out4 = out0 ^ in0 ^ in7; - out1 = tmp2 ^ in0; - out5 = tmp2 ^ in1; - out7 = tmp0 ^ out4 ^ in3; - out2 = tmp1 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_97(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in4; - tmp1 = in2 ^ in6; - out3 = in3 ^ in6 ^ in7; - out7 = tmp0 ^ in3; - tmp2 = tmp0 ^ in5; - out5 = tmp1 ^ in1; - out6 = tmp1 ^ out3; - out0 = tmp2 ^ in1; - out2 = tmp2 ^ out3 ^ in2; - tmp3 = out0 ^ in4; - out4 = tmp3 ^ in7; - out1 = tmp1 ^ tmp3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 
^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_98(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in5 ^ in7; - tmp1 = in1 ^ in4 ^ in7; - out1 = tmp0 ^ in2; - out0 = tmp1 ^ in6; - out2 = tmp1 ^ in3; - out6 = out0 ^ out1 ^ in1; - out5 = tmp0 ^ out2; - out3 = tmp1 ^ out6 ^ in0; - out7 = out0 ^ out5 ^ in0; - out4 = out6 ^ out7 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_99(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = 
out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in3; - out5 = in1 ^ in3 ^ in4; - out6 = in2 ^ in4 ^ in5; - out4 = tmp0 ^ in2; - tmp1 = tmp0 ^ in6; - tmp2 = out5 ^ in7; - out7 = tmp1 ^ in5; - out0 = tmp1 ^ tmp2; - out2 = tmp2 ^ in2; - out3 = out0 ^ out6 ^ in3; - out1 = tmp1 ^ out3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_9A(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in3 ^ in4; - tmp0 = in0 ^ in5; - tmp1 = in1 ^ in6; - out5 = in1 ^ in3 ^ in5; - tmp2 = tmp0 ^ in7; - out3 = tmp0 ^ tmp1; - out0 = tmp1 ^ in4; - out7 = tmp2 ^ in3; - out1 = tmp2 ^ in2; - out6 = out0 ^ in1 ^ in2; - out4 = out1 ^ in4 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] 
= out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_9B(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out5 = in1 ^ in3; - tmp0 = in3 ^ in5; - out6 = in2 ^ in4; - out4 = in0 ^ in2 ^ in7; - out7 = tmp0 ^ in0; - out2 = out6 ^ in3; - out1 = out4 ^ in1 ^ in5; - out3 = out7 ^ in1 ^ in6; - out0 = tmp0 ^ out3 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_9C(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out1 = in2 ^ in5; - tmp0 = in0 ^ in3 ^ in6; - out3 = out1 ^ in0; - out6 = out1 ^ in6; - out7 = tmp0 ^ in7; - out4 = out7 ^ in4; - out2 = out4 ^ in1; - out0 = tmp0 ^ 
out2; - out5 = out0 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_9D(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out6 = in2 ^ in5; - tmp0 = in0 ^ in3; - out5 = in1 ^ in4 ^ in7; - out1 = out6 ^ in1; - out3 = tmp0 ^ out6; - out7 = tmp0 ^ in6; - out0 = out5 ^ in0; - out4 = out7 ^ in7; - out2 = out5 ^ out7 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_9E(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 
2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in1 ^ in4; - tmp0 = in0 ^ in5; - out6 = in2 ^ in6; - out7 = in0 ^ in3 ^ in7; - out4 = in0 ^ in4 ^ in6; - out5 = in1 ^ in5 ^ in7; - out1 = tmp0 ^ in2; - out3 = tmp0 ^ in7; - out2 = out4 ^ in3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_9F(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out6 = in2; - out7 = in0 ^ in3; - tmp0 = in0 ^ in1; - out4 = in0 ^ in6; - out5 = in1 ^ in7; - out1 = tmp0 ^ in2 ^ in5; - out2 = out7 ^ in2 ^ in4 ^ in6; - out3 = out7 ^ in5 ^ in7; - out0 = tmp0 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void 
gf8_muladd_A0(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in6; - out2 = tmp0 ^ in7; - tmp1 = tmp0 ^ in5; - out6 = out2 ^ in3 ^ in4; - out0 = tmp1 ^ in3; - tmp2 = out0 ^ in2; - out3 = tmp2 ^ in7; - tmp3 = tmp2 ^ in1; - out5 = tmp3 ^ in0; - out4 = tmp3 ^ out6; - out7 = out5 ^ out6 ^ in1; - out1 = tmp1 ^ out4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_A1(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in5; - tmp1 = tmp0 ^ in1; - tmp2 = tmp0 ^ in4; - out4 = tmp1 ^ in7; - out7 = tmp2 ^ in0; - out6 = tmp2 ^ out4 ^ in3; - out3 = out4 ^ in6; - out2 
= out3 ^ in5; - out1 = out2 ^ in4; - out5 = out1 ^ out6 ^ in0; - out0 = tmp1 ^ out5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_A2(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in6; - tmp0 = in1 ^ in3 ^ in5; - out3 = tmp0 ^ in6; - out4 = tmp0 ^ in2 ^ in4; - out0 = out3 ^ in7; - out6 = out0 ^ in4; - out1 = out0 ^ out4 ^ in0; - out7 = out1 ^ in5; - out5 = out7 ^ in3 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_A3(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = 
out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in2 ^ in6; - out3 = in1 ^ in5 ^ in6; - tmp0 = out2 ^ in0; - out4 = out2 ^ out3 ^ in3; - tmp1 = tmp0 ^ in4; - out0 = tmp0 ^ out4 ^ in7; - out5 = tmp1 ^ in3; - out7 = tmp1 ^ in5; - out1 = tmp1 ^ in1 ^ in7; - out6 = tmp1 ^ out0 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_A4(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in3; - tmp1 = in2 ^ in4; - tmp2 = in2 ^ in5; - tmp3 = in0 ^ in7; - out0 = tmp0 ^ in5; - out6 = tmp0 ^ in6 ^ in7; - out1 = tmp1 ^ in6; - out7 = tmp1 ^ tmp3; - out3 = tmp2 ^ in3; - tmp4 = tmp2 ^ out1; - out2 = tmp3 ^ in1; - out5 = tmp4 ^ out7; - out4 = tmp4 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ 
in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_A5(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in2 ^ in5; - tmp0 = in1 ^ in6; - tmp1 = in0 ^ in1; - tmp2 = in2 ^ in4; - out6 = in1 ^ in3 ^ in7; - out4 = tmp0 ^ in5; - out1 = tmp0 ^ tmp2; - out0 = tmp1 ^ in3 ^ in5; - out2 = tmp1 ^ in2 ^ in7; - out7 = tmp2 ^ in0; - out5 = tmp0 ^ out2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_A6(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - 
- out2 = in0; - out3 = in3 ^ in5 ^ in7; - out1 = in0 ^ in2 ^ in4 ^ in6; - out0 = out3 ^ in1; - out7 = out1 ^ in7; - out6 = out0 ^ in6; - out5 = out7 ^ in5; - out4 = out6 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_A7(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in0 ^ in2; - out3 = in5 ^ in7; - out7 = out2 ^ in4 ^ in6; - out6 = out3 ^ in1 ^ in3; - out1 = out7 ^ in1; - out5 = out7 ^ in7; - out0 = out6 ^ in0; - out4 = out6 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_A8(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - 
uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in4; - tmp1 = in1 ^ in6; - tmp2 = in0 ^ in2 ^ in7; - out1 = tmp0 ^ in7; - out4 = tmp0 ^ in6; - out0 = tmp1 ^ in3; - out2 = tmp1 ^ in5; - out6 = tmp1 ^ in4; - out7 = tmp2 ^ in5; - out3 = tmp2 ^ out0 ^ in6; - out5 = out7 ^ in2 ^ in3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_A9(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in2 ^ in6; - out6 = in1 ^ in4; - out7 = in0 ^ in2 ^ in5; - out5 = in0 ^ in3 ^ in7; - out2 = out4 ^ in1 ^ in5; - out1 = out6 ^ in2 ^ in7; - out0 = out2 ^ out7 ^ in3; - out3 = out1 ^ in0 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; 
- out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_AA(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in2; - tmp1 = in1 ^ in3; - tmp2 = in6 ^ in7; - out1 = tmp0 ^ in4 ^ in7; - out3 = tmp1 ^ in0; - out0 = tmp1 ^ tmp2; - out2 = tmp2 ^ in5; - out7 = tmp0 ^ out2; - out6 = out1 ^ out7 ^ in1; - out5 = out0 ^ out6 ^ in0; - out4 = out5 ^ out7 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_AB(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in0 ^ in1; - tmp0 = in1 ^ 
in4; - tmp1 = in0 ^ in7; - out6 = tmp0 ^ in5; - out1 = tmp0 ^ tmp1 ^ in2; - out5 = tmp1 ^ in3 ^ in4; - out0 = tmp0 ^ out5 ^ in6; - out4 = out0 ^ out3 ^ in2; - out2 = out4 ^ in3 ^ in5; - out7 = tmp1 ^ out2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_AC(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in1 ^ in3; - out1 = in2 ^ in4; - tmp0 = in0 ^ in2; - out4 = in4 ^ in7; - out5 = in0 ^ in5; - out6 = in1 ^ in6; - out7 = tmp0 ^ in7; - out3 = tmp0 ^ in3 ^ in6; - out2 = out5 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_AD(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - 
uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in7; - out5 = in0; - out6 = in1; - out7 = in0 ^ in2; - out0 = in0 ^ in1 ^ in3; - out2 = out7 ^ in1 ^ in5; - out1 = in1 ^ in2 ^ in4; - out3 = out7 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_AE(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in3 ^ in4; - tmp0 = in0 ^ in4; - tmp1 = in0 ^ in7; - out0 = in1 ^ in3 ^ in7; - out1 = tmp0 ^ in2; - out5 = tmp0 ^ in5; - tmp2 = tmp1 ^ in6; - out2 = tmp1 ^ in5; - out3 = tmp2 ^ in3; - out7 = tmp2 ^ in2; - out6 = tmp2 ^ out2 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ 
in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_AF(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in3; - tmp0 = in0 ^ in7; - out5 = in0 ^ in4; - out6 = in1 ^ in5; - out7 = in0 ^ in2 ^ in6; - out0 = tmp0 ^ in1 ^ in3; - out3 = tmp0 ^ in6; - out2 = tmp0 ^ in2 ^ in5; - out1 = out5 ^ in1 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_B0(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in4; - tmp1 = in3 ^ in6; - out2 = tmp0 ^ in7; - tmp2 = 
tmp0 ^ tmp1; - out0 = tmp2 ^ in5; - out3 = tmp2 ^ in2; - out6 = out3 ^ in6; - tmp3 = out6 ^ in0 ^ in1; - out7 = tmp3 ^ in5; - out5 = tmp3 ^ out2; - out1 = out0 ^ out5 ^ in0; - out4 = tmp1 ^ out5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_B1(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in4; - out2 = tmp0 ^ in2 ^ in7; - tmp1 = out2 ^ in6; - out1 = tmp1 ^ in5; - out3 = tmp1 ^ in7; - out4 = tmp1 ^ in0; - out6 = out3 ^ in3; - out0 = out6 ^ in0 ^ in2 ^ in5; - out5 = tmp1 ^ out0 ^ in1; - out7 = tmp0 ^ out5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_B2(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; 
- - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in4; - tmp0 = in4 ^ in7; - tmp1 = in1 ^ in3 ^ in6; - out3 = tmp0 ^ tmp1; - tmp2 = tmp1 ^ in0; - out0 = out3 ^ in5; - out4 = tmp2 ^ in2; - tmp3 = out4 ^ in6; - out5 = tmp0 ^ tmp3; - out1 = tmp3 ^ out0; - tmp4 = out1 ^ in7; - out7 = tmp4 ^ in3; - out6 = tmp2 ^ tmp4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_B3(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in2 ^ in4; - tmp0 = in0 ^ in5; - tmp1 = in1 ^ in6; - out3 = tmp1 ^ in4 ^ in7; - tmp2 = tmp0 ^ out3; - out0 = tmp2 ^ in3; - out1 = tmp2 ^ in2; - out5 = out0 ^ in2 ^ in6; - out7 = tmp1 ^ out5; - out4 = out7 ^ in1 ^ in5 ^ in7; - out6 = tmp0 ^ out4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = 
out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_B4(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in0 ^ in1; - out5 = out4 ^ in2; - tmp0 = out4 ^ in4; - out6 = out5 ^ in0 ^ in3; - out7 = tmp0 ^ out6; - out2 = tmp0 ^ in6 ^ in7; - out3 = out7 ^ in0 ^ in7; - out0 = out5 ^ out7 ^ in5; - out1 = out0 ^ out6 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_B5(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - 
uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in1; - tmp1 = in2 ^ in4; - out4 = tmp0 ^ in4; - out3 = tmp1 ^ in7; - tmp2 = out4 ^ in5; - out7 = out3 ^ in0 ^ in3; - out0 = tmp2 ^ in3; - out2 = tmp0 ^ out3 ^ in6; - out5 = tmp1 ^ tmp2; - out6 = out2 ^ out7 ^ in2; - out1 = tmp0 ^ out0 ^ out6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_B6(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in3 ^ in4; - tmp0 = in1 ^ in2; - tmp1 = in0 ^ in4; - tmp2 = in3 ^ in5; - tmp3 = out3 ^ in1 ^ in7; - out5 = tmp0 ^ tmp1; - out6 = tmp0 ^ tmp2; - out2 = tmp1 ^ in6; - out4 = tmp1 ^ tmp3; - out0 = tmp3 ^ in5; - out1 = out2 ^ in2 ^ in5; - out7 = tmp2 ^ out1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - 
out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_B7(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in4; - tmp0 = in0 ^ in4; - out2 = tmp0 ^ in2 ^ in6; - tmp1 = out2 ^ in7; - out1 = out2 ^ in1 ^ in5; - out7 = tmp1 ^ in3; - out5 = out1 ^ in6; - out6 = tmp0 ^ out1 ^ in3; - out0 = tmp1 ^ out6; - out4 = out0 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_B8(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in4; - tmp1 = in2 ^ in5; - out2 = tmp0 ^ in5; - out4 = tmp1 ^ in0; - tmp2 = tmp1 ^ in7; - out6 = tmp2 ^ 
out2; - out7 = out4 ^ in3; - out1 = tmp2 ^ in4; - out3 = tmp0 ^ out7; - out0 = out3 ^ out4 ^ in6; - out5 = out0 ^ in0 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_B9(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in2; - tmp1 = in4 ^ in5; - out4 = tmp0 ^ tmp1; - tmp2 = tmp0 ^ in3 ^ in7; - out3 = out4 ^ in1; - out7 = tmp2 ^ in5; - out2 = out3 ^ in0; - out1 = out2 ^ in7; - out6 = out1 ^ in5 ^ in6; - out0 = tmp2 ^ out6; - out5 = tmp1 ^ out0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_BA(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, 
out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in5 ^ in7; - out2 = tmp0 ^ in4; - tmp1 = out2 ^ in2; - out1 = tmp1 ^ in0; - out6 = tmp1 ^ in1; - out4 = out1 ^ in3 ^ in4; - tmp2 = out4 ^ out6; - out7 = out4 ^ in6 ^ in7; - out5 = tmp2 ^ in6; - out3 = tmp0 ^ tmp2; - out0 = out6 ^ out7 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_BB(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in2 ^ in4 ^ in5 ^ in7; - tmp0 = out2 ^ in1; - out4 = out2 ^ in0 ^ in3; - out1 = tmp0 ^ in0; - out6 = tmp0 ^ in6; - out3 = out1 ^ in2; - tmp1 = out4 ^ out6 ^ in4; - out0 = tmp1 ^ in7; - out5 = tmp1 ^ in5; - out7 = tmp0 ^ tmp1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width 
* 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_BC(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in2; - tmp1 = in2 ^ in4; - out0 = in1 ^ in3 ^ in4; - out6 = in1 ^ in2 ^ in7; - out7 = tmp0 ^ in3; - out5 = tmp0 ^ out6 ^ in6; - out1 = tmp1 ^ in5; - tmp2 = out1 ^ out5 ^ in1; - out3 = tmp2 ^ in3; - out4 = tmp1 ^ tmp2; - out2 = tmp2 ^ out6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_BD(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - 
uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in3; - tmp1 = in1 ^ in4; - out0 = tmp0 ^ tmp1; - out7 = tmp0 ^ in2 ^ in7; - out1 = tmp1 ^ in2 ^ in5; - tmp2 = out1 ^ in0; - out2 = tmp2 ^ in6; - out3 = out2 ^ in1 ^ in7; - out4 = out3 ^ in2; - out5 = tmp1 ^ out4; - out6 = tmp2 ^ out4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_BE(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in3 ^ in6; - out4 = tmp0 ^ in5; - out7 = tmp0 ^ in2; - out3 = out4 ^ in4; - out1 = out3 ^ out7 ^ in0; - out2 = out3 ^ in3 ^ in7; - out0 = out2 ^ out4 ^ in1; - out5 = tmp0 ^ out0; - out6 = out1 ^ out5 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_BF(uint8_t * out, uint8_t * in, unsigned 
int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in4; - out3 = tmp0 ^ in5 ^ in6; - out4 = out3 ^ in3; - tmp1 = out3 ^ in7; - out2 = tmp1 ^ in2; - out5 = tmp1 ^ in1; - tmp2 = out2 ^ in5; - out7 = tmp2 ^ in3 ^ in4; - tmp3 = tmp0 ^ out5; - out0 = tmp3 ^ out4; - out1 = tmp2 ^ tmp3; - out6 = tmp3 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_C0(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out5 = in2 ^ in5; - tmp0 = in1 ^ in4; - tmp1 = in3 ^ in6; - out0 = out5 ^ in1; - out4 = tmp0 ^ in7; - out3 = tmp0 ^ tmp1; - out1 = tmp1 ^ in2; - out6 = tmp1 ^ in0; - out7 = out4 ^ in0; - out2 = out4 ^ out5 ^ in3; - - 
out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_C1(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out5 = in2; - tmp0 = in0 ^ in1; - out4 = in1 ^ in7; - out6 = in0 ^ in3; - out3 = in1 ^ in4 ^ in6; - tmp1 = tmp0 ^ in2; - out7 = tmp0 ^ in4; - out0 = tmp1 ^ in5; - out1 = tmp1 ^ out6 ^ in6; - out2 = out6 ^ out7 ^ in5 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_C2(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = 
out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in1 ^ in3 ^ in4; - tmp0 = in0 ^ in3 ^ in6; - out5 = in2 ^ in4 ^ in5; - tmp1 = out4 ^ in7; - out1 = tmp0 ^ in2; - out6 = tmp0 ^ in5; - out2 = out5 ^ in3; - out7 = tmp0 ^ tmp1; - out3 = tmp1 ^ in2 ^ in6; - out0 = tmp1 ^ out2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_C3(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in1 ^ in3; - tmp0 = in0 ^ in2; - tmp1 = in3 ^ in5; - out5 = in2 ^ in4; - tmp2 = tmp0 ^ out4; - out2 = tmp1 ^ in4; - out6 = tmp1 ^ in0; - out0 = tmp1 ^ tmp2 ^ in7; - out1 = tmp2 ^ in6; - out7 = out1 ^ out5 ^ in3; - out3 = tmp0 ^ out7 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 
6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_C4(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in7; - out3 = tmp0 ^ in4; - tmp1 = tmp0 ^ in2; - out1 = tmp1 ^ in6; - out5 = tmp1 ^ in5; - out4 = out1 ^ out3 ^ in1; - out0 = out4 ^ in4 ^ in5; - out2 = out0 ^ out3 ^ in0; - out7 = out1 ^ out2 ^ in7; - out6 = tmp1 ^ out0 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_C5(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in4 ^ in7; - tmp0 = in3 ^ in7; - out4 = in1 ^ in2 ^ in6; - out6 = in0 ^ in3 ^ in4; - out5 = 
tmp0 ^ in2; - out1 = tmp0 ^ out4; - out0 = out4 ^ in0 ^ in5; - out2 = out0 ^ out5 ^ in4; - out7 = tmp0 ^ out2 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_C6(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in5 ^ in6; - tmp1 = in1 ^ in7; - tmp2 = tmp0 ^ in0; - tmp3 = tmp0 ^ tmp1; - tmp4 = tmp2 ^ in4; - out0 = tmp3 ^ in2; - out6 = tmp4 ^ in3; - out2 = out6 ^ in2; - out7 = tmp1 ^ tmp4; - out3 = tmp2 ^ out2; - tmp5 = out3 ^ in5; - out5 = tmp5 ^ in7; - out4 = tmp3 ^ tmp5; - out1 = tmp4 ^ out5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_C7(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t 
*)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in2 ^ in4; - tmp0 = in3 ^ in5; - tmp1 = out3 ^ in7; - out6 = tmp0 ^ in0 ^ in4; - out5 = tmp1 ^ in3; - out2 = out6 ^ in6; - out7 = out2 ^ in1 ^ in3; - out0 = tmp1 ^ out7; - out1 = tmp0 ^ out0; - out4 = out1 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_C8(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out0 = in1 ^ in2; - out1 = in2 ^ in3; - tmp0 = in5 ^ in6; - tmp1 = in0 ^ in7; - out2 = out1 ^ in1 ^ in4; - out4 = tmp0 ^ in4; - out5 = tmp0 ^ in7; - out6 = tmp1 ^ in6; - out7 = tmp1 ^ in1; - out3 = out2 ^ in0 ^ in2 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 
^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_C9(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in5 ^ in6; - out7 = in0 ^ in1; - tmp0 = in1 ^ in3; - out5 = in6 ^ in7; - out6 = in0 ^ in7; - out0 = out7 ^ in2; - out3 = out7 ^ in4 ^ in5; - out1 = tmp0 ^ in2; - out2 = tmp0 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_CA(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = 
out_ptr[width * 7]; - - tmp0 = in0 ^ in7; - tmp1 = in2 ^ in7; - tmp2 = tmp0 ^ in6; - out0 = tmp1 ^ in1; - tmp3 = tmp1 ^ in3; - out6 = tmp2 ^ in5; - out7 = tmp2 ^ in1; - out2 = tmp3 ^ in4; - out5 = out6 ^ in0 ^ in4; - out4 = out5 ^ in3; - out1 = tmp0 ^ tmp3; - out3 = tmp3 ^ out5 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_CB(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in4 ^ in7; - tmp1 = in5 ^ in7; - out7 = in0 ^ in1 ^ in6; - out5 = tmp0 ^ in6; - out2 = tmp0 ^ in3; - out6 = tmp1 ^ in0; - out4 = tmp1 ^ in3 ^ in6; - tmp2 = out5 ^ out7 ^ in2; - out1 = tmp2 ^ out2; - out0 = tmp2 ^ in4; - out3 = tmp2 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_CC(uint8_t * out, uint8_t * in, unsigned 
int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in5; - tmp1 = in1 ^ in6; - out1 = in2 ^ in3 ^ in7; - out5 = tmp0 ^ in6; - out0 = tmp1 ^ in2; - tmp2 = out5 ^ in0 ^ in7; - out3 = tmp2 ^ in4; - out6 = tmp0 ^ out3; - out7 = tmp1 ^ tmp2 ^ in3; - tmp3 = out1 ^ out6; - out4 = tmp2 ^ tmp3; - out2 = tmp3 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_CD(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out5 = in3 ^ in6; - tmp0 = in0 ^ in1; - tmp1 = in2 ^ in7; - out6 = in0 ^ in4 ^ in7; - out2 = tmp0 ^ out5 ^ in4; - out7 = tmp0 ^ in5; - out0 = tmp0 ^ in2 ^ in6; - out4 = tmp1 ^ in5; - out1 = tmp1 ^ in1 ^ in3; - 
out3 = out6 ^ in5 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_CE(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in5; - tmp1 = tmp0 ^ in3; - out4 = tmp1 ^ in4; - tmp2 = out4 ^ in6; - out3 = tmp2 ^ in0; - out5 = tmp2 ^ in2; - out2 = out3 ^ in5 ^ in7; - out6 = tmp1 ^ out2; - out7 = out2 ^ out4 ^ in1; - out1 = tmp2 ^ out6; - out0 = tmp0 ^ out7 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_CF(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = 
out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in6; - tmp1 = in0 ^ in1 ^ in5; - out4 = in2 ^ in3 ^ in5; - out5 = tmp0 ^ in4; - out7 = tmp1 ^ in6; - out1 = tmp1 ^ out4 ^ in7; - tmp2 = out5 ^ in0; - out2 = tmp2 ^ in7; - out3 = tmp2 ^ out4; - out6 = tmp0 ^ out2 ^ in5; - out0 = tmp0 ^ out1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_D0(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in3; - tmp1 = in1 ^ in4; - tmp2 = in2 ^ in5; - out7 = tmp0 ^ tmp1; - out0 = tmp1 ^ tmp2; - tmp3 = tmp2 ^ in3; - out1 = tmp3 ^ in6; - tmp4 = out1 ^ in1; - out2 = tmp4 ^ in7; - out3 = out2 ^ in2; - out4 = tmp0 ^ out3; - out5 = tmp3 ^ out3; - out6 = tmp4 ^ out4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] 
= out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_D1(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in5 ^ in6; - tmp1 = tmp0 ^ in1; - out1 = tmp1 ^ in2; - out2 = tmp1 ^ in7; - out3 = out2 ^ in3; - out5 = out3 ^ in2; - tmp2 = out3 ^ in0; - out4 = tmp2 ^ in4; - out7 = tmp0 ^ out4; - out6 = tmp2 ^ out1 ^ in6; - out0 = out2 ^ out6 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_D2(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - 
uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in5 ^ in6; - out2 = tmp0 ^ in2 ^ in3; - out1 = out2 ^ in0; - out3 = out2 ^ in1; - out4 = out1 ^ in1 ^ in2; - out6 = out1 ^ in6 ^ in7; - out7 = out4 ^ in4 ^ in5; - out5 = out4 ^ out6 ^ in4; - out0 = tmp0 ^ out5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_D3(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in3 ^ in5 ^ in6; - tmp0 = out2 ^ in2; - tmp1 = tmp0 ^ in1; - out1 = tmp1 ^ in0; - out3 = tmp1 ^ in3; - out4 = out1 ^ in2 ^ in4; - tmp2 = out4 ^ in5; - out7 = tmp2 ^ in7; - out0 = tmp0 ^ out7; - tmp3 = out0 ^ in0; - out5 = tmp3 ^ in6; - out6 = tmp2 ^ tmp3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_D4(uint8_t * out, uint8_t * in, unsigned int 
width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in3 ^ in5; - tmp0 = in1 ^ in5; - tmp1 = tmp0 ^ in2; - out4 = tmp1 ^ in0; - tmp2 = tmp1 ^ in6; - out2 = out4 ^ in3 ^ in7; - out0 = tmp2 ^ in4; - out5 = tmp2 ^ out3; - out1 = tmp0 ^ out5 ^ in7; - out6 = tmp0 ^ out2 ^ in4; - out7 = tmp1 ^ out6 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_D5(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in5; - tmp0 = in0 ^ in4; - tmp1 = tmp0 ^ in1 ^ in5; - out4 = tmp1 ^ in2; - out0 = out4 ^ in6; - tmp2 = tmp0 ^ out0; - out5 = tmp2 ^ in3; - out1 = out5 ^ in7; - out6 = tmp1 ^ out1; - out7 = tmp2 ^ out6; - out2 = out7 ^ in4; 
- - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_D6(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in2 ^ in4 ^ in6; - out5 = tmp0 ^ in3; - out0 = tmp0 ^ in5 ^ in7; - out3 = out0 ^ out5 ^ in2; - tmp1 = out3 ^ in0; - out1 = tmp1 ^ in6; - out2 = tmp1 ^ in7; - out4 = tmp1 ^ in1; - out6 = tmp1 ^ in4; - out7 = tmp0 ^ out2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_D7(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - 
uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in3; - out3 = in2 ^ in5 ^ in7; - out2 = tmp0 ^ in5; - tmp1 = tmp0 ^ out3 ^ in1; - out1 = tmp1 ^ in6; - out4 = tmp1 ^ in4; - tmp2 = out1 ^ in4; - out6 = tmp2 ^ in1; - out7 = tmp2 ^ in2; - out0 = tmp2 ^ in3; - out5 = tmp2 ^ in0 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_D8(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in0; - out5 = in1; - tmp0 = in1 ^ in2; - out6 = in0 ^ in2; - out0 = tmp0 ^ in4; - tmp1 = tmp0 ^ in3; - out7 = tmp1 ^ out6; - out2 = tmp1 ^ in6; - out3 = out7 ^ in7; - out1 = tmp1 ^ in1 ^ in5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] 
= out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_D9(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out4 = in0 ^ in4; - out5 = in1 ^ in5; - out2 = in1 ^ in3 ^ in6; - out3 = in0 ^ in1 ^ in7; - out6 = in0 ^ in2 ^ in6; - out0 = out4 ^ in1 ^ in2; - out1 = out5 ^ in2 ^ in3; - out7 = out3 ^ in3; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_DA(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out5 = in1 ^ in4; - tmp0 = in2 ^ in7; - tmp1 = in0 ^ in2 ^ in3; - out0 = tmp0 ^ out5; - out4 = tmp0 ^ tmp1; - out2 = tmp0 ^ in3 ^ in6; - out1 = tmp1 ^ in5; - out3 = tmp1 ^ in1; - out6 = out1 ^ 
in3; - out7 = out3 ^ in2 ^ in6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_DB(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in1; - tmp1 = in1 ^ in5; - tmp2 = in3 ^ in7; - out3 = tmp0 ^ in2; - out5 = tmp1 ^ in4; - out6 = tmp1 ^ out3 ^ in6; - out2 = tmp2 ^ in6; - tmp3 = tmp2 ^ in4; - tmp4 = out3 ^ in3; - out4 = tmp3 ^ in0; - out1 = tmp4 ^ in5; - out0 = tmp3 ^ tmp4; - out7 = tmp0 ^ out2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_DC(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - 
uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in2; - tmp1 = in0 ^ in3; - out6 = tmp0 ^ in4; - tmp2 = tmp0 ^ in7; - out3 = tmp1 ^ in6; - tmp3 = tmp1 ^ in1; - out1 = tmp1 ^ tmp2 ^ in5; - out4 = tmp2 ^ in6; - out2 = tmp3 ^ in2; - out7 = tmp3 ^ in5; - out5 = tmp2 ^ out2; - out0 = out2 ^ out3 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_DD(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in0 ^ in6; - out2 = in0 ^ in1 ^ in3; - out6 = out3 ^ in2 ^ in4; - out7 = out2 ^ in5 ^ in7; - out0 = out6 ^ in1; - out4 = out6 ^ in7; - out5 = out7 ^ in0; - out1 = out5 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width 
* 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_DE(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in2 ^ in3 ^ in6; - tmp1 = in3 ^ in4 ^ in7; - out4 = tmp0 ^ in0; - out5 = tmp1 ^ in1; - out3 = out4 ^ in7; - out2 = out3 ^ in6; - out1 = out2 ^ in5; - out6 = tmp1 ^ out1; - out0 = tmp0 ^ out5; - out7 = out0 ^ out1 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_DF(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in0 ^ in3 ^ in7; - tmp0 = out2 ^ in1 ^ in5; - 
out1 = tmp0 ^ in2; - out7 = tmp0 ^ in6; - out5 = tmp0 ^ in0 ^ in4; - tmp1 = out1 ^ out5 ^ in6; - out4 = tmp1 ^ in3; - out6 = tmp1 ^ in5; - tmp2 = tmp1 ^ in7; - out0 = tmp2 ^ in1; - out3 = tmp2 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_E0(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in1 ^ in7; - tmp0 = in2 ^ in4; - out4 = out3 ^ in3 ^ in5; - out2 = tmp0 ^ in1; - tmp1 = tmp0 ^ in6; - out0 = out4 ^ in2; - out6 = out4 ^ in0; - out1 = tmp1 ^ in3; - out5 = tmp1 ^ in0; - out7 = out5 ^ in1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_E1(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i 
< width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in1 ^ in4; - tmp0 = in1 ^ in7; - out3 = tmp0 ^ in3; - tmp1 = out3 ^ in5; - out4 = tmp1 ^ in4; - tmp2 = tmp1 ^ in0; - out0 = tmp2 ^ in2; - out6 = tmp2 ^ in6; - tmp3 = out0 ^ out4 ^ in6; - out5 = tmp3 ^ in5; - out7 = tmp0 ^ tmp3; - out1 = tmp2 ^ out5 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_E2(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in1 ^ in2; - out4 = in1 ^ in5; - out2 = in2 ^ in4 ^ in7; - out5 = in0 ^ in2 ^ in6; - out0 = out3 ^ in3 ^ in5; - out7 = out3 ^ in0 ^ in4; - out6 = out2 ^ out7 ^ in3; - out1 = out5 ^ in3 ^ in4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 
3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_E3(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in4 ^ in7; - tmp0 = in1 ^ in3; - out3 = tmp0 ^ in2; - tmp1 = out3 ^ in0; - out0 = tmp1 ^ in5; - tmp2 = tmp1 ^ in4; - out1 = tmp2 ^ in6; - tmp3 = tmp2 ^ in3; - out7 = tmp3 ^ in7; - out6 = out1 ^ out2 ^ in2; - tmp4 = tmp0 ^ out0; - out5 = tmp4 ^ in6; - out4 = tmp3 ^ tmp4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_E4(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - 
uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in6; - tmp0 = in0 ^ in4; - tmp1 = tmp0 ^ in2 ^ in6; - out2 = tmp1 ^ in1; - out7 = out2 ^ in5; - tmp2 = tmp0 ^ out7; - out4 = tmp2 ^ in3; - out0 = out4 ^ in7; - out6 = tmp1 ^ out0; - out5 = tmp2 ^ out6; - out1 = out5 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_E5(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out3 = in3 ^ in6; - tmp0 = in0 ^ in1; - tmp1 = in5 ^ in7; - out2 = tmp0 ^ in4 ^ in6; - tmp2 = tmp1 ^ out2; - out6 = tmp2 ^ in3; - out7 = tmp2 ^ in2; - out0 = out6 ^ in2 ^ in4; - out5 = out6 ^ in1 ^ in2; - out1 = tmp0 ^ out5 ^ in5; - out4 = tmp1 ^ out1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - 
/*
 * Bit-sliced multiply-and-add kernels, constants 0xE6 .. 0xF2.
 *
 * Every gf8_muladd_XX(out, in, width) below has the same shape:
 *
 *   - 'out' and 'in' are reinterpreted as arrays of uint64_t organised as
 *     8 rows ("planes") of 'width' words each; plane k of column c is the
 *     word at index (width * k + c).
 *   - For each column, the eight plane words of 'out' are read, combined
 *     through a fixed XOR network, XORed with the matching plane words of
 *     'in', and written back to 'out'.
 *
 * The XOR networks are consistent with multiplying each 8-bit element
 * (bit k taken from plane k) by the constant in the function name in
 * GF(2^8) with reduction polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11D),
 * i.e. out = out * 0xXX + in.
 *
 * NOTE(review): the buffers are accessed through uint64_t pointers, so
 * callers presumably pass 8-byte aligned memory holding at least
 * 8 * width words each -- confirm against the call sites.
 */

/*
 * Reads the eight plane words of one column:
 * bit[k] = col[width * k] for k = 0..7.
 */
static void gf8_planes_load(uint64_t bit[8], const uint64_t *col,
                            unsigned int width)
{
    bit[0] = col[0];
    bit[1] = col[width];
    bit[2] = col[width * 2];
    bit[3] = col[width * 3];
    bit[4] = col[width * 4];
    bit[5] = col[width * 5];
    bit[6] = col[width * 6];
    bit[7] = col[width * 7];
}

/*
 * Writes one column back: col[width * k] = res[k] ^ add[width * k].
 * Planes are processed in ascending order, each 'add' word being read
 * immediately before the corresponding 'col' word is written.
 */
static void gf8_planes_store(uint64_t *col, const uint64_t *add,
                             unsigned int width, const uint64_t res[8])
{
    col[0] = res[0] ^ add[0];
    col[width] = res[1] ^ add[width];
    col[width * 2] = res[2] ^ add[width * 2];
    col[width * 3] = res[3] ^ add[width * 3];
    col[width * 4] = res[4] ^ add[width * 4];
    col[width * 5] = res[5] ^ add[width * 5];
    col[width * 6] = res[6] ^ add[width * 6];
    col[width * 7] = res[7] ^ add[width * 7];
}

/* out = out * 0xE6 + in (see the layout description at the top). */
static void gf8_muladd_E6(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];

        gf8_planes_load(a, dst, width);

        r[3] = a[2] ^ a[6] ^ a[7];
        r[2] = r[3] ^ a[0] ^ a[4];
        r[4] = r[3] ^ a[1] ^ a[5];
        r[1] = r[2] ^ a[3];
        r[7] = r[2] ^ r[4] ^ a[2];
        r[0] = r[4] ^ a[3] ^ a[7];
        r[5] = r[1] ^ a[4];
        r[6] = r[0] ^ r[2] ^ a[5];

        gf8_planes_store(dst, src, width, r);
    }
}

/* out = out * 0xE7 + in. */
static void gf8_muladd_E7(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];
        uint64_t t0, t1, t2, t3, t4;

        gf8_planes_load(a, dst, width);

        t0 = a[2] ^ a[3];
        r[3] = t0 ^ a[6] ^ a[7];
        t1 = r[3] ^ a[0];
        r[5] = t1 ^ a[5];
        t2 = t1 ^ a[4];
        t3 = r[5] ^ a[7];
        r[1] = t2 ^ a[1];
        r[0] = t3 ^ a[1];
        r[6] = r[1] ^ a[2];
        r[2] = t0 ^ t2;
        t4 = t3 ^ r[6];
        r[4] = t4 ^ a[6];
        r[7] = t4 ^ a[0];

        gf8_planes_store(dst, src, width, r);
    }
}

/* out = out * 0xE8 + in. */
static void gf8_muladd_E8(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];
        uint64_t t0, t1, t2, t3;

        gf8_planes_load(a, dst, width);

        r[4] = a[3] ^ a[6];
        t0 = a[4] ^ a[7];
        r[1] = a[2] ^ a[3] ^ a[4];
        r[5] = t0 ^ a[0];
        t1 = t0 ^ a[1];
        t2 = t1 ^ a[5];
        r[0] = t1 ^ r[1];
        r[2] = t2 ^ a[2];
        r[6] = t2 ^ r[5];
        t3 = r[6] ^ a[6];
        r[3] = t3 ^ a[7];
        r[7] = t3 ^ a[2] ^ a[5];

        gf8_planes_store(dst, src, width, r);
    }
}

/* out = out * 0xE9 + in. */
static void gf8_muladd_E9(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];
        uint64_t t0, t1, t2;

        gf8_planes_load(a, dst, width);

        t0 = a[0] ^ a[1];
        t1 = a[3] ^ a[6];
        t2 = t0 ^ a[6];
        r[4] = t1 ^ a[4];
        r[6] = t2 ^ a[5];
        r[7] = t2 ^ a[2] ^ a[7];
        r[3] = r[6] ^ a[3] ^ a[7];
        r[0] = t1 ^ r[7];
        r[2] = r[3] ^ r[4] ^ a[0];
        r[5] = t0 ^ r[2];
        r[1] = r[0] ^ r[5] ^ a[5];

        gf8_planes_store(dst, src, width, r);
    }
}

/* out = out * 0xEA + in. */
static void gf8_muladd_EA(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];

        gf8_planes_load(a, dst, width);

        r[4] = a[6] ^ a[7];
        r[5] = a[0] ^ a[7];
        r[6] = a[0] ^ a[1];
        r[0] = a[1] ^ a[2] ^ a[3];
        r[2] = a[2] ^ a[4] ^ a[5];
        r[7] = r[6] ^ a[2];
        r[1] = r[0] ^ r[6] ^ a[4];
        r[3] = r[7] ^ a[5] ^ a[6];

        gf8_planes_store(dst, src, width, r);
    }
}

/* out = out * 0xEB + in. */
static void gf8_muladd_EB(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];
        uint64_t t0, t1;

        gf8_planes_load(a, dst, width);

        r[2] = a[4] ^ a[5];
        t0 = a[0] ^ a[1];
        r[4] = a[4] ^ a[6] ^ a[7];
        r[5] = a[0] ^ a[5] ^ a[7];
        r[6] = t0 ^ a[6];
        t1 = t0 ^ a[2];
        r[0] = t1 ^ a[3];
        r[7] = t1 ^ a[7];
        r[1] = r[0] ^ a[4];
        r[3] = r[0] ^ a[5] ^ a[6];

        gf8_planes_store(dst, src, width, r);
    }
}

/* out = out * 0xEC + in. */
static void gf8_muladd_EC(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];

        gf8_planes_load(a, dst, width);

        r[3] = a[0] ^ a[5];
        r[4] = a[2] ^ a[3] ^ a[7];
        r[5] = a[0] ^ a[3] ^ a[4];
        r[6] = r[3] ^ a[1] ^ a[4];
        r[1] = r[4] ^ a[4];
        r[0] = r[4] ^ a[1] ^ a[6];
        r[2] = r[0] ^ r[5] ^ a[5];
        r[7] = r[2] ^ a[4] ^ a[7];

        gf8_planes_store(dst, src, width, r);
    }
}

/* out = out * 0xED + in. */
static void gf8_muladd_ED(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];
        uint64_t t0, t1;

        gf8_planes_load(a, dst, width);

        t0 = a[2] ^ a[4];
        t1 = a[3] ^ a[5];
        r[4] = t0 ^ a[3] ^ a[7];
        r[3] = t1 ^ a[0];
        r[1] = r[4] ^ a[1];
        r[5] = r[3] ^ a[4];
        r[7] = r[1] ^ r[5] ^ a[6];
        r[2] = t0 ^ r[7];
        r[0] = t1 ^ r[7];
        r[6] = r[2] ^ a[7];

        gf8_planes_store(dst, src, width, r);
    }
}

/* out = out * 0xEE + in. */
static void gf8_muladd_EE(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];
        uint64_t t0, t1, t2, t3;

        gf8_planes_load(a, dst, width);

        r[4] = a[2];
        t0 = a[0] ^ a[1];
        r[5] = a[0] ^ a[3];
        t1 = t0 ^ a[2];
        r[6] = t0 ^ a[4];
        t2 = t1 ^ r[5];
        r[7] = t1 ^ a[5];
        r[1] = t2 ^ r[6] ^ a[7];
        r[0] = t2 ^ a[6];
        t3 = r[7] ^ a[1];
        r[3] = t3 ^ a[7];
        r[2] = t3 ^ a[4] ^ a[6];

        gf8_planes_store(dst, src, width, r);
    }
}

/* out = out * 0xEF + in. */
static void gf8_muladd_EF(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];
        uint64_t t0, t1;

        gf8_planes_load(a, dst, width);

        r[4] = a[2] ^ a[4];
        t0 = a[0] ^ a[5];
        t1 = a[4] ^ a[6];
        r[5] = t0 ^ a[3];
        r[2] = t0 ^ t1;
        r[6] = t1 ^ a[0] ^ a[1];
        r[3] = r[5] ^ a[2] ^ a[7];
        r[7] = r[3] ^ a[1] ^ a[3];
        r[0] = r[4] ^ r[6] ^ a[3];
        r[1] = t1 ^ r[0] ^ a[7];

        gf8_planes_store(dst, src, width, r);
    }
}

/* out = out * 0xF0 + in. */
static void gf8_muladd_F0(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];
        uint64_t t0, t1, t2, t3;

        gf8_planes_load(a, dst, width);

        t0 = a[1] ^ a[2];
        t1 = a[4] ^ a[5];
        r[2] = t0 ^ a[6];
        r[3] = t1 ^ a[1];
        t2 = t1 ^ a[7];
        r[1] = r[2] ^ r[3] ^ a[3];
        t3 = t0 ^ t2;
        r[0] = t3 ^ a[3];
        r[5] = t3 ^ a[0];
        r[4] = r[1] ^ r[5] ^ a[4];
        r[7] = r[4] ^ a[2];
        r[6] = t2 ^ r[7];

        gf8_planes_store(dst, src, width, r);
    }
}

/* out = out * 0xF1 + in. */
static void gf8_muladd_F1(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];
        uint64_t t0, t1, t2, t3;

        gf8_planes_load(a, dst, width);

        r[2] = a[1] ^ a[6];
        t0 = a[3] ^ a[5];
        r[3] = t0 ^ a[1] ^ a[4];
        t1 = r[3] ^ a[2];
        r[1] = t1 ^ a[6];
        t2 = t1 ^ a[0];
        t3 = r[1] ^ a[5];
        r[0] = t2 ^ a[7];
        r[6] = t2 ^ a[4];
        r[7] = t3 ^ a[0];
        r[5] = t0 ^ r[0];
        r[4] = t3 ^ r[5] ^ a[1];

        gf8_planes_store(dst, src, width, r);
    }
}

/* out = out * 0xF2 + in. */
static void gf8_muladd_F2(uint8_t *out, uint8_t *in, unsigned int width)
{
    uint64_t *src = (uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < width; col++, src++, dst++)
    {
        uint64_t a[8], r[8];
        uint64_t t0, t1, t2, t3;

        gf8_planes_load(a, dst, width);

        t0 = a[4] ^ a[5];
        r[2] = a[2] ^ a[6] ^ a[7];
        t1 = t0 ^ a[1];
        t2 = t1 ^ a[2];
        r[0] = t2 ^ a[3];
        r[3] = t2 ^ a[7];
        r[5] = r[3] ^ a[0] ^ a[4];
        t3 = t0 ^ r[5];
        r[7] = t3 ^ a[3];
        r[4] = t3 ^ r[2];
        r[1] = r[0] ^ r[4] ^ a[4];
        r[6] = t1 ^ r[1];

        gf8_planes_store(dst, src, width, r);
    }
}
- -static void gf8_muladd_F3(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in6 ^ in7; - tmp0 = in0 ^ in1; - out4 = tmp0 ^ in6; - tmp1 = tmp0 ^ in2; - out5 = tmp1 ^ in7; - out6 = tmp1 ^ in3; - out7 = out6 ^ in4; - out0 = out7 ^ in5; - out1 = out0 ^ in6; - out3 = out0 ^ in0 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_F4(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in0 ^ in1 ^ in2; - tmp0 = out2 ^ in3; - out4 = tmp0 ^ in4; - out5 = out4 ^ in5; - out6 = out5 ^ in6; - out7 = out6 ^ in7; - out0 = out7 ^ in0; - out1 = out0 ^ in1; - out3 = tmp0 ^ out7; - - out_ptr[0] = out0 ^ 
in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_F5(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in0 ^ in1; - tmp0 = out2 ^ in2; - out4 = tmp0 ^ in3; - out5 = out4 ^ in4; - out6 = out5 ^ in5; - out7 = out6 ^ in6; - out0 = out7 ^ in7; - out1 = out0 ^ in0; - out3 = tmp0 ^ out0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_F6(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = 
out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in7; - out2 = tmp0 ^ in2; - out4 = out2 ^ in1 ^ in4; - out7 = out4 ^ in3 ^ in5; - out5 = out7 ^ in4 ^ in7; - out0 = tmp0 ^ out7 ^ in6; - tmp1 = out0 ^ in1; - out6 = out0 ^ in0 ^ in5; - out3 = tmp1 ^ in3; - out1 = tmp0 ^ tmp1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_F7(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in0 ^ in7; - tmp0 = out2 ^ in1; - out4 = tmp0 ^ in2; - out5 = out4 ^ in3 ^ in7; - out6 = out5 ^ in4; - out7 = out6 ^ in5; - out0 = out7 ^ in6; - out1 = out0 ^ in7; - out3 = tmp0 ^ out1; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_F8(uint8_t * out, 
uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in4; - tmp1 = in3 ^ in5; - tmp2 = tmp0 ^ in6; - out4 = tmp0 ^ tmp1; - out1 = tmp1 ^ in2 ^ in4; - out3 = tmp2 ^ in1; - out5 = out3 ^ in5; - out7 = out1 ^ out5 ^ in7; - out6 = tmp1 ^ out7; - out0 = tmp2 ^ out7; - out2 = out6 ^ in0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_F9(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in3 ^ in5; - tmp1 = in0 ^ in6; - out4 = tmp0 ^ in0; - tmp2 = tmp1 ^ in4; - tmp3 = tmp1 ^ in2; - out5 = tmp2 ^ in1; - out3 = out5 ^ in3; - tmp4 = tmp3 ^ out3; - out1 = tmp4 ^ in5; - out0 = tmp4 ^ 
in0 ^ in7; - out6 = tmp0 ^ out0 ^ in4; - out7 = tmp2 ^ tmp4; - out2 = tmp3 ^ out6; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_FA(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in1; - tmp1 = tmp0 ^ in2; - tmp2 = tmp0 ^ in5; - tmp3 = tmp1 ^ in7; - out5 = tmp2 ^ in6; - out6 = tmp3 ^ in6; - out7 = tmp3 ^ in3; - out3 = out6 ^ in4; - out2 = tmp1 ^ out5; - out4 = out2 ^ out3 ^ in1; - out0 = out4 ^ out7 ^ in5; - out1 = tmp2 ^ out0; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_FB(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, 
out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in5 ^ in6; - tmp0 = in0 ^ in1; - out4 = in0 ^ in5 ^ in7; - out5 = tmp0 ^ in6; - tmp1 = tmp0 ^ in2; - out6 = tmp1 ^ in7; - out7 = tmp1 ^ in3; - out0 = out7 ^ in4; - out1 = out0 ^ in5; - out3 = out0 ^ in6 ^ in7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_FC(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in1 ^ in2; - tmp1 = in0 ^ in7; - out2 = tmp0 ^ tmp1 ^ in5; - out3 = tmp1 ^ in4; - tmp2 = out2 ^ in6; - out6 = tmp2 ^ in4; - out7 = tmp2 ^ in3; - out4 = out6 ^ in1 ^ in3; - tmp3 = out4 ^ in0; - out1 = tmp3 ^ in6; - out0 = tmp3 ^ in1 ^ in5; - out5 = tmp0 ^ out4; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 
3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_FD(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in5; - tmp1 = in1 ^ in7; - out2 = tmp0 ^ tmp1; - out6 = out2 ^ in2 ^ in4; - tmp2 = out6 ^ in0; - out1 = tmp2 ^ in3; - out0 = tmp0 ^ out1 ^ in6; - out5 = out0 ^ in2; - tmp3 = out5 ^ in1; - out3 = tmp3 ^ in6; - out7 = tmp2 ^ tmp3; - out4 = tmp1 ^ out7; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_FE(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t 
in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - tmp0 = in0 ^ in2; - out2 = tmp0 ^ in5; - out3 = tmp0 ^ in4; - tmp1 = out3 ^ in6; - out4 = tmp1 ^ in5; - tmp2 = tmp1 ^ in1; - out6 = tmp2 ^ in7; - tmp3 = tmp2 ^ in0; - out0 = tmp3 ^ in3; - tmp4 = out0 ^ out4 ^ in7; - out5 = tmp4 ^ in6; - out7 = tmp4 ^ in2; - out1 = tmp3 ^ out5; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -static void gf8_muladd_FF(uint8_t * out, uint8_t * in, unsigned int width) -{ - unsigned int i; - uint64_t * in_ptr = (uint64_t *)in; - uint64_t * out_ptr = (uint64_t *)out; - - for (i = 0; i < width; i++) - { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - uint64_t tmp0, tmp1, tmp2, tmp3; - - uint64_t in0 = out_ptr[0]; - uint64_t in1 = out_ptr[width]; - uint64_t in2 = out_ptr[width * 2]; - uint64_t in3 = out_ptr[width * 3]; - uint64_t in4 = out_ptr[width * 4]; - uint64_t in5 = out_ptr[width * 5]; - uint64_t in6 = out_ptr[width * 6]; - uint64_t in7 = out_ptr[width * 7]; - - out2 = in0 ^ in5; - tmp0 = in4 ^ in7; - tmp1 = out2 ^ in2; - out4 = tmp1 ^ in6; - out7 = tmp1 ^ in1 ^ in3; - out1 = tmp0 ^ out7; - tmp2 = out1 ^ in5; - out6 = tmp2 ^ in3; - tmp3 = tmp2 ^ in7; - out0 = tmp3 ^ in6; - out3 = tmp3 ^ in1; - out5 = tmp0 ^ out0 ^ in2; - - out_ptr[0] = out0 ^ in_ptr[0]; - out_ptr[width] = out1 ^ in_ptr[width]; - out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; - out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; - out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; - out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; - out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; - out_ptr[width * 7] = out7 ^ 
in_ptr[width * 7]; - - in_ptr++; - out_ptr++; - } -} - -void (* ec_gf_muladd[])(uint8_t * out, uint8_t * in, unsigned int width) = -{ - gf8_muladd_00, gf8_muladd_01, gf8_muladd_02, gf8_muladd_03, - gf8_muladd_04, gf8_muladd_05, gf8_muladd_06, gf8_muladd_07, - gf8_muladd_08, gf8_muladd_09, gf8_muladd_0A, gf8_muladd_0B, - gf8_muladd_0C, gf8_muladd_0D, gf8_muladd_0E, gf8_muladd_0F, - gf8_muladd_10, gf8_muladd_11, gf8_muladd_12, gf8_muladd_13, - gf8_muladd_14, gf8_muladd_15, gf8_muladd_16, gf8_muladd_17, - gf8_muladd_18, gf8_muladd_19, gf8_muladd_1A, gf8_muladd_1B, - gf8_muladd_1C, gf8_muladd_1D, gf8_muladd_1E, gf8_muladd_1F, - gf8_muladd_20, gf8_muladd_21, gf8_muladd_22, gf8_muladd_23, - gf8_muladd_24, gf8_muladd_25, gf8_muladd_26, gf8_muladd_27, - gf8_muladd_28, gf8_muladd_29, gf8_muladd_2A, gf8_muladd_2B, - gf8_muladd_2C, gf8_muladd_2D, gf8_muladd_2E, gf8_muladd_2F, - gf8_muladd_30, gf8_muladd_31, gf8_muladd_32, gf8_muladd_33, - gf8_muladd_34, gf8_muladd_35, gf8_muladd_36, gf8_muladd_37, - gf8_muladd_38, gf8_muladd_39, gf8_muladd_3A, gf8_muladd_3B, - gf8_muladd_3C, gf8_muladd_3D, gf8_muladd_3E, gf8_muladd_3F, - gf8_muladd_40, gf8_muladd_41, gf8_muladd_42, gf8_muladd_43, - gf8_muladd_44, gf8_muladd_45, gf8_muladd_46, gf8_muladd_47, - gf8_muladd_48, gf8_muladd_49, gf8_muladd_4A, gf8_muladd_4B, - gf8_muladd_4C, gf8_muladd_4D, gf8_muladd_4E, gf8_muladd_4F, - gf8_muladd_50, gf8_muladd_51, gf8_muladd_52, gf8_muladd_53, - gf8_muladd_54, gf8_muladd_55, gf8_muladd_56, gf8_muladd_57, - gf8_muladd_58, gf8_muladd_59, gf8_muladd_5A, gf8_muladd_5B, - gf8_muladd_5C, gf8_muladd_5D, gf8_muladd_5E, gf8_muladd_5F, - gf8_muladd_60, gf8_muladd_61, gf8_muladd_62, gf8_muladd_63, - gf8_muladd_64, gf8_muladd_65, gf8_muladd_66, gf8_muladd_67, - gf8_muladd_68, gf8_muladd_69, gf8_muladd_6A, gf8_muladd_6B, - gf8_muladd_6C, gf8_muladd_6D, gf8_muladd_6E, gf8_muladd_6F, - gf8_muladd_70, gf8_muladd_71, gf8_muladd_72, gf8_muladd_73, - gf8_muladd_74, gf8_muladd_75, gf8_muladd_76, gf8_muladd_77, - 
gf8_muladd_78, gf8_muladd_79, gf8_muladd_7A, gf8_muladd_7B, - gf8_muladd_7C, gf8_muladd_7D, gf8_muladd_7E, gf8_muladd_7F, - gf8_muladd_80, gf8_muladd_81, gf8_muladd_82, gf8_muladd_83, - gf8_muladd_84, gf8_muladd_85, gf8_muladd_86, gf8_muladd_87, - gf8_muladd_88, gf8_muladd_89, gf8_muladd_8A, gf8_muladd_8B, - gf8_muladd_8C, gf8_muladd_8D, gf8_muladd_8E, gf8_muladd_8F, - gf8_muladd_90, gf8_muladd_91, gf8_muladd_92, gf8_muladd_93, - gf8_muladd_94, gf8_muladd_95, gf8_muladd_96, gf8_muladd_97, - gf8_muladd_98, gf8_muladd_99, gf8_muladd_9A, gf8_muladd_9B, - gf8_muladd_9C, gf8_muladd_9D, gf8_muladd_9E, gf8_muladd_9F, - gf8_muladd_A0, gf8_muladd_A1, gf8_muladd_A2, gf8_muladd_A3, - gf8_muladd_A4, gf8_muladd_A5, gf8_muladd_A6, gf8_muladd_A7, - gf8_muladd_A8, gf8_muladd_A9, gf8_muladd_AA, gf8_muladd_AB, - gf8_muladd_AC, gf8_muladd_AD, gf8_muladd_AE, gf8_muladd_AF, - gf8_muladd_B0, gf8_muladd_B1, gf8_muladd_B2, gf8_muladd_B3, - gf8_muladd_B4, gf8_muladd_B5, gf8_muladd_B6, gf8_muladd_B7, - gf8_muladd_B8, gf8_muladd_B9, gf8_muladd_BA, gf8_muladd_BB, - gf8_muladd_BC, gf8_muladd_BD, gf8_muladd_BE, gf8_muladd_BF, - gf8_muladd_C0, gf8_muladd_C1, gf8_muladd_C2, gf8_muladd_C3, - gf8_muladd_C4, gf8_muladd_C5, gf8_muladd_C6, gf8_muladd_C7, - gf8_muladd_C8, gf8_muladd_C9, gf8_muladd_CA, gf8_muladd_CB, - gf8_muladd_CC, gf8_muladd_CD, gf8_muladd_CE, gf8_muladd_CF, - gf8_muladd_D0, gf8_muladd_D1, gf8_muladd_D2, gf8_muladd_D3, - gf8_muladd_D4, gf8_muladd_D5, gf8_muladd_D6, gf8_muladd_D7, - gf8_muladd_D8, gf8_muladd_D9, gf8_muladd_DA, gf8_muladd_DB, - gf8_muladd_DC, gf8_muladd_DD, gf8_muladd_DE, gf8_muladd_DF, - gf8_muladd_E0, gf8_muladd_E1, gf8_muladd_E2, gf8_muladd_E3, - gf8_muladd_E4, gf8_muladd_E5, gf8_muladd_E6, gf8_muladd_E7, - gf8_muladd_E8, gf8_muladd_E9, gf8_muladd_EA, gf8_muladd_EB, - gf8_muladd_EC, gf8_muladd_ED, gf8_muladd_EE, gf8_muladd_EF, - gf8_muladd_F0, gf8_muladd_F1, gf8_muladd_F2, gf8_muladd_F3, - gf8_muladd_F4, gf8_muladd_F5, gf8_muladd_F6, gf8_muladd_F7, - gf8_muladd_F8, 
gf8_muladd_F9, gf8_muladd_FA, gf8_muladd_FB, - gf8_muladd_FC, gf8_muladd_FD, gf8_muladd_FE, gf8_muladd_FF -}; diff --git a/xlators/cluster/ec/src/ec-gf8.c b/xlators/cluster/ec/src/ec-gf8.c new file mode 100644 index 00000000000..2665632706b --- /dev/null +++ b/xlators/cluster/ec/src/ec-gf8.c @@ -0,0 +1,5959 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "ec-gf8.h" + +static ec_gf_op_t ec_gf8_mul_00_ops[] = { + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_00 = { + 0, + { 0, }, + ec_gf8_mul_00_ops +}; + +static ec_gf_op_t ec_gf8_mul_01_ops[] = { + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_01 = { + 8, + { 0, 1, 2, 3, 4, 5, 6, 7, }, + ec_gf8_mul_01_ops +}; + +static ec_gf_op_t ec_gf8_mul_02_ops[] = { + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_02 = { + 8, + { 7, 0, 1, 2, 3, 4, 5, 6, }, + ec_gf8_mul_02_ops +}; + +static ec_gf_op_t ec_gf8_mul_03_ops[] = { + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_COPY, 8, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_03 = { + 9, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, }, + ec_gf8_mul_03_ops +}; + +static ec_gf_op_t ec_gf8_mul_04_ops[] = { + { EC_GF_OP_XOR3, 8, 6, 7 }, + { EC_GF_OP_XOR2, 2, 8, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 
}, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_04 = { + 9, + { 6, 7, 0, 1, 2, 3, 4, 5, 8, }, + ec_gf8_mul_04_ops +}; + +static ec_gf_op_t ec_gf8_mul_05_ops[] = { + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_05 = { + 8, + { 0, 1, 2, 6, 7, 3, 4, 5, }, + ec_gf8_mul_05_ops +}; + +static ec_gf_op_t ec_gf8_mul_06_ops[] = { + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_COPY, 8, 2, 0 }, + { EC_GF_OP_XOR2, 8, 3, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_06 = { + 9, + { 7, 0, 1, 2, 8, 3, 4, 5, 6, }, + ec_gf8_mul_06_ops +}; + +static ec_gf_op_t ec_gf8_mul_07_ops[] = { + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_07 = { + 8, + { 6, 0, 1, 3, 2, 4, 5, 7, }, + ec_gf8_mul_07_ops +}; + +static ec_gf_op_t ec_gf8_mul_08_ops[] = { + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR3, 8, 6, 7 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 2, 
8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_08 = { + 9, + { 5, 6, 7, 0, 1, 2, 3, 4, 8, }, + ec_gf8_mul_08_ops +}; + +static ec_gf_op_t ec_gf8_mul_09_ops[] = { + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_09 = { + 8, + { 0, 1, 2, 3, 5, 6, 7, 4, }, + ec_gf8_mul_09_ops +}; + +static ec_gf_op_t ec_gf8_mul_0A_ops[] = { + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_0A = { + 8, + { 5, 0, 1, 2, 6, 7, 3, 4, }, + ec_gf8_mul_0A_ops +}; + +static ec_gf_op_t ec_gf8_mul_0B_ops[] = { + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_COPY, 9, 3, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_COPY, 8, 5, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR3, 3, 8, 6 }, + { EC_GF_OP_XOR2, 1, 9, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_0B = { + 10, + { 7, 1, 5, 2, 4, 3, 0, 6, 8, 9, }, + ec_gf8_mul_0B_ops +}; + +static ec_gf_op_t ec_gf8_mul_0C_ops[] = { + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_COPY, 8, 1, 0 }, + { EC_GF_OP_XOR2, 8, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { 
EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_0C = { + 9, + { 5, 7, 0, 1, 8, 2, 3, 4, 6, }, + ec_gf8_mul_0C_ops +}; + +static ec_gf_op_t ec_gf8_mul_0D_ops[] = { + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR3, 8, 2, 4 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR3, 2, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_0D = { + 9, + { 5, 6, 7, 3, 1, 0, 2, 4, 8, }, + ec_gf8_mul_0D_ops +}; + +static ec_gf_op_t ec_gf8_mul_0E_ops[] = { + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_0E = { + 8, + { 7, 0, 6, 1, 3, 2, 4, 5, }, + ec_gf8_mul_0E_ops +}; + +static ec_gf_op_t ec_gf8_mul_0F_ops[] = { + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 
}, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_0F = { + 8, + { 1, 0, 5, 6, 7, 2, 3, 4, }, + ec_gf8_mul_0F_ops +}; + +static ec_gf_op_t ec_gf8_mul_10_ops[] = { + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_10 = { + 8, + { 4, 5, 6, 7, 0, 1, 2, 3, }, + ec_gf8_mul_10_ops +}; + +static ec_gf_op_t ec_gf8_mul_11_ops[] = { + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_11 = { + 8, + { 4, 1, 2, 6, 0, 5, 7, 3, }, + ec_gf8_mul_11_ops +}; + +static ec_gf_op_t ec_gf8_mul_12_ops[] = { + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_12 = { + 8, + { 7, 0, 1, 2, 3, 5, 6, 4, }, + ec_gf8_mul_12_ops +}; + +static ec_gf_op_t ec_gf8_mul_13_ops[] = { + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 
}, + { EC_GF_OP_XOR3, 8, 3, 7 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 6, 8, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 0, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_13 = { + 9, + { 4, 5, 2, 6, 0, 1, 7, 3, 8, }, + ec_gf8_mul_13_ops +}; + +static ec_gf_op_t ec_gf8_mul_14_ops[] = { + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_14 = { + 8, + { 6, 7, 0, 1, 2, 4, 5, 3, }, + ec_gf8_mul_14_ops +}; + +static ec_gf_op_t ec_gf8_mul_15_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR3, 5, 8, 7 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_15 = { + 9, + { 0, 1, 2, 4, 7, 6, 5, 3, 8, }, + ec_gf8_mul_15_ops +}; + +static ec_gf_op_t ec_gf8_mul_16_ops[] = { + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_16 = { + 8, + { 6, 7, 4, 1, 2, 3, 5, 0, }, + ec_gf8_mul_16_ops +}; + 
+static ec_gf_op_t ec_gf8_mul_17_ops[] = { + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_17 = { + 8, + { 5, 7, 0, 1, 3, 2, 4, 6, }, + ec_gf8_mul_17_ops +}; + +static ec_gf_op_t ec_gf8_mul_18_ops[] = { + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 6, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_18 = { + 9, + { 4, 5, 7, 6, 0, 1, 2, 3, 8, }, + ec_gf8_mul_18_ops +}; + +static ec_gf_op_t ec_gf8_mul_19_ops[] = { + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_19 = { + 8, + { 0, 5, 2, 6, 7, 1, 3, 4, }, + ec_gf8_mul_19_ops +}; + +static ec_gf_op_t ec_gf8_mul_1A_ops[] = { + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { 
EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_1A = { + 8, + { 7, 0, 4, 5, 3, 1, 2, 6, }, + ec_gf8_mul_1A_ops +}; + +static ec_gf_op_t ec_gf8_mul_1B_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_1B = { + 8, + { 7, 4, 5, 6, 3, 1, 2, 0, }, + ec_gf8_mul_1B_ops +}; + +static ec_gf_op_t ec_gf8_mul_1C_ops[] = { + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_1C = { + 8, + { 5, 4, 3, 0, 1, 7, 2, 6, }, + ec_gf8_mul_1C_ops +}; + +static ec_gf_op_t ec_gf8_mul_1D_ops[] = { + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR3, 8, 4, 2 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + 
{ EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_1D = { + 9, + { 0, 7, 5, 8, 2, 3, 4, 1, 6, }, + ec_gf8_mul_1D_ops +}; + +static ec_gf_op_t ec_gf8_mul_1E_ops[] = { + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_1E = { + 8, + { 4, 7, 5, 1, 6, 0, 2, 3, }, + ec_gf8_mul_1E_ops +}; + +static ec_gf_op_t ec_gf8_mul_1F_ops[] = { + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR3, 8, 3, 7 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_1F = { + 9, + { 1, 4, 5, 6, 7, 0, 3, 2, 8, }, + ec_gf8_mul_1F_ops +}; + +static ec_gf_op_t ec_gf8_mul_20_ops[] = { + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_20 = { + 8, + { 7, 4, 5, 6, 3, 0, 1, 2, }, + ec_gf8_mul_20_ops +}; + +static ec_gf_op_t ec_gf8_mul_21_ops[] = { + { EC_GF_OP_COPY, 9, 0, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 
}, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR3, 8, 7, 5 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR2, 2, 8, 0 }, + { EC_GF_OP_XOR2, 4, 9, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_21 = { + 10, + { 0, 1, 2, 7, 5, 4, 3, 6, 8, 9, }, + ec_gf8_mul_21_ops +}; + +static ec_gf_op_t ec_gf8_mul_22_ops[] = { + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_22 = { + 8, + { 3, 0, 5, 2, 6, 4, 1, 7, }, + ec_gf8_mul_22_ops +}; + +static ec_gf_op_t ec_gf8_mul_23_ops[] = { + { EC_GF_OP_COPY, 8, 2, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_23 = { + 9, + { 0, 4, 3, 2, 5, 6, 1, 8, 7, }, + ec_gf8_mul_23_ops +}; + +static ec_gf_op_t ec_gf8_mul_24_ops[] = { + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static 
ec_gf_mul_t ec_gf8_mul_24 = { + 8, + { 6, 7, 0, 1, 2, 4, 5, 3, }, + ec_gf8_mul_24_ops +}; + +static ec_gf_op_t ec_gf8_mul_25_ops[] = { + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_25 = { + 8, + { 2, 7, 0, 1, 3, 4, 5, 6, }, + ec_gf8_mul_25_ops +}; + +static ec_gf_op_t ec_gf8_mul_26_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_26 = { + 8, + { 3, 4, 1, 2, 0, 5, 6, 7, }, + ec_gf8_mul_26_ops +}; + +static ec_gf_op_t ec_gf8_mul_27_ops[] = { + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_27 = { + 8, + { 3, 0, 1, 2, 6, 7, 4, 5, }, + ec_gf8_mul_27_ops +}; + +static ec_gf_op_t ec_gf8_mul_28_ops[] = { + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { 
EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_28 = { + 8, + { 5, 6, 3, 0, 1, 2, 4, 7, }, + ec_gf8_mul_28_ops +}; + +static ec_gf_op_t ec_gf8_mul_29_ops[] = { + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_29 = { + 8, + { 4, 6, 3, 5, 7, 0, 1, 2, }, + ec_gf8_mul_29_ops +}; + +static ec_gf_op_t ec_gf8_mul_2A_ops[] = { + { EC_GF_OP_COPY, 8, 1, 0 }, + { EC_GF_OP_XOR2, 8, 0, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR3, 6, 8, 4 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_2A = { + 9, + { 3, 4, 7, 2, 6, 5, 1, 0, 8, }, + ec_gf8_mul_2A_ops +}; + +static ec_gf_op_t ec_gf8_mul_2B_ops[] = { + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_2B = { + 8, + { 3, 4, 7, 5, 6, 0, 1, 2, }, + ec_gf8_mul_2B_ops +}; + +static ec_gf_op_t ec_gf8_mul_2C_ops[] = { + { EC_GF_OP_XOR2, 3, 4, 0 }, + 
{ EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_2C = { + 8, + { 5, 6, 7, 0, 2, 3, 4, 1, }, + ec_gf8_mul_2C_ops +}; + +static ec_gf_op_t ec_gf8_mul_2D_ops[] = { + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR3, 8, 4, 6 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_2D = { + 9, + { 7, 0, 3, 5, 1, 4, 2, 6, 8, }, + ec_gf8_mul_2D_ops +}; + +static ec_gf_op_t ec_gf8_mul_2E_ops[] = { + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_COPY, 8, 4, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 8, 7, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 2, 8, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 6, 8, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_2E = { + 9, + { 5, 0, 7, 3, 2, 6, 4, 1, 8, }, + ec_gf8_mul_2E_ops +}; + +static ec_gf_op_t ec_gf8_mul_2F_ops[] = { + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 3, 4, 
0 }, + { EC_GF_OP_XOR3, 8, 7, 6 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR2, 2, 8, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_2F = { + 9, + { 6, 3, 2, 5, 7, 0, 1, 4, 8, }, + ec_gf8_mul_2F_ops +}; + +static ec_gf_op_t ec_gf8_mul_30_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 8, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR3, 6, 8, 7 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_30 = { + 9, + { 3, 4, 7, 5, 0, 6, 1, 2, 8, }, + ec_gf8_mul_30_ops +}; + +static ec_gf_op_t ec_gf8_mul_31_ops[] = { + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_31 = { + 8, + { 7, 1, 4, 5, 6, 0, 2, 3, }, + ec_gf8_mul_31_ops +}; + +static ec_gf_op_t ec_gf8_mul_32_ops[] = { + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_END, 
0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_32 = { + 8, + { 3, 4, 6, 7, 5, 0, 1, 2, }, + ec_gf8_mul_32_ops +}; + +static ec_gf_op_t ec_gf8_mul_33_ops[] = { + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_33 = { + 8, + { 5, 4, 3, 0, 2, 1, 6, 7, }, + ec_gf8_mul_33_ops +}; + +static ec_gf_op_t ec_gf8_mul_34_ops[] = { + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_34 = { + 8, + { 7, 5, 3, 0, 2, 4, 1, 6, }, + ec_gf8_mul_34_ops +}; + +static ec_gf_op_t ec_gf8_mul_35_ops[] = { + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_35 = { + 8, + { 6, 7, 5, 4, 2, 0, 1, 3, }, + ec_gf8_mul_35_ops +}; + +static ec_gf_op_t ec_gf8_mul_36_ops[] = { + { EC_GF_OP_XOR2, 
5, 2, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_36 = { + 8, + { 6, 7, 4, 1, 2, 3, 0, 5, }, + ec_gf8_mul_36_ops +}; + +static ec_gf_op_t ec_gf8_mul_37_ops[] = { + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR3, 8, 0, 1 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_37 = { + 9, + { 6, 7, 2, 1, 0, 3, 4, 5, 8, }, + ec_gf8_mul_37_ops +}; + +static ec_gf_op_t ec_gf8_mul_38_ops[] = { + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR3, 8, 6, 7 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 0, 8, 0 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_38 = { + 9, + { 4, 5, 6, 3, 0, 1, 7, 2, 8, }, + ec_gf8_mul_38_ops +}; + +static ec_gf_op_t ec_gf8_mul_39_ops[] = { + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { 
EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_39 = { + 8, + { 1, 6, 3, 0, 5, 2, 4, 7, }, + ec_gf8_mul_39_ops +}; + +static ec_gf_op_t ec_gf8_mul_3A_ops[] = { + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_3A = { + 8, + { 3, 4, 7, 0, 5, 6, 1, 2, }, + ec_gf8_mul_3A_ops +}; + +static ec_gf_op_t ec_gf8_mul_3B_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR3, 8, 7, 3 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_3B = { + 9, + { 3, 0, 1, 7, 6, 2, 4, 8, 5, }, + ec_gf8_mul_3B_ops +}; + +static ec_gf_op_t ec_gf8_mul_3C_ops[] = { + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_3C = { + 8, + { 3, 6, 4, 1, 7, 2, 0, 5, }, + 
ec_gf8_mul_3C_ops +}; + +static ec_gf_op_t ec_gf8_mul_3D_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_3D = { + 8, + { 2, 3, 4, 5, 6, 7, 0, 1, }, + ec_gf8_mul_3D_ops +}; + +static ec_gf_op_t ec_gf8_mul_3E_ops[] = { + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_3E = { + 8, + { 6, 1, 2, 7, 0, 3, 5, 4, }, + ec_gf8_mul_3E_ops +}; + +static ec_gf_op_t ec_gf8_mul_3F_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_COPY, 10, 4, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_COPY, 9, 2, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR3, 4, 9, 7 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 3, 10, 0 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_3F = { + 11, + { 1, 7, 6, 2, 4, 3, 5, 0, 8, 9, 10, }, + ec_gf8_mul_3F_ops +}; + +static ec_gf_op_t ec_gf8_mul_40_ops[] = { + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { 
EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR3, 8, 7, 6 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_40 = { + 9, + { 5, 7, 4, 6, 2, 3, 0, 1, 8, }, + ec_gf8_mul_40_ops +}; + +static ec_gf_op_t ec_gf8_mul_41_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 8, 4, 0 }, + { EC_GF_OP_XOR2, 8, 5, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_41 = { + 9, + { 0, 7, 6, 5, 3, 4, 8, 1, 2, }, + ec_gf8_mul_41_ops +}; + +static ec_gf_op_t ec_gf8_mul_42_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 8, 3, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_42 = { + 9, + { 2, 7, 1, 6, 4, 3, 0, 5, 8, }, + ec_gf8_mul_42_ops +}; + +static ec_gf_op_t ec_gf8_mul_43_ops[] = { + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 
5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_43 = { + 8, + { 2, 6, 4, 1, 7, 3, 0, 5, }, + ec_gf8_mul_43_ops +}; + +static ec_gf_op_t ec_gf8_mul_44_ops[] = { + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_44 = { + 8, + { 2, 3, 4, 1, 6, 5, 0, 7, }, + ec_gf8_mul_44_ops +}; + +static ec_gf_op_t ec_gf8_mul_45_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_45 = { + 8, + { 2, 3, 0, 1, 7, 4, 5, 6, }, + ec_gf8_mul_45_ops +}; + +static ec_gf_op_t ec_gf8_mul_46_ops[] = { + { EC_GF_OP_XOR3, 8, 2, 4 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 8, 0, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_46 = { + 9, + { 2, 0, 1, 3, 4, 5, 6, 7, 8, }, + ec_gf8_mul_46_ops +}; + +static ec_gf_op_t ec_gf8_mul_47_ops[] = { + { EC_GF_OP_XOR3, 8, 0, 1 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_47 = { + 9, + { 2, 3, 4, 5, 6, 7, 0, 1, 8, }, 
+ ec_gf8_mul_47_ops +}; + +static ec_gf_op_t ec_gf8_mul_48_ops[] = { + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_48 = { + 8, + { 4, 5, 6, 0, 1, 3, 7, 2, }, + ec_gf8_mul_48_ops +}; + +static ec_gf_op_t ec_gf8_mul_49_ops[] = { + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR3, 8, 0, 6 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR3, 1, 8, 5 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_49 = { + 9, + { 7, 2, 4, 0, 3, 5, 1, 6, 8, }, + ec_gf8_mul_49_ops +}; + +static ec_gf_op_t ec_gf8_mul_4A_ops[] = { + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_4A = { + 8, + { 5, 6, 7, 0, 1, 3, 4, 2, }, + ec_gf8_mul_4A_ops +}; + +static ec_gf_op_t ec_gf8_mul_4B_ops[] = { + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR3, 8, 3, 7 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { 
EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_4B = { + 9, + { 5, 3, 6, 7, 0, 2, 4, 1, 8, }, + ec_gf8_mul_4B_ops +}; + +static ec_gf_op_t ec_gf8_mul_4C_ops[] = { + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_4C = { + 8, + { 5, 3, 4, 7, 0, 6, 2, 1, }, + ec_gf8_mul_4C_ops +}; + +static ec_gf_op_t ec_gf8_mul_4D_ops[] = { + { EC_GF_OP_COPY, 8, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR3, 9, 3, 1 }, + { EC_GF_OP_XOR2, 5, 9, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR3, 0, 8, 2 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_4D = { + 10, + { 0, 9, 3, 5, 6, 4, 7, 1, 2, 8, }, + ec_gf8_mul_4D_ops +}; + +static ec_gf_op_t ec_gf8_mul_4E_ops[] = { + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_4E = { + 8, + { 2, 3, 0, 1, 5, 6, 7, 4, }, + 
ec_gf8_mul_4E_ops +}; + +static ec_gf_op_t ec_gf8_mul_4F_ops[] = { + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_4F = { + 8, + { 0, 3, 5, 6, 1, 2, 7, 4, }, + ec_gf8_mul_4F_ops +}; + +static ec_gf_op_t ec_gf8_mul_50_ops[] = { + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_50 = { + 8, + { 4, 5, 7, 3, 0, 1, 2, 6, }, + ec_gf8_mul_50_ops +}; + +static ec_gf_op_t ec_gf8_mul_51_ops[] = { + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_51 = { + 8, + { 0, 1, 7, 2, 3, 4, 5, 6, }, + ec_gf8_mul_51_ops +}; + +static ec_gf_op_t ec_gf8_mul_52_ops[] = { + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_COPY, 9, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 1, 
2, 0 }, + { EC_GF_OP_XOR3, 3, 5, 8 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 2, 9, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_52 = { + 10, + { 2, 3, 1, 4, 6, 7, 0, 5, 8, 9, }, + ec_gf8_mul_52_ops +}; + +static ec_gf_op_t ec_gf8_mul_53_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_53 = { + 8, + { 2, 0, 1, 4, 5, 6, 7, 3, }, + ec_gf8_mul_53_ops +}; + +static ec_gf_op_t ec_gf8_mul_54_ops[] = { + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_54 = { + 8, + { 7, 3, 0, 4, 2, 6, 5, 1, }, + ec_gf8_mul_54_ops +}; + +static ec_gf_op_t ec_gf8_mul_55_ops[] = { + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_55 = { + 8, + { 1, 5, 6, 4, 3, 7, 2, 0, }, + ec_gf8_mul_55_ops +}; + +static ec_gf_op_t ec_gf8_mul_56_ops[] = { + { 
EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_56 = { + 8, + { 2, 3, 0, 4, 5, 6, 7, 1, }, + ec_gf8_mul_56_ops +}; + +static ec_gf_op_t ec_gf8_mul_57_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_57 = { + 8, + { 2, 3, 0, 1, 4, 5, 6, 7, }, + ec_gf8_mul_57_ops +}; + +static ec_gf_op_t ec_gf8_mul_58_ops[] = { + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_58 = { + 8, + { 4, 3, 2, 7, 0, 1, 5, 6, }, + ec_gf8_mul_58_ops +}; + +static ec_gf_op_t ec_gf8_mul_59_ops[] = { + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + 
{ EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_59 = { + 8, + { 7, 3, 5, 6, 1, 2, 0, 4, }, + ec_gf8_mul_59_ops +}; + +static ec_gf_op_t ec_gf8_mul_5A_ops[] = { + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_5A = { + 8, + { 6, 7, 0, 1, 2, 3, 5, 4, }, + ec_gf8_mul_5A_ops +}; + +static ec_gf_op_t ec_gf8_mul_5B_ops[] = { + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_5B = { + 8, + { 6, 0, 7, 5, 2, 1, 3, 4, }, + ec_gf8_mul_5B_ops +}; + +static ec_gf_op_t ec_gf8_mul_5C_ops[] = { + { EC_GF_OP_COPY, 8, 3, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 2, 8, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_5C = { + 9, + { 7, 5, 2, 4, 1, 0, 6, 3, 8, }, + ec_gf8_mul_5C_ops +}; + +static 
ec_gf_op_t ec_gf8_mul_5D_ops[] = { + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_5D = { + 8, + { 1, 3, 5, 4, 6, 7, 2, 0, }, + ec_gf8_mul_5D_ops +}; + +static ec_gf_op_t ec_gf8_mul_5E_ops[] = { + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_5E = { + 8, + { 4, 3, 6, 2, 5, 7, 0, 1, }, + ec_gf8_mul_5E_ops +}; + +static ec_gf_op_t ec_gf8_mul_5F_ops[] = { + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_5F = { + 8, + { 6, 1, 3, 4, 5, 7, 2, 0, }, + ec_gf8_mul_5F_ops +}; + +static ec_gf_op_t ec_gf8_mul_60_ops[] = { + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 3, 
7, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_60 = { + 8, + { 2, 3, 4, 7, 5, 6, 0, 1, }, + ec_gf8_mul_60_ops +}; + +static ec_gf_op_t ec_gf8_mul_61_ops[] = { + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_61 = { + 8, + { 0, 5, 6, 7, 4, 2, 1, 3, }, + ec_gf8_mul_61_ops +}; + +static ec_gf_op_t ec_gf8_mul_62_ops[] = { + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_62 = { + 8, + { 2, 0, 3, 4, 5, 6, 7, 1, }, + ec_gf8_mul_62_ops +}; + +static ec_gf_op_t ec_gf8_mul_63_ops[] = { + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static 
ec_gf_mul_t ec_gf8_mul_63 = { + 8, + { 3, 4, 6, 5, 7, 0, 1, 2, }, + ec_gf8_mul_63_ops +}; + +static ec_gf_op_t ec_gf8_mul_64_ops[] = { + { EC_GF_OP_COPY, 8, 1, 0 }, + { EC_GF_OP_XOR2, 8, 0, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 8, 7, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_64 = { + 9, + { 2, 3, 4, 6, 5, 7, 8, 1, 0, }, + ec_gf8_mul_64_ops +}; + +static ec_gf_op_t ec_gf8_mul_65_ops[] = { + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_65 = { + 8, + { 2, 5, 1, 3, 4, 0, 6, 7, }, + ec_gf8_mul_65_ops +}; + +static ec_gf_op_t ec_gf8_mul_66_ops[] = { + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_66 = { + 8, + { 2, 3, 1, 4, 5, 7, 0, 6, }, + ec_gf8_mul_66_ops +}; + +static ec_gf_op_t ec_gf8_mul_67_ops[] = { + { 
EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_67 = { + 8, + { 2, 4, 5, 6, 7, 3, 1, 0, }, + ec_gf8_mul_67_ops +}; + +static ec_gf_op_t ec_gf8_mul_68_ops[] = { + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_68 = { + 8, + { 5, 7, 2, 3, 0, 6, 4, 1, }, + ec_gf8_mul_68_ops +}; + +static ec_gf_op_t ec_gf8_mul_69_ops[] = { + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_69 = { + 8, + { 0, 1, 3, 2, 4, 5, 7, 6, }, + ec_gf8_mul_69_ops +}; + +static ec_gf_op_t ec_gf8_mul_6A_ops[] = { + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + 
{ EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_6A = { + 8, + { 5, 7, 4, 6, 1, 2, 0, 3, }, + ec_gf8_mul_6A_ops +}; + +static ec_gf_op_t ec_gf8_mul_6B_ops[] = { + { EC_GF_OP_COPY, 8, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 0, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_6B = { + 9, + { 6, 7, 2, 0, 3, 1, 5, 4, 8, }, + ec_gf8_mul_6B_ops +}; + +static ec_gf_op_t ec_gf8_mul_6C_ops[] = { + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_6C = { + 8, + { 5, 6, 7, 0, 1, 2, 3, 4, }, + ec_gf8_mul_6C_ops +}; + +static ec_gf_op_t ec_gf8_mul_6D_ops[] = { + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR3, 8, 3, 4 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 0, 8, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 6, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_6D = { + 9, + { 3, 6, 7, 0, 4, 5, 1, 2, 8, }, + 
ec_gf8_mul_6D_ops +}; + +static ec_gf_op_t ec_gf8_mul_6E_ops[] = { + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_6E = { + 8, + { 5, 6, 3, 1, 7, 2, 0, 4, }, + ec_gf8_mul_6E_ops +}; + +static ec_gf_op_t ec_gf8_mul_6F_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR3, 0, 8, 7 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_6F = { + 9, + { 2, 6, 3, 7, 0, 1, 4, 5, 8, }, + ec_gf8_mul_6F_ops +}; + +static ec_gf_op_t ec_gf8_mul_70_ops[] = { + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_70 = { + 8, + { 3, 4, 5, 2, 6, 0, 1, 7, }, + ec_gf8_mul_70_ops +}; + +static ec_gf_op_t ec_gf8_mul_71_ops[] = { + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 
6, 2, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_71 = { + 8, + { 4, 7, 5, 3, 6, 0, 2, 1, }, + ec_gf8_mul_71_ops +}; + +static ec_gf_op_t ec_gf8_mul_72_ops[] = { + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_72 = { + 8, + { 0, 5, 2, 7, 4, 1, 3, 6, }, + ec_gf8_mul_72_ops +}; + +static ec_gf_op_t ec_gf8_mul_73_ops[] = { + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_73 = { + 8, + { 6, 0, 1, 7, 4, 5, 2, 3, }, + ec_gf8_mul_73_ops +}; + +static ec_gf_op_t ec_gf8_mul_74_ops[] = { + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_74 = { + 8, + { 3, 2, 1, 0, 4, 5, 6, 7, }, + ec_gf8_mul_74_ops +}; + 
+static ec_gf_op_t ec_gf8_mul_75_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_75 = { + 8, + { 4, 5, 6, 7, 0, 1, 2, 3, }, + ec_gf8_mul_75_ops +}; + +static ec_gf_op_t ec_gf8_mul_76_ops[] = { + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR3, 8, 6, 2 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 0, 8, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_76 = { + 9, + { 2, 3, 0, 6, 5, 1, 7, 8, 4, }, + ec_gf8_mul_76_ops +}; + +static ec_gf_op_t ec_gf8_mul_77_ops[] = { + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_77 = { + 8, + { 7, 4, 3, 6, 0, 1, 5, 2, }, + ec_gf8_mul_77_ops +}; + +static ec_gf_op_t ec_gf8_mul_78_ops[] = { + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR3, 8, 0, 2 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { 
EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_78 = { + 9, + { 4, 7, 3, 2, 5, 1, 6, 0, 8, }, + ec_gf8_mul_78_ops +}; + +static ec_gf_op_t ec_gf8_mul_79_ops[] = { + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR3, 8, 4, 7 }, + { EC_GF_OP_XOR2, 0, 8, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 6, 8, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_79 = { + 9, + { 4, 5, 7, 3, 1, 6, 2, 0, 8, }, + ec_gf8_mul_79_ops +}; + +static ec_gf_op_t ec_gf8_mul_7A_ops[] = { + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_7A = { + 8, + { 1, 2, 3, 4, 5, 6, 7, 0, }, + ec_gf8_mul_7A_ops +}; + +static ec_gf_op_t ec_gf8_mul_7B_ops[] = { + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR3, 8, 5, 3 }, + { EC_GF_OP_XOR2, 8, 0, 0 }, + { EC_GF_OP_COPY, 9, 4, 0 }, + { EC_GF_OP_XOR2, 8, 2, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR3, 4, 1, 9 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_7B = { + 10, + { 1, 
2, 3, 4, 8, 5, 6, 0, 7, 9, }, + ec_gf8_mul_7B_ops +}; + +static ec_gf_op_t ec_gf8_mul_7C_ops[] = { + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_7C = { + 8, + { 2, 4, 1, 6, 3, 5, 7, 0, }, + ec_gf8_mul_7C_ops +}; + +static ec_gf_op_t ec_gf8_mul_7D_ops[] = { + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_7D = { + 8, + { 1, 0, 3, 5, 6, 7, 2, 4, }, + ec_gf8_mul_7D_ops +}; + +static ec_gf_op_t ec_gf8_mul_7E_ops[] = { + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR3, 6, 2, 7 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 6, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_7E = { + 9, + { 5, 1, 2, 0, 7, 3, 4, 6, 8, }, + ec_gf8_mul_7E_ops +}; + +static ec_gf_op_t ec_gf8_mul_7F_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { 
EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR3, 9, 7, 5 }, + { EC_GF_OP_XOR2, 2, 9, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 6, 9, 0 }, + { EC_GF_OP_XOR3, 9, 6, 4 }, + { EC_GF_OP_XOR2, 7, 9, 0 }, + { EC_GF_OP_XOR2, 3, 9, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_7F = { + 10, + { 4, 1, 0, 5, 6, 7, 2, 3, 8, 9, }, + ec_gf8_mul_7F_ops +}; + +static ec_gf_op_t ec_gf8_mul_80_ops[] = { + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_80 = { + 8, + { 7, 5, 6, 4, 1, 2, 3, 0, }, + ec_gf8_mul_80_ops +}; + +static ec_gf_op_t ec_gf8_mul_81_ops[] = { + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_81 = { + 8, + { 2, 7, 4, 1, 5, 6, 3, 0, }, + ec_gf8_mul_81_ops +}; + +static ec_gf_op_t ec_gf8_mul_82_ops[] = { + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_COPY, 8, 6, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 2, 7, 
0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR3, 5, 8, 7 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_82 = { + 9, + { 6, 2, 7, 5, 1, 3, 4, 0, 8, }, + ec_gf8_mul_82_ops +}; + +static ec_gf_op_t ec_gf8_mul_83_ops[] = { + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_83 = { + 8, + { 3, 5, 6, 7, 1, 2, 4, 0, }, + ec_gf8_mul_83_ops +}; + +static ec_gf_op_t ec_gf8_mul_84_ops[] = { + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_84 = { + 8, + { 7, 6, 0, 4, 1, 5, 3, 2, }, + ec_gf8_mul_84_ops +}; + +static ec_gf_op_t ec_gf8_mul_85_ops[] = { + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static 
ec_gf_mul_t ec_gf8_mul_85 = { + 8, + { 7, 6, 0, 3, 2, 4, 5, 1, }, + ec_gf8_mul_85_ops +}; + +static ec_gf_op_t ec_gf8_mul_86_ops[] = { + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_86 = { + 8, + { 1, 2, 6, 4, 5, 7, 3, 0, }, + ec_gf8_mul_86_ops +}; + +static ec_gf_op_t ec_gf8_mul_87_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_COPY, 8, 1, 0 }, + { EC_GF_OP_XOR2, 8, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR3, 5, 8, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 2, 8, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_87 = { + 9, + { 1, 2, 3, 4, 5, 7, 6, 0, 8, }, + ec_gf8_mul_87_ops +}; + +static ec_gf_op_t ec_gf8_mul_88_ops[] = { + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_88 = { + 8, + { 6, 7, 3, 1, 2, 4, 5, 0, }, + ec_gf8_mul_88_ops +}; + +static ec_gf_op_t ec_gf8_mul_89_ops[] = { + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { 
EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR3, 8, 5, 2 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 0, 8, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_89 = { + 9, + { 2, 1, 6, 5, 7, 3, 4, 0, 8, }, + ec_gf8_mul_89_ops +}; + +static ec_gf_op_t ec_gf8_mul_8A_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_8A = { + 8, + { 1, 2, 3, 0, 6, 7, 4, 5, }, + ec_gf8_mul_8A_ops +}; + +static ec_gf_op_t ec_gf8_mul_8B_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_8B = { + 8, + { 6, 1, 2, 3, 5, 7, 4, 0, }, + ec_gf8_mul_8B_ops +}; + +static ec_gf_op_t ec_gf8_mul_8C_ops[] = { + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_8C = { + 8, + { 1, 2, 0, 7, 3, 4, 5, 6, }, + 
ec_gf8_mul_8C_ops +}; + +static ec_gf_op_t ec_gf8_mul_8D_ops[] = { + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_8D = { + 8, + { 7, 1, 3, 2, 4, 5, 0, 6, }, + ec_gf8_mul_8D_ops +}; + +static ec_gf_op_t ec_gf8_mul_8E_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_8E = { + 8, + { 1, 2, 3, 4, 5, 6, 7, 0, }, + ec_gf8_mul_8E_ops +}; + +static ec_gf_op_t ec_gf8_mul_8F_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_8F = { + 8, + { 1, 2, 3, 4, 5, 6, 7, 0, }, + ec_gf8_mul_8F_ops +}; + +static ec_gf_op_t ec_gf8_mul_90_ops[] = { + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_90 = { + 8, + { 4, 5, 6, 7, 0, 1, 3, 2, }, + ec_gf8_mul_90_ops +}; + +static ec_gf_op_t ec_gf8_mul_91_ops[] = { + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_COPY, 9, 1, 0 }, + { EC_GF_OP_COPY, 8, 3, 
0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 9, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR3, 5, 8, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_91 = { + 10, + { 2, 3, 1, 4, 0, 6, 7, 5, 8, 9, }, + ec_gf8_mul_91_ops +}; + +static ec_gf_op_t ec_gf8_mul_92_ops[] = { + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_92 = { + 8, + { 6, 7, 0, 1, 2, 3, 5, 4, }, + ec_gf8_mul_92_ops +}; + +static ec_gf_op_t ec_gf8_mul_93_ops[] = { + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_93 = { + 8, + { 6, 4, 5, 1, 7, 2, 3, 0, }, + ec_gf8_mul_93_ops +}; + +static ec_gf_op_t ec_gf8_mul_94_ops[] = { + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { 
EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_94 = { + 8, + { 7, 5, 0, 2, 6, 1, 3, 4, }, + ec_gf8_mul_94_ops +}; + +static ec_gf_op_t ec_gf8_mul_95_ops[] = { + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_95 = { + 8, + { 7, 6, 1, 3, 0, 4, 5, 2, }, + ec_gf8_mul_95_ops +}; + +static ec_gf_op_t ec_gf8_mul_96_ops[] = { + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR3, 8, 0, 4 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 8, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_96 = { + 9, + { 4, 0, 1, 6, 7, 2, 3, 5, 8, }, + ec_gf8_mul_96_ops +}; + +static ec_gf_op_t ec_gf8_mul_97_ops[] = { + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_COPY, 8, 2, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 8, 6, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_97 = { + 9, + { 4, 5, 3, 
6, 7, 1, 2, 0, 8, }, + ec_gf8_mul_97_ops +}; + +static ec_gf_op_t ec_gf8_mul_98_ops[] = { + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_98 = { + 8, + { 4, 2, 3, 6, 7, 5, 1, 0, }, + ec_gf8_mul_98_ops +}; + +static ec_gf_op_t ec_gf8_mul_99_ops[] = { + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_99 = { + 8, + { 6, 5, 3, 7, 0, 1, 4, 2, }, + ec_gf8_mul_99_ops +}; + +static ec_gf_op_t ec_gf8_mul_9A_ops[] = { + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR3, 8, 4, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_9A = { + 9, + { 6, 3, 4, 0, 5, 1, 2, 7, 8, }, + ec_gf8_mul_9A_ops +}; + +static ec_gf_op_t ec_gf8_mul_9B_ops[] = { + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_COPY, 9, 5, 0 }, + { EC_GF_OP_XOR2, 7, 0, 
0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR3, 8, 3, 2 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 3, 9, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_9B = { + 10, + { 4, 5, 8, 6, 7, 1, 2, 0, 3, 9, }, + ec_gf8_mul_9B_ops +}; + +static ec_gf_op_t ec_gf8_mul_9C_ops[] = { + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_9C = { + 8, + { 3, 2, 1, 0, 4, 5, 6, 7, }, + ec_gf8_mul_9C_ops +}; + +static ec_gf_op_t ec_gf8_mul_9D_ops[] = { + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_9D = { + 8, + { 0, 1, 2, 3, 7, 4, 5, 6, }, + ec_gf8_mul_9D_ops +}; + +static ec_gf_op_t ec_gf8_mul_9E_ops[] = { + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_COPY, 8, 7, 0 }, + { EC_GF_OP_XOR2, 8, 5, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 0, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_9E = { + 9, + { 4, 5, 3, 8, 6, 0, 2, 7, 1, }, + ec_gf8_mul_9E_ops +}; + 
+static ec_gf_op_t ec_gf8_mul_9F_ops[] = { + { EC_GF_OP_XOR3, 8, 1, 2 }, + { EC_GF_OP_XOR2, 8, 3, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 6, 8, 0 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_9F = { + 9, + { 4, 5, 6, 7, 0, 1, 2, 3, 8, }, + ec_gf8_mul_9F_ops +}; + +static ec_gf_op_t ec_gf8_mul_A0_ops[] = { + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A0 = { + 8, + { 3, 1, 6, 7, 5, 2, 4, 0, }, + ec_gf8_mul_A0_ops +}; + +static ec_gf_op_t ec_gf8_mul_A1_ops[] = { + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR3, 8, 0, 6 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A1 = { + 9, + { 7, 4, 1, 5, 6, 0, 2, 3, 8, }, + ec_gf8_mul_A1_ops +}; + +static ec_gf_op_t ec_gf8_mul_A2_ops[] = { + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { 
EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A2 = { + 8, + { 7, 0, 6, 3, 2, 1, 4, 5, }, + ec_gf8_mul_A2_ops +}; + +static ec_gf_op_t ec_gf8_mul_A3_ops[] = { + { EC_GF_OP_COPY, 8, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A3 = { + 9, + { 3, 7, 2, 6, 1, 4, 0, 5, 8, }, + ec_gf8_mul_A3_ops +}; + +static ec_gf_op_t ec_gf8_mul_A4_ops[] = { + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A4 = { + 8, + { 5, 6, 7, 2, 4, 3, 0, 1, }, + ec_gf8_mul_A4_ops +}; + +static ec_gf_op_t ec_gf8_mul_A5_ops[] = { + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR3, 8, 5, 6 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t 
ec_gf8_mul_A5 = { + 9, + { 1, 4, 2, 5, 6, 7, 3, 0, 8, }, + ec_gf8_mul_A5_ops +}; + +static ec_gf_op_t ec_gf8_mul_A6_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A6 = { + 8, + { 1, 2, 0, 3, 4, 5, 6, 7, }, + ec_gf8_mul_A6_ops +}; + +static ec_gf_op_t ec_gf8_mul_A7_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A7 = { + 8, + { 0, 1, 2, 5, 6, 7, 3, 4, }, + ec_gf8_mul_A7_ops +}; + +static ec_gf_op_t ec_gf8_mul_A8_ops[] = { + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 8, 1, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_COPY, 9, 4, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 8, 3, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 9, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A8 = { + 10, + { 1, 7, 5, 8, 6, 3, 4, 0, 2, 9, }, + ec_gf8_mul_A8_ops +}; + +static ec_gf_op_t ec_gf8_mul_A9_ops[] = { + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { 
EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A9 = { + 8, + { 3, 7, 6, 1, 2, 0, 4, 5, }, + ec_gf8_mul_A9_ops +}; + +static ec_gf_op_t ec_gf8_mul_AA_ops[] = { + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_AA = { + 8, + { 0, 4, 5, 3, 6, 7, 1, 2, }, + ec_gf8_mul_AA_ops +}; + +static ec_gf_op_t ec_gf8_mul_AB_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_COPY, 9, 6, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 8, 7, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR3, 3, 9, 7 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_AB = { + 10, + { 2, 3, 8, 0, 5, 6, 1, 4, 7, 9, }, + ec_gf8_mul_AB_ops +}; + +static ec_gf_op_t ec_gf8_mul_AC_ops[] = { + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_AC = { + 8, + { 3, 
2, 1, 0, 4, 5, 6, 7, }, + ec_gf8_mul_AC_ops +}; + +static ec_gf_op_t ec_gf8_mul_AD_ops[] = { + { EC_GF_OP_XOR3, 8, 1, 2 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_AD = { + 9, + { 3, 4, 5, 6, 7, 0, 1, 2, 8, }, + ec_gf8_mul_AD_ops +}; + +static ec_gf_op_t ec_gf8_mul_AE_ops[] = { + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_COPY, 8, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_AE = { + 9, + { 7, 0, 5, 6, 3, 4, 1, 2, 8, }, + ec_gf8_mul_AE_ops +}; + +static ec_gf_op_t ec_gf8_mul_AF_ops[] = { + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_AF = { + 8, + { 0, 1, 2, 7, 3, 4, 5, 6, }, + ec_gf8_mul_AF_ops +}; + +static ec_gf_op_t ec_gf8_mul_B0_ops[] = { + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 
1, 7, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B0 = { + 8, + { 4, 0, 7, 2, 3, 1, 6, 5, }, + ec_gf8_mul_B0_ops +}; + +static ec_gf_op_t ec_gf8_mul_B1_ops[] = { + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_COPY, 8, 4, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR3, 5, 8, 1 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B1 = { + 9, + { 2, 6, 4, 7, 0, 1, 3, 5, 8, }, + ec_gf8_mul_B1_ops +}; + +static ec_gf_op_t ec_gf8_mul_B2_ops[] = { + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR3, 8, 4, 5 }, + { EC_GF_OP_XOR2, 2, 8, 0 }, + { EC_GF_OP_XOR2, 8, 1, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B2 = { + 9, + { 0, 7, 4, 5, 6, 1, 2, 3, 8, }, + ec_gf8_mul_B2_ops +}; + +static ec_gf_op_t ec_gf8_mul_B3_ops[] = { + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_COPY, 9, 5, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR3, 8, 6, 4 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 8, 5, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 0, 8, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR3, 1, 9, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B3 = { + 10, + { 2, 3, 4, 5, 1, 6, 
0, 7, 8, 9, }, + ec_gf8_mul_B3_ops +}; + +static ec_gf_op_t ec_gf8_mul_B4_ops[] = { + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B4 = { + 8, + { 5, 6, 7, 0, 1, 2, 3, 4, }, + ec_gf8_mul_B4_ops +}; + +static ec_gf_op_t ec_gf8_mul_B5_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_COPY, 8, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR3, 4, 8, 3 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B5 = { + 9, + { 3, 4, 0, 7, 1, 5, 6, 2, 8, }, + ec_gf8_mul_B5_ops +}; + +static ec_gf_op_t ec_gf8_mul_B6_ops[] = { + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B6 = { + 8, + { 5, 3, 6, 4, 7, 0, 1, 2, }, + ec_gf8_mul_B6_ops +}; + +static ec_gf_op_t ec_gf8_mul_B7_ops[] = { + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + 
{ EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B7 = { + 8, + { 5, 0, 1, 4, 2, 6, 7, 3, }, + ec_gf8_mul_B7_ops +}; + +static ec_gf_op_t ec_gf8_mul_B8_ops[] = { + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B8 = { + 8, + { 6, 4, 5, 1, 2, 0, 7, 3, }, + ec_gf8_mul_B8_ops +}; + +static ec_gf_op_t ec_gf8_mul_B9_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR3, 0, 8, 2 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B9 = { + 9, + { 6, 7, 0, 2, 1, 4, 5, 3, 8, }, + ec_gf8_mul_B9_ops +}; + +static ec_gf_op_t ec_gf8_mul_BA_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } 
+}; + +static ec_gf_mul_t ec_gf8_mul_BA = { + 8, + { 1, 2, 4, 3, 5, 6, 0, 7, }, + ec_gf8_mul_BA_ops +}; + +static ec_gf_op_t ec_gf8_mul_BB_ops[] = { + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_COPY, 8, 3, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 8, 5, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 8, 7, 0 }, + { EC_GF_OP_XOR2, 2, 8, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_BB = { + 9, + { 7, 2, 1, 8, 3, 5, 6, 4, 0, }, + ec_gf8_mul_BB_ops +}; + +static ec_gf_op_t ec_gf8_mul_BC_ops[] = { + { EC_GF_OP_COPY, 8, 1, 0 }, + { EC_GF_OP_XOR2, 8, 2, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR3, 2, 8, 4 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_BC = { + 9, + { 2, 6, 3, 4, 5, 1, 7, 0, 8, }, + ec_gf8_mul_BC_ops +}; + +static ec_gf_op_t ec_gf8_mul_BD_ops[] = { + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_BD = { + 8, + { 4, 5, 0, 2, 7, 1, 6, 3, }, + ec_gf8_mul_BD_ops +}; + +static ec_gf_op_t 
ec_gf8_mul_BE_ops[] = { + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_BE = { + 8, + { 0, 6, 7, 4, 5, 1, 3, 2, }, + ec_gf8_mul_BE_ops +}; + +static ec_gf_op_t ec_gf8_mul_BF_ops[] = { + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_BF = { + 8, + { 5, 6, 1, 7, 3, 0, 2, 4, }, + ec_gf8_mul_BF_ops +}; + +static ec_gf_op_t ec_gf8_mul_C0_ops[] = { + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C0 = { + 8, + { 1, 2, 3, 4, 7, 5, 6, 0, }, + ec_gf8_mul_C0_ops +}; + +static ec_gf_op_t ec_gf8_mul_C1_ops[] = { + { EC_GF_OP_XOR3, 8, 1, 2 }, + { EC_GF_OP_XOR2, 8, 3, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { 
EC_GF_OP_XOR2, 6, 8, 0 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C1 = { + 9, + { 5, 6, 7, 4, 1, 2, 3, 0, 8, }, + ec_gf8_mul_C1_ops +}; + +static ec_gf_op_t ec_gf8_mul_C2_ops[] = { + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C2 = { + 8, + { 7, 6, 3, 0, 1, 4, 5, 2, }, + ec_gf8_mul_C2_ops +}; + +static ec_gf_op_t ec_gf8_mul_C3_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR3, 0, 2, 6 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR3, 9, 1, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 7, 9, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C3 = { + 10, + { 5, 6, 4, 7, 1, 2, 3, 0, 8, 9, }, + ec_gf8_mul_C3_ops +}; + +static ec_gf_op_t ec_gf8_mul_C4_ops[] = { + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C4 = { + 8, + { 
0, 2, 1, 3, 4, 5, 6, 7, }, + ec_gf8_mul_C4_ops +}; + +static ec_gf_op_t ec_gf8_mul_C5_ops[] = { + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C5 = { + 8, + { 4, 3, 5, 7, 6, 2, 0, 1, }, + ec_gf8_mul_C5_ops +}; + +static ec_gf_op_t ec_gf8_mul_C6_ops[] = { + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_COPY, 8, 4, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR3, 9, 5, 4 }, + { EC_GF_OP_XOR2, 6, 9, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 7, 9, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 6, 8, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C6 = { + 10, + { 6, 3, 0, 4, 5, 7, 2, 1, 8, 9, }, + ec_gf8_mul_C6_ops +}; + +static ec_gf_op_t ec_gf8_mul_C7_ops[] = { + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C7 = { + 8, + { 7, 0, 6, 2, 5, 3, 4, 1, }, + ec_gf8_mul_C7_ops +}; + +static ec_gf_op_t ec_gf8_mul_C8_ops[] = { + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { 
EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C8 = { + 8, + { 1, 3, 2, 4, 6, 7, 5, 0, }, + ec_gf8_mul_C8_ops +}; + +static ec_gf_op_t ec_gf8_mul_C9_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C9 = { + 8, + { 2, 3, 4, 5, 6, 7, 0, 1, }, + ec_gf8_mul_C9_ops +}; + +static ec_gf_op_t ec_gf8_mul_CA_ops[] = { + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_CA = { + 8, + { 1, 2, 5, 7, 3, 4, 0, 6, }, + ec_gf8_mul_CA_ops +}; + +static ec_gf_op_t ec_gf8_mul_CB_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_CB = { + 8, + { 2, 3, 4, 5, 7, 6, 0, 1, }, + 
ec_gf8_mul_CB_ops +}; + +static ec_gf_op_t ec_gf8_mul_CC_ops[] = { + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_CC = { + 8, + { 2, 7, 1, 0, 5, 6, 3, 4, }, + ec_gf8_mul_CC_ops +}; + +static ec_gf_op_t ec_gf8_mul_CD_ops[] = { + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_CD = { + 8, + { 0, 6, 1, 2, 7, 3, 4, 5, }, + ec_gf8_mul_CD_ops +}; + +static ec_gf_op_t ec_gf8_mul_CE_ops[] = { + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_COPY, 8, 7, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR3, 3, 6, 8 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR3, 8, 2, 3 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_CE = { + 9, + { 5, 7, 3, 0, 2, 6, 4, 1, 8, }, + ec_gf8_mul_CE_ops +}; + +static ec_gf_op_t ec_gf8_mul_CF_ops[] = { + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 
3, 6, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_CF = { + 8, + { 3, 6, 7, 0, 2, 4, 5, 1, }, + ec_gf8_mul_CF_ops +}; + +static ec_gf_op_t ec_gf8_mul_D0_ops[] = { + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D0 = { + 8, + { 5, 6, 7, 2, 0, 3, 1, 4, }, + ec_gf8_mul_D0_ops +}; + +static ec_gf_op_t ec_gf8_mul_D1_ops[] = { + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR3, 8, 6, 0 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D1 = { + 9, + { 5, 6, 3, 2, 0, 7, 4, 1, 8, }, + ec_gf8_mul_D1_ops +}; + +static ec_gf_op_t ec_gf8_mul_D2_ops[] = { + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { 
EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D2 = { + 8, + { 7, 0, 2, 1, 3, 4, 6, 5, }, + ec_gf8_mul_D2_ops +}; + +static ec_gf_op_t ec_gf8_mul_D3_ops[] = { + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_COPY, 8, 4, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 8, 6, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D3 = { + 9, + { 0, 3, 2, 8, 4, 6, 7, 1, 5, }, + ec_gf8_mul_D3_ops +}; + +static ec_gf_op_t ec_gf8_mul_D4_ops[] = { + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_COPY, 8, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR3, 1, 7, 8 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D4 = { + 9, + { 4, 1, 7, 5, 0, 6, 3, 2, 8, }, + ec_gf8_mul_D4_ops +}; + +static ec_gf_op_t ec_gf8_mul_D5_ops[] = { + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D5 = { + 8, + { 6, 7, 4, 5, 2, 3, 1, 0, }, + ec_gf8_mul_D5_ops +}; + +static 
ec_gf_op_t ec_gf8_mul_D6_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D6 = { + 9, + { 0, 6, 2, 7, 1, 3, 4, 5, 8, }, + ec_gf8_mul_D6_ops +}; + +static ec_gf_op_t ec_gf8_mul_D7_ops[] = { + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR3, 8, 3, 5 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 0, 8, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR3, 6, 7, 8 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D7 = { + 9, + { 3, 4, 6, 5, 0, 7, 1, 2, 8, }, + ec_gf8_mul_D7_ops +}; + +static ec_gf_op_t ec_gf8_mul_D8_ops[] = { + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D8 = { + 8, + { 4, 5, 6, 7, 0, 1, 2, 3, }, + ec_gf8_mul_D8_ops +}; + +static ec_gf_op_t ec_gf8_mul_D9_ops[] = { + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { 
EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D9 = { + 8, + { 1, 2, 6, 7, 4, 5, 0, 3, }, + ec_gf8_mul_D9_ops +}; + +static ec_gf_op_t ec_gf8_mul_DA_ops[] = { + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR3, 8, 2, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_DA = { + 9, + { 2, 5, 7, 1, 0, 4, 3, 6, 8, }, + ec_gf8_mul_DA_ops +}; + +static ec_gf_op_t ec_gf8_mul_DB_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 8, 4, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_DB = { + 9, + { 7, 5, 6, 2, 3, 4, 1, 0, 8, }, + ec_gf8_mul_DB_ops +}; + +static ec_gf_op_t ec_gf8_mul_DC_ops[] = { + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_DC = { + 8, + { 4, 5, 2, 6, 7, 1, 0, 3, }, + ec_gf8_mul_DC_ops +}; + +static 
ec_gf_op_t ec_gf8_mul_DD_ops[] = { + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_DD = { + 8, + { 1, 2, 3, 6, 7, 0, 4, 5, }, + ec_gf8_mul_DD_ops +}; + +static ec_gf_op_t ec_gf8_mul_DE_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_DE = { + 8, + { 0, 5, 2, 6, 7, 1, 3, 4, }, + ec_gf8_mul_DE_ops +}; + +static ec_gf_op_t ec_gf8_mul_DF_ops[] = { + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 8, 3, 0 }, + { EC_GF_OP_COPY, 9, 0, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 8, 7, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR3, 1, 9, 2 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_DF = { + 10, + { 7, 2, 8, 4, 3, 1, 0, 6, 5, 9, }, + ec_gf8_mul_DF_ops +}; + +static ec_gf_op_t ec_gf8_mul_E0_ops[] = { + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { 
EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E0 = { + 8, + { 2, 3, 4, 7, 5, 6, 0, 1, }, + ec_gf8_mul_E0_ops +}; + +static ec_gf_op_t ec_gf8_mul_E1_ops[] = { + { EC_GF_OP_COPY, 8, 1, 0 }, + { EC_GF_OP_XOR2, 8, 7, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR3, 9, 5, 3 }, + { EC_GF_OP_XOR2, 0, 9, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 4, 9, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 2, 8, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E1 = { + 10, + { 0, 7, 1, 3, 4, 5, 6, 2, 8, 9, }, + ec_gf8_mul_E1_ops +}; + +static ec_gf_op_t ec_gf8_mul_E2_ops[] = { + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E2 = { + 8, + { 2, 3, 7, 1, 5, 6, 0, 4, }, + ec_gf8_mul_E2_ops +}; + +static ec_gf_op_t ec_gf8_mul_E3_ops[] = { + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR3, 8, 2, 7 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 0, 8, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR3, 6, 8, 4 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t 
ec_gf8_mul_E3 = { + 9, + { 5, 4, 7, 2, 1, 3, 6, 0, 8, }, + ec_gf8_mul_E3_ops +}; + +static ec_gf_op_t ec_gf8_mul_E4_ops[] = { + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 4, 5, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E4 = { + 8, + { 7, 0, 1, 6, 3, 4, 2, 5, }, + ec_gf8_mul_E4_ops +}; + +static ec_gf_op_t ec_gf8_mul_E5_ops[] = { + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E5 = { + 9, + { 4, 5, 3, 6, 7, 1, 0, 2, 8, }, + ec_gf8_mul_E5_ops +}; + +static ec_gf_op_t ec_gf8_mul_E6_ops[] = { + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E6 = { + 8, + { 5, 4, 3, 6, 7, 0, 1, 2, }, + ec_gf8_mul_E6_ops +}; + +static ec_gf_op_t ec_gf8_mul_E7_ops[] = { + { EC_GF_OP_COPY, 8, 6, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR3, 9, 0, 6 }, + { EC_GF_OP_XOR2, 
4, 9, 0 }, + { EC_GF_OP_XOR2, 5, 9, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E7 = { + 10, + { 1, 4, 3, 6, 7, 5, 2, 0, 8, 9, }, + ec_gf8_mul_E7_ops +}; + +static ec_gf_op_t ec_gf8_mul_E8_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 1, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E8 = { + 8, + { 1, 4, 2, 7, 3, 0, 5, 6, }, + ec_gf8_mul_E8_ops +}; + +static ec_gf_op_t ec_gf8_mul_E9_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_COPY, 8, 1, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 6, 3, 0 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR3, 1, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E9 = { + 9, + { 6, 2, 0, 3, 4, 1, 5, 7, 8, }, + ec_gf8_mul_E9_ops +}; + +static ec_gf_op_t ec_gf8_mul_EA_ops[] = { + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { 
EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_EA = { + 8, + { 3, 4, 5, 6, 7, 0, 1, 2, }, + ec_gf8_mul_EA_ops +}; + +static ec_gf_op_t ec_gf8_mul_EB_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_EB = { + 8, + { 3, 4, 5, 6, 7, 0, 1, 2, }, + ec_gf8_mul_EB_ops +}; + +static ec_gf_op_t ec_gf8_mul_EC_ops[] = { + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR3, 8, 4, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_EC = { + 9, + { 7, 4, 3, 0, 2, 5, 1, 6, 8, }, + ec_gf8_mul_EC_ops +}; + +static ec_gf_op_t ec_gf8_mul_ED_ops[] = { + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_ED = { + 8, + { 5, 6, 7, 0, 1, 4, 3, 2, }, + ec_gf8_mul_ED_ops +}; + +static ec_gf_op_t ec_gf8_mul_EE_ops[] = { + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR3, 8, 2, 3 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + 
{ EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 8, 5, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_EE = { + 9, + { 6, 4, 5, 7, 2, 3, 0, 1, 8, }, + ec_gf8_mul_EE_ops +}; + +static ec_gf_op_t ec_gf8_mul_EF_ops[] = { + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_COPY, 8, 0, 0 }, + { EC_GF_OP_XOR2, 8, 2, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 6, 8, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_EF = { + 9, + { 6, 4, 5, 7, 2, 0, 3, 1, 8, }, + ec_gf8_mul_EF_ops +}; + +static ec_gf_op_t ec_gf8_mul_F0_ops[] = { + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR3, 8, 3, 6 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_XOR2, 8, 4, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 1, 8, 0 }, + { EC_GF_OP_XOR2, 0, 2, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F0 = { + 9, + { 3, 4, 6, 1, 2, 0, 5, 7, 8, }, + ec_gf8_mul_F0_ops +}; + +static ec_gf_op_t ec_gf8_mul_F1_ops[] = { + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_COPY, 8, 3, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_COPY, 9, 2, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 9, 0, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 5, 2, 0 }, + { EC_GF_OP_XOR2, 7, 9, 0 }, + { EC_GF_OP_XOR2, 4, 
9, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR3, 9, 8, 7 }, + { EC_GF_OP_XOR2, 1, 9, 0 }, + { EC_GF_OP_XOR2, 5, 9, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F1 = { + 10, + { 7, 2, 6, 3, 5, 1, 4, 0, 8, 9, }, + ec_gf8_mul_F1_ops +}; + +static ec_gf_op_t ec_gf8_mul_F2_ops[] = { + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 2, 3, 0 }, + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_XOR3, 8, 6, 4 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 8, 0 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F2 = { + 9, + { 1, 0, 6, 7, 4, 5, 2, 3, 8, }, + ec_gf8_mul_F2_ops +}; + +static ec_gf_op_t ec_gf8_mul_F3_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F3 = { + 8, + { 5, 6, 7, 0, 1, 2, 3, 4, }, + ec_gf8_mul_F3_ops +}; + +static ec_gf_op_t ec_gf8_mul_F4_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F4 = { + 8, + { 0, 1, 2, 3, 4, 5, 6, 7, }, + ec_gf8_mul_F4_ops +}; + +static ec_gf_op_t ec_gf8_mul_F5_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { 
EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F5 = { + 8, + { 7, 0, 1, 2, 3, 4, 5, 6, }, + ec_gf8_mul_F5_ops +}; + +static ec_gf_op_t ec_gf8_mul_F6_ops[] = { + { EC_GF_OP_XOR2, 3, 1, 0 }, + { EC_GF_OP_COPY, 8, 3, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_COPY, 9, 3, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 9, 4, 0 }, + { EC_GF_OP_XOR2, 4, 1, 0 }, + { EC_GF_OP_XOR2, 6, 9, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 5, 7, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR3, 7, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F6 = { + 10, + { 0, 6, 2, 7, 4, 3, 5, 9, 1, 8, }, + ec_gf8_mul_F6_ops +}; + +static ec_gf_op_t ec_gf8_mul_F7_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F7 = { + 8, + { 6, 7, 0, 1, 2, 3, 4, 5, }, + ec_gf8_mul_F7_ops +}; + +static ec_gf_op_t ec_gf8_mul_F8_ops[] = { + { EC_GF_OP_XOR2, 4, 0, 0 }, + { EC_GF_OP_XOR2, 3, 5, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F8 = { + 8, + { 6, 
2, 0, 1, 4, 5, 3, 7, }, + ec_gf8_mul_F8_ops +}; + +static ec_gf_op_t ec_gf8_mul_F9_ops[] = { + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 6, 4, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 3, 6, 0 }, + { EC_GF_OP_XOR3, 8, 7, 1 }, + { EC_GF_OP_XOR2, 1, 3, 0 }, + { EC_GF_OP_XOR2, 4, 8, 0 }, + { EC_GF_OP_XOR2, 5, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F9 = { + 9, + { 4, 1, 7, 6, 0, 3, 5, 2, 8, }, + ec_gf8_mul_F9_ops +}; + +static ec_gf_op_t ec_gf8_mul_FA_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 7, 2, 0 }, + { EC_GF_OP_XOR2, 1, 5, 0 }, + { EC_GF_OP_XOR2, 3, 7, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 0, 3, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_FA = { + 8, + { 0, 1, 2, 4, 5, 6, 7, 3, }, + ec_gf8_mul_FA_ops +}; + +static ec_gf_op_t ec_gf8_mul_FB_ops[] = { + { EC_GF_OP_XOR2, 1, 0, 0 }, + { EC_GF_OP_XOR2, 2, 1, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 3, 2, 0 }, + { EC_GF_OP_XOR2, 0, 7, 0 }, + { EC_GF_OP_XOR2, 2, 7, 0 }, + { EC_GF_OP_XOR2, 1, 6, 0 }, + { EC_GF_OP_XOR2, 7, 6, 0 }, + { EC_GF_OP_XOR2, 4, 3, 0 }, + { EC_GF_OP_XOR2, 6, 5, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 5, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_FB = { + 8, + { 4, 5, 6, 7, 0, 1, 2, 3, }, + ec_gf8_mul_FB_ops +}; + +static ec_gf_op_t ec_gf8_mul_FC_ops[] = { + { EC_GF_OP_XOR2, 7, 0, 0 }, + { EC_GF_OP_XOR2, 7, 4, 0 }, + { EC_GF_OP_XOR2, 5, 1, 0 }, + { EC_GF_OP_COPY, 9, 3, 0 }, + { EC_GF_OP_XOR3, 8, 5, 7 }, + { EC_GF_OP_XOR2, 3, 
6, 0 }, + { EC_GF_OP_XOR2, 8, 3, 0 }, + { EC_GF_OP_XOR2, 2, 8, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 3, 4, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 6, 0, 0 }, + { EC_GF_OP_XOR3, 0, 9, 2 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_FC = { + 10, + { 5, 6, 3, 7, 1, 8, 0, 4, 2, 9, }, + ec_gf8_mul_FC_ops +}; + +static ec_gf_op_t ec_gf8_mul_FD_ops[] = { + { EC_GF_OP_XOR2, 7, 1, 0 }, + { EC_GF_OP_COPY, 8, 7, 0 }, + { EC_GF_OP_XOR2, 5, 0, 0 }, + { EC_GF_OP_XOR2, 7, 5, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 2, 5, 0 }, + { EC_GF_OP_XOR2, 1, 2, 0 }, + { EC_GF_OP_XOR2, 0, 1, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR3, 1, 8, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_FD = { + 9, + { 5, 3, 7, 6, 1, 2, 4, 0, 8, }, + ec_gf8_mul_FD_ops +}; + +static ec_gf_op_t ec_gf8_mul_FE_ops[] = { + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_COPY, 8, 2, 0 }, + { EC_GF_OP_XOR2, 2, 4, 0 }, + { EC_GF_OP_XOR2, 6, 2, 0 }, + { EC_GF_OP_XOR2, 8, 5, 0 }, + { EC_GF_OP_XOR2, 5, 6, 0 }, + { EC_GF_OP_XOR2, 6, 1, 0 }, + { EC_GF_OP_XOR2, 0, 6, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 7, 8, 0 }, + { EC_GF_OP_XOR2, 3, 0, 0 }, + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR2, 0, 4, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_FE = { + 9, + { 3, 4, 8, 2, 5, 0, 6, 1, 7, }, + ec_gf8_mul_FE_ops +}; + +static ec_gf_op_t ec_gf8_mul_FF_ops[] = { + { EC_GF_OP_XOR2, 4, 7, 0 }, + { EC_GF_OP_COPY, 9, 0, 0 }, + { EC_GF_OP_COPY, 8, 4, 0 }, + { EC_GF_OP_XOR2, 9, 1, 0 }, + { EC_GF_OP_XOR2, 4, 2, 0 }, + { EC_GF_OP_XOR2, 9, 4, 0 }, + { EC_GF_OP_XOR2, 0, 5, 0 }, + { EC_GF_OP_XOR2, 2, 0, 0 }, + { EC_GF_OP_XOR2, 3, 9, 0 }, + { 
EC_GF_OP_XOR2, 7, 3, 0 }, + { EC_GF_OP_XOR2, 2, 6, 0 }, + { EC_GF_OP_XOR2, 5, 3, 0 }, + { EC_GF_OP_XOR2, 6, 7, 0 }, + { EC_GF_OP_XOR2, 1, 7, 0 }, + { EC_GF_OP_XOR3, 3, 8, 5 }, + { EC_GF_OP_XOR2, 4, 6, 0 }, + { EC_GF_OP_END, 0, 0, 0 } +}; + +static ec_gf_mul_t ec_gf8_mul_FF = { + 10, + { 6, 5, 0, 1, 2, 4, 9, 3, 7, 8, }, + ec_gf8_mul_FF_ops +}; + +ec_gf_mul_t *ec_gf8_mul[] = { + &ec_gf8_mul_00, &ec_gf8_mul_01, &ec_gf8_mul_02, &ec_gf8_mul_03, + &ec_gf8_mul_04, &ec_gf8_mul_05, &ec_gf8_mul_06, &ec_gf8_mul_07, + &ec_gf8_mul_08, &ec_gf8_mul_09, &ec_gf8_mul_0A, &ec_gf8_mul_0B, + &ec_gf8_mul_0C, &ec_gf8_mul_0D, &ec_gf8_mul_0E, &ec_gf8_mul_0F, + &ec_gf8_mul_10, &ec_gf8_mul_11, &ec_gf8_mul_12, &ec_gf8_mul_13, + &ec_gf8_mul_14, &ec_gf8_mul_15, &ec_gf8_mul_16, &ec_gf8_mul_17, + &ec_gf8_mul_18, &ec_gf8_mul_19, &ec_gf8_mul_1A, &ec_gf8_mul_1B, + &ec_gf8_mul_1C, &ec_gf8_mul_1D, &ec_gf8_mul_1E, &ec_gf8_mul_1F, + &ec_gf8_mul_20, &ec_gf8_mul_21, &ec_gf8_mul_22, &ec_gf8_mul_23, + &ec_gf8_mul_24, &ec_gf8_mul_25, &ec_gf8_mul_26, &ec_gf8_mul_27, + &ec_gf8_mul_28, &ec_gf8_mul_29, &ec_gf8_mul_2A, &ec_gf8_mul_2B, + &ec_gf8_mul_2C, &ec_gf8_mul_2D, &ec_gf8_mul_2E, &ec_gf8_mul_2F, + &ec_gf8_mul_30, &ec_gf8_mul_31, &ec_gf8_mul_32, &ec_gf8_mul_33, + &ec_gf8_mul_34, &ec_gf8_mul_35, &ec_gf8_mul_36, &ec_gf8_mul_37, + &ec_gf8_mul_38, &ec_gf8_mul_39, &ec_gf8_mul_3A, &ec_gf8_mul_3B, + &ec_gf8_mul_3C, &ec_gf8_mul_3D, &ec_gf8_mul_3E, &ec_gf8_mul_3F, + &ec_gf8_mul_40, &ec_gf8_mul_41, &ec_gf8_mul_42, &ec_gf8_mul_43, + &ec_gf8_mul_44, &ec_gf8_mul_45, &ec_gf8_mul_46, &ec_gf8_mul_47, + &ec_gf8_mul_48, &ec_gf8_mul_49, &ec_gf8_mul_4A, &ec_gf8_mul_4B, + &ec_gf8_mul_4C, &ec_gf8_mul_4D, &ec_gf8_mul_4E, &ec_gf8_mul_4F, + &ec_gf8_mul_50, &ec_gf8_mul_51, &ec_gf8_mul_52, &ec_gf8_mul_53, + &ec_gf8_mul_54, &ec_gf8_mul_55, &ec_gf8_mul_56, &ec_gf8_mul_57, + &ec_gf8_mul_58, &ec_gf8_mul_59, &ec_gf8_mul_5A, &ec_gf8_mul_5B, + &ec_gf8_mul_5C, &ec_gf8_mul_5D, &ec_gf8_mul_5E, &ec_gf8_mul_5F, + &ec_gf8_mul_60, &ec_gf8_mul_61, 
&ec_gf8_mul_62, &ec_gf8_mul_63, + &ec_gf8_mul_64, &ec_gf8_mul_65, &ec_gf8_mul_66, &ec_gf8_mul_67, + &ec_gf8_mul_68, &ec_gf8_mul_69, &ec_gf8_mul_6A, &ec_gf8_mul_6B, + &ec_gf8_mul_6C, &ec_gf8_mul_6D, &ec_gf8_mul_6E, &ec_gf8_mul_6F, + &ec_gf8_mul_70, &ec_gf8_mul_71, &ec_gf8_mul_72, &ec_gf8_mul_73, + &ec_gf8_mul_74, &ec_gf8_mul_75, &ec_gf8_mul_76, &ec_gf8_mul_77, + &ec_gf8_mul_78, &ec_gf8_mul_79, &ec_gf8_mul_7A, &ec_gf8_mul_7B, + &ec_gf8_mul_7C, &ec_gf8_mul_7D, &ec_gf8_mul_7E, &ec_gf8_mul_7F, + &ec_gf8_mul_80, &ec_gf8_mul_81, &ec_gf8_mul_82, &ec_gf8_mul_83, + &ec_gf8_mul_84, &ec_gf8_mul_85, &ec_gf8_mul_86, &ec_gf8_mul_87, + &ec_gf8_mul_88, &ec_gf8_mul_89, &ec_gf8_mul_8A, &ec_gf8_mul_8B, + &ec_gf8_mul_8C, &ec_gf8_mul_8D, &ec_gf8_mul_8E, &ec_gf8_mul_8F, + &ec_gf8_mul_90, &ec_gf8_mul_91, &ec_gf8_mul_92, &ec_gf8_mul_93, + &ec_gf8_mul_94, &ec_gf8_mul_95, &ec_gf8_mul_96, &ec_gf8_mul_97, + &ec_gf8_mul_98, &ec_gf8_mul_99, &ec_gf8_mul_9A, &ec_gf8_mul_9B, + &ec_gf8_mul_9C, &ec_gf8_mul_9D, &ec_gf8_mul_9E, &ec_gf8_mul_9F, + &ec_gf8_mul_A0, &ec_gf8_mul_A1, &ec_gf8_mul_A2, &ec_gf8_mul_A3, + &ec_gf8_mul_A4, &ec_gf8_mul_A5, &ec_gf8_mul_A6, &ec_gf8_mul_A7, + &ec_gf8_mul_A8, &ec_gf8_mul_A9, &ec_gf8_mul_AA, &ec_gf8_mul_AB, + &ec_gf8_mul_AC, &ec_gf8_mul_AD, &ec_gf8_mul_AE, &ec_gf8_mul_AF, + &ec_gf8_mul_B0, &ec_gf8_mul_B1, &ec_gf8_mul_B2, &ec_gf8_mul_B3, + &ec_gf8_mul_B4, &ec_gf8_mul_B5, &ec_gf8_mul_B6, &ec_gf8_mul_B7, + &ec_gf8_mul_B8, &ec_gf8_mul_B9, &ec_gf8_mul_BA, &ec_gf8_mul_BB, + &ec_gf8_mul_BC, &ec_gf8_mul_BD, &ec_gf8_mul_BE, &ec_gf8_mul_BF, + &ec_gf8_mul_C0, &ec_gf8_mul_C1, &ec_gf8_mul_C2, &ec_gf8_mul_C3, + &ec_gf8_mul_C4, &ec_gf8_mul_C5, &ec_gf8_mul_C6, &ec_gf8_mul_C7, + &ec_gf8_mul_C8, &ec_gf8_mul_C9, &ec_gf8_mul_CA, &ec_gf8_mul_CB, + &ec_gf8_mul_CC, &ec_gf8_mul_CD, &ec_gf8_mul_CE, &ec_gf8_mul_CF, + &ec_gf8_mul_D0, &ec_gf8_mul_D1, &ec_gf8_mul_D2, &ec_gf8_mul_D3, + &ec_gf8_mul_D4, &ec_gf8_mul_D5, &ec_gf8_mul_D6, &ec_gf8_mul_D7, + &ec_gf8_mul_D8, &ec_gf8_mul_D9, &ec_gf8_mul_DA, 
&ec_gf8_mul_DB, + &ec_gf8_mul_DC, &ec_gf8_mul_DD, &ec_gf8_mul_DE, &ec_gf8_mul_DF, + &ec_gf8_mul_E0, &ec_gf8_mul_E1, &ec_gf8_mul_E2, &ec_gf8_mul_E3, + &ec_gf8_mul_E4, &ec_gf8_mul_E5, &ec_gf8_mul_E6, &ec_gf8_mul_E7, + &ec_gf8_mul_E8, &ec_gf8_mul_E9, &ec_gf8_mul_EA, &ec_gf8_mul_EB, + &ec_gf8_mul_EC, &ec_gf8_mul_ED, &ec_gf8_mul_EE, &ec_gf8_mul_EF, + &ec_gf8_mul_F0, &ec_gf8_mul_F1, &ec_gf8_mul_F2, &ec_gf8_mul_F3, + &ec_gf8_mul_F4, &ec_gf8_mul_F5, &ec_gf8_mul_F6, &ec_gf8_mul_F7, + &ec_gf8_mul_F8, &ec_gf8_mul_F9, &ec_gf8_mul_FA, &ec_gf8_mul_FB, + &ec_gf8_mul_FC, &ec_gf8_mul_FD, &ec_gf8_mul_FE, &ec_gf8_mul_FF +}; diff --git a/xlators/cluster/ec/src/ec-gf.h b/xlators/cluster/ec/src/ec-gf8.h index 23bca91e3b5..4aca91127fc 100644 --- a/xlators/cluster/ec/src/ec-gf.h +++ b/xlators/cluster/ec/src/ec-gf8.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -11,13 +11,8 @@ #ifndef __EC_GF8_H__ #define __EC_GF8_H__ -#define EC_GF_BITS 8 -#define EC_GF_MOD 0x11D +#include "ec-galois.h" -#define EC_GF_SIZE (1 << EC_GF_BITS) -#define EC_GF_WORD_SIZE sizeof(uint64_t) - -extern void (* ec_gf_muladd[])(uint8_t * out, uint8_t * in, - unsigned int width); +extern ec_gf_mul_t *ec_gf8_mul[]; #endif /* __EC_GF8_H__ */ diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index 7fe1b2c4f8b..14255616830 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -11,21 +11,22 @@ #include "xlator.h" #include "defaults.h" #include "compat-errno.h" +#include "byte-order.h" +#include "syncop.h" +#include "syncop-utils.h" +#include "cluster-syncop.h" +#include "ec.h" +#include "ec-mem-types.h" +#include "ec-types.h" +#include "ec-messages.h" #include "ec-helpers.h" #include "ec-common.h" #include "ec-combine.h" #include "ec-method.h" #include "ec-fops.h" 
-#include "ec-mem-types.h" -#include "ec-data.h" -#include "byte-order.h" -#include "ec-messages.h" -#include "syncop.h" -#include "syncop-utils.h" -#include "cluster-syncop.h" - +#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr; }) #define EC_COUNT(array, max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res; }) #define EC_INTERSECT(dst, src1, src2, max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i]; }) #define EC_ADJUST_SOURCE(source, sources, max) ({int __i; if (sources[source] == 0) {source = -1; for (__i = 0; __i < max; __i++) if (sources[__i]) source = __i; } }) diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h index 0929044d545..4ae02e2df3c 100644 --- a/xlators/cluster/ec/src/ec-heald.h +++ b/xlators/cluster/ec/src/ec-heald.h @@ -13,32 +13,7 @@ #include "xlator.h" -struct _ec; -typedef struct _ec ec_t; - -struct subvol_healer { - xlator_t *this; - int subvol; - gf_boolean_t local; - gf_boolean_t running; - gf_boolean_t rerun; - pthread_mutex_t mutex; - pthread_cond_t cond; - pthread_t thread; -}; - -struct _ec_self_heald; -typedef struct _ec_self_heald ec_self_heald_t; - -struct _ec_self_heald { - gf_boolean_t iamshd; - gf_boolean_t enabled; - int timeout; - uint32_t max_threads; - uint32_t wait_qlength; - struct subvol_healer *index_healers; - struct subvol_healer *full_healers; -}; +#include "ec-types.h" int ec_xl_op (xlator_t *this, dict_t *input, dict_t *output); @@ -46,4 +21,5 @@ ec_xl_op (xlator_t *this, dict_t *input, dict_t *output); int ec_selfheal_daemon_init (xlator_t *this); void ec_selfheal_childup (ec_t *ec, int child); + #endif /* __EC_HEALD_H__ */ diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index 7cf8232353d..2391b2de3ae 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -12,10 +12,12 @@ #include "byte-order.h" 
+#include "ec.h" #include "ec-mem-types.h" +#include "ec-messages.h" #include "ec-fops.h" +#include "ec-method.h" #include "ec-helpers.h" -#include "ec-messages.h" static const char * ec_fop_list[] = { @@ -137,6 +139,53 @@ size_t ec_iov_copy_to(void * dst, struct iovec * vector, int32_t count, return total; } +int32_t ec_buffer_alloc(xlator_t *xl, size_t size, struct iobref **piobref, + void **ptr) +{ + struct iobref *iobref = NULL; + struct iobuf *iobuf = NULL; + int32_t ret = -ENOMEM; + + iobuf = iobuf_get_page_aligned (xl->ctx->iobuf_pool, size, + EC_METHOD_WORD_SIZE); + if (iobuf == NULL) { + goto out; + } + + iobref = *piobref; + if (iobref == NULL) { + iobref = iobref_new(); + if (iobref == NULL) { + goto out; + } + } + + ret = iobref_add(iobref, iobuf); + if (ret != 0) { + if (iobref != *piobref) { + iobref_unref(iobref); + } + iobref = NULL; + + goto out; + } + + GF_ASSERT(EC_ALIGN_CHECK(iobuf->ptr, EC_METHOD_WORD_SIZE)); + + *ptr = iobuf->ptr; + +out: + if (iobuf != NULL) { + iobuf_unref(iobuf); + } + + if (iobref != NULL) { + *piobref = iobref; + } + + return ret; +} + int32_t ec_dict_set_array(dict_t *dict, char *key, uint64_t value[], int32_t size) { diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h index 93d77726089..dfea6fef537 100644 --- a/xlators/cluster/ec/src/ec-helpers.h +++ b/xlators/cluster/ec/src/ec-helpers.h @@ -11,7 +11,10 @@ #ifndef __EC_HELPERS_H__ #define __EC_HELPERS_H__ -#include "ec-data.h" +#include "ec-types.h" + +#define EC_ALIGN_CHECK(_ptr, _align) \ + ((((uintptr_t)(_ptr)) & ((_align) - 1)) == 0) const char * ec_bin(char * str, size_t size, uint64_t value, int32_t digits); const char * ec_fop_name(int32_t id); @@ -19,7 +22,8 @@ void ec_trace(const char * event, ec_fop_data_t * fop, const char * fmt, ...); int32_t ec_bits_consume(uint64_t * n); size_t ec_iov_copy_to(void * dst, struct iovec * vector, int32_t count, off_t offset, size_t size); - +int32_t ec_buffer_alloc(xlator_t *xl, size_t size, 
struct iobref **piobref, + void **ptr); int32_t ec_dict_set_array(dict_t *dict, char *key, uint64_t *value, int32_t size); int32_t ec_dict_del_array(dict_t *dict, char *key, diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index c3d9c879eb7..6752b675273 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -11,12 +11,13 @@ #include "xlator.h" #include "defaults.h" +#include "ec.h" +#include "ec-messages.h" #include "ec-helpers.h" #include "ec-common.h" #include "ec-combine.h" #include "ec-method.h" #include "ec-fops.h" -#include "ec-messages.h" /* FOP: access */ @@ -1140,12 +1141,12 @@ out: int32_t ec_readv_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk) { - ec_cbk_data_t * ans = NULL; - struct iobref * iobref = NULL; - struct iobuf * iobuf = NULL; - uint8_t * buff = NULL, * ptr; + struct iovec vector[1]; + ec_cbk_data_t *ans = NULL; + struct iobref *iobref = NULL; + void *ptr; size_t fsize = 0, size = 0, max = 0; - int32_t i = 0, err = -ENOMEM; + int32_t pos, err = -ENOMEM; if (cbk->op_ret < 0) { err = -cbk->op_errno; @@ -1157,47 +1158,38 @@ int32_t ec_readv_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk) GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &cbk->iatt[0].ia_size)); if (cbk->op_ret > 0) { - struct iovec vector[1]; - uint8_t * blocks[cbk->count]; + void *blocks[cbk->count]; uint32_t values[cbk->count]; fsize = cbk->op_ret; size = fsize * ec->fragments; - buff = GF_MALLOC(size, gf_common_mt_char); - if (buff == NULL) { - goto out; - } - ptr = buff; - for (i = 0, ans = cbk; ans != NULL; i++, ans = ans->next) { - values[i] = ans->idx; - blocks[i] = ptr; - ptr += ec_iov_copy_to(ptr, ans->vector, ans->int32, 0, fsize); + for (ans = cbk; ans != NULL; ans = ans->next) { + pos = gf_bits_count(cbk->mask & ((1 << ans->idx) - 1)); + values[pos] = ans->idx + 1; + blocks[pos] = ans->vector[0].iov_base; + if ((ans->int32 != 1) || + 
!EC_ALIGN_CHECK(blocks[pos], EC_METHOD_WORD_SIZE)) { + if (iobref == NULL) { + err = ec_buffer_alloc(ec->xl, size, &iobref, &ptr); + if (err != 0) { + goto out; + } + } + ec_iov_copy_to(ptr, ans->vector, ans->int32, 0, fsize); + blocks[pos] = ptr; + ptr += fsize; + } } - iobref = iobref_new(); - if (iobref == NULL) { - goto out; - } - iobuf = iobuf_get2(fop->xl->ctx->iobuf_pool, size); - if (iobuf == NULL) { - goto out; - } - err = iobref_add(iobref, iobuf); + err = ec_buffer_alloc(ec->xl, size, &iobref, &ptr); if (err != 0) { goto out; } - vector[0].iov_base = iobuf->ptr; - vector[0].iov_len = ec_method_decode(fsize, ec->fragments, values, - blocks, iobuf->ptr); + ec_method_decode(&ec->matrix, fsize, cbk->mask, values, blocks, ptr); - iobuf_unref(iobuf); - - GF_FREE(buff); - buff = NULL; - - vector[0].iov_base += fop->head; - vector[0].iov_len -= fop->head; + vector[0].iov_base = ptr + fop->head; + vector[0].iov_len = size - fop->head; max = fop->offset * ec->fragments + size; if (max > cbk->iatt[0].ia_size) { @@ -1229,13 +1221,9 @@ int32_t ec_readv_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk) return 0; out: - if (iobuf != NULL) { - iobuf_unref(iobuf); - } if (iobref != NULL) { iobref_unref(iobref); } - GF_FREE(buff); return err; } diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c index 6aeda5a2481..88145d98c83 100644 --- a/xlators/cluster/ec/src/ec-inode-write.c +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -11,12 +11,13 @@ #include "xlator.h" #include "defaults.h" +#include "ec.h" +#include "ec-messages.h" #include "ec-helpers.h" #include "ec-common.h" #include "ec-combine.h" #include "ec-method.h" #include "ec-fops.h" -#include "ec-messages.h" int ec_inode_write_cbk (call_frame_t *frame, xlator_t *this, void *cookie, @@ -1285,27 +1286,78 @@ out: return -1; } +static int32_t +ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop) +{ + struct iobref *iobref = NULL; + struct iovec *iov; + void *ptr; 
+ int32_t err; + + fop->user_size = iov_length(fop->vector, fop->int32); + fop->head = ec_adjust_offset(ec, &fop->offset, 0); + fop->size = ec_adjust_size(ec, fop->user_size + fop->head, 0); + + if ((fop->int32 != 1) || (fop->head != 0) || + (fop->size > fop->user_size) || + !EC_ALIGN_CHECK(fop->vector[0].iov_base, EC_METHOD_WORD_SIZE)) { + err = ec_buffer_alloc(ec->xl, fop->size, &iobref, &ptr); + if (err != 0) { + goto out; + } + + ec_iov_copy_to(ptr + fop->head, fop->vector, fop->int32, 0, + fop->user_size); + + fop->vector[0].iov_base = ptr; + fop->vector[0].iov_len = fop->size; + + iobref_unref(fop->buffers); + fop->buffers = iobref; + } + + if (fop->int32 != 2) { + iov = GF_MALLOC(VECTORSIZE(2), gf_common_mt_iovec); + if (iov == NULL) { + err = -ENOMEM; + + goto out; + } + iov[0].iov_base = fop->vector[0].iov_base; + iov[0].iov_len = fop->vector[0].iov_len; + + GF_FREE(fop->vector); + fop->vector = iov; + } + + fop->vector[1].iov_len = fop->size / ec->fragments; + err = ec_buffer_alloc(ec->xl, fop->vector[1].iov_len * ec->nodes, + &fop->buffers, &fop->vector[1].iov_base); + if (err != 0) { + goto out; + } + + err = 0; + +out: + return err; +} + void ec_writev_start(ec_fop_data_t *fop) { ec_t *ec = fop->xl->private; - struct iobref *iobref = NULL; - struct iobuf *iobuf = NULL; - void *ptr = NULL; ec_fd_t *ctx; fd_t *fd; - size_t tail; - uint64_t current; + dict_t *xdata = NULL; + uint64_t tail, current; int32_t err = -ENOMEM; - dict_t *xdata = NULL; /* This shouldn't fail because we have the inode locked. 
*/ GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &current)); fd = fd_anonymous(fop->fd->inode); if (fd == NULL) { - ec_fop_set_error(fop, ENOMEM); - - return; + goto failed; } fop->frame->root->uid = 0; @@ -1318,38 +1370,15 @@ void ec_writev_start(ec_fop_data_t *fop) } } - fop->user_size = iov_length(fop->vector, fop->int32); - fop->head = ec_adjust_offset(ec, &fop->offset, 0); - fop->size = ec_adjust_size(ec, fop->user_size + fop->head, 0); - - iobref = iobref_new(); - if (iobref == NULL) { - goto out; - } - iobuf = iobuf_get2(fop->xl->ctx->iobuf_pool, fop->size); - if (iobuf == NULL) { - goto out; - } - err = iobref_add(iobref, iobuf); + err = ec_writev_prepare_buffers(ec, fop); if (err != 0) { - goto out; + goto failed_fd; } - ptr = iobuf->ptr + fop->head; - ec_iov_copy_to(ptr, fop->vector, fop->int32, 0, fop->user_size); - - fop->vector[0].iov_base = iobuf->ptr; - fop->vector[0].iov_len = fop->size; - - iobuf_unref(iobuf); - - iobref_unref(fop->buffers); - fop->buffers = iobref; - if (fop->head > 0) { if (ec_make_internal_fop_xdata (&xdata)) { err = -ENOMEM; - goto out; + goto failed_xdata; } ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_head, NULL, fd, ec->stripe_size, fop->offset, 0, xdata); @@ -1359,7 +1388,7 @@ void ec_writev_start(ec_fop_data_t *fop) if (current > fop->offset + fop->head + fop->user_size) { if (ec_make_internal_fop_xdata (&xdata)) { err = -ENOMEM; - goto out; + goto failed_xdata; } ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_tail, NULL, fd, ec->stripe_size, @@ -1369,24 +1398,15 @@ void ec_writev_start(ec_fop_data_t *fop) } } - fd_unref(fd); - if (xdata) - dict_unref (xdata); - - return; + err = 0; -out: - if (iobuf != NULL) { - iobuf_unref(iobuf); +failed_xdata: + if (xdata) { + dict_unref(xdata); } - if (iobref != NULL) { - iobref_unref(iobref); - } - +failed_fd: fd_unref(fd); - if (xdata) - dict_unref (xdata); - +failed: ec_fop_set_error(fop, -err); } @@ -1411,55 +1431,32 @@ void ec_wind_writev(ec_t 
* ec, ec_fop_data_t * fop, int32_t idx) ec_trace("WIND", fop, "idx=%d", idx); struct iovec vector[1]; - struct iobref * iobref = NULL; - struct iobuf * iobuf = NULL; - ssize_t size = 0, bufsize = 0; - int32_t err = -ENOMEM; + size_t size; - iobref = iobref_new(); - if (iobref == NULL) { - goto out; - } + size = fop->vector[1].iov_len; - size = fop->vector[0].iov_len; - bufsize = size / ec->fragments; - - iobuf = iobuf_get2(fop->xl->ctx->iobuf_pool, bufsize); - if (iobuf == NULL) { - goto out; - } - err = iobref_add(iobref, iobuf); - if (err != 0) { - goto out; - } - - ec_method_encode(size, ec->fragments, idx, fop->vector[0].iov_base, - iobuf->ptr); - - vector[0].iov_base = iobuf->ptr; - vector[0].iov_len = bufsize; - - iobuf_unref(iobuf); + vector[0].iov_base = fop->vector[1].iov_base + idx * size; + vector[0].iov_len = size; STACK_WIND_COOKIE(fop->frame, ec_writev_cbk, (void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->writev, fop->fd, vector, 1, fop->offset / ec->fragments, - fop->uint32, iobref, fop->xdata); - - iobref_unref(iobref); + fop->uint32, fop->buffers, fop->xdata); +} - return; +static void +ec_writev_encode(ec_fop_data_t *fop) +{ + ec_t *ec = fop->xl->private; + void *blocks[ec->nodes]; + uint32_t i; -out: - if (iobuf != NULL) { - iobuf_unref(iobuf); + blocks[0] = fop->vector[1].iov_base; + for (i = 1; i < ec->nodes; i++) { + blocks[i] = blocks[i - 1] + fop->vector[1].iov_len; } - if (iobref != NULL) { - iobref_unref(iobref); - } - - ec_writev_cbk(fop->frame, (void *)(uintptr_t)idx, fop->xl, -1, -err, NULL, - NULL, NULL); + ec_method_encode(&ec->matrix, fop->vector[0].iov_len, + fop->vector[0].iov_base, blocks); } int32_t ec_manager_writev(ec_fop_data_t *fop, int32_t state) @@ -1488,6 +1485,8 @@ int32_t ec_manager_writev(ec_fop_data_t *fop, int32_t state) fop->frame->root->uid = fop->uid; fop->frame->root->gid = fop->gid; + ec_writev_encode(fop); + ec_dispatch_all(fop); return EC_STATE_PREPARE_ANSWER; diff --git 
a/xlators/cluster/ec/src/ec-mem-types.h b/xlators/cluster/ec/src/ec-mem-types.h index df65a031590..9a4b6c58049 100644 --- a/xlators/cluster/ec/src/ec-mem-types.h +++ b/xlators/cluster/ec/src/ec-mem-types.h @@ -21,6 +21,10 @@ enum gf_ec_mem_types_ ec_mt_ec_fd_t, ec_mt_ec_heal_t, ec_mt_subvol_healer_t, + ec_mt_ec_gf_t, + ec_mt_ec_code_t, + ec_mt_ec_code_builder_t, + ec_mt_ec_matrix_t, ec_mt_end }; diff --git a/xlators/cluster/ec/src/ec-messages.h b/xlators/cluster/ec/src/ec-messages.h index 76678f8f836..dcdf50b9503 100644 --- a/xlators/cluster/ec/src/ec-messages.h +++ b/xlators/cluster/ec/src/ec-messages.h @@ -45,7 +45,7 @@ */ #define GLFS_EC_COMP_BASE GLFS_MSGID_COMP_EC -#define GLFS_NUM_MESSAGES 66 +#define GLFS_NUM_MESSAGES 73 #define GLFS_MSGID_END (GLFS_EC_COMP_BASE + GLFS_NUM_MESSAGES + 1) /* Messaged with message IDs */ #define glfs_msg_start_x GLFS_EC_COMP_BASE, "Invalid: Start of messages" @@ -520,6 +520,55 @@ */ #define EC_MSG_CONFIG_XATTR_INVALID (GLFS_EC_COMP_BASE + 66) +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_EXTENSION (GLFS_EC_COMP_BASE + 67) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_EXTENSION_NONE (GLFS_EC_COMP_BASE + 68) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_EXTENSION_UNKNOWN (GLFS_EC_COMP_BASE + 69) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_EXTENSION_UNSUPPORTED (GLFS_EC_COMP_BASE + 70) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_EXTENSION_FAILED (GLFS_EC_COMP_BASE + 71) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_NO_GF (GLFS_EC_COMP_BASE + 72) + +/*! 
+ * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_MATRIX_FAILED (GLFS_EC_COMP_BASE + 73) + /*------------*/ #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" diff --git a/xlators/cluster/ec/src/ec-method.c b/xlators/cluster/ec/src/ec-method.c index faab0115cdd..d1b122fb6a4 100644 --- a/xlators/cluster/ec/src/ec-method.c +++ b/xlators/cluster/ec/src/ec-method.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -11,149 +11,432 @@ #include <string.h> #include <inttypes.h> -#include "ec-gf.h" +#include "ec-types.h" +#include "ec-mem-types.h" +#include "ec-galois.h" +#include "ec-code.h" #include "ec-method.h" -static uint32_t GfPow[EC_GF_SIZE << 1]; -static uint32_t GfLog[EC_GF_SIZE << 1]; +static void +ec_method_matrix_normal(ec_gf_t *gf, uint32_t *matrix, uint32_t columns, + uint32_t *values, uint32_t count) +{ + uint32_t i, j, v, tmp; + + columns--; + for (i = 0; i < count; i++) { + v = *values++; + *matrix++ = tmp = ec_gf_exp(gf, v, columns); + for (j = 0; j < columns; j++) { + *matrix++ = tmp = ec_gf_div(gf, tmp, v); + } + } +} + +static void +ec_method_matrix_inverse(ec_gf_t *gf, uint32_t *matrix, uint32_t *values, + uint32_t count) +{ + uint32_t a[count]; + uint32_t i, j, p, last, tmp; + + last = count - 1; + for (i = 0; i < last; i++) { + a[i] = 1; + } + a[i] = values[0]; + for (i = last; i > 0; i--) { + for (j = i - 1; j < last; j++) { + a[j] = a[j + 1] ^ ec_gf_mul(gf, values[i], a[j]); + } + a[j] = ec_gf_mul(gf, values[i], a[j]); + } + for (i = 0; i < count; i++) { + p = a[0]; + matrix += count; + *matrix = tmp = p ^ values[i]; + for (j = 1; j < last; j++) { + matrix += count; + *matrix = tmp = a[j] ^ ec_gf_mul(gf, values[i], tmp); + p = tmp ^ ec_gf_mul(gf, values[i], p); + } + for (j = 0; j < last; j++) { + *matrix = 
ec_gf_div(gf, *matrix, p); + matrix -= count; + } + *matrix = ec_gf_div(gf, 1, p); + matrix++; + } +} -void ec_method_initialize(void) +static gf_boolean_t +ec_method_matrix_init(ec_matrix_list_t *list, ec_matrix_t *matrix, + uintptr_t mask, uint32_t *rows, gf_boolean_t inverse) { uint32_t i; - GfPow[0] = 1; - GfLog[0] = EC_GF_SIZE; - for (i = 1; i < EC_GF_SIZE; i++) - { - GfPow[i] = GfPow[i - 1] << 1; - if (GfPow[i] >= EC_GF_SIZE) - { - GfPow[i] ^= EC_GF_MOD; + matrix->refs = 1; + matrix->mask = mask; + matrix->code = list->code; + matrix->columns = list->columns; + INIT_LIST_HEAD(&matrix->lru); + + if (inverse) { + matrix->rows = list->columns; + ec_method_matrix_inverse(matrix->code->gf, matrix->values, rows, + matrix->rows); + for (i = 0; i < matrix->rows; i++) { + matrix->row_data[i].values = matrix->values + i * matrix->columns; + matrix->row_data[i].func.interleaved = + ec_code_build_interleaved(matrix->code, + EC_METHOD_WORD_SIZE, + matrix->row_data[i].values, + matrix->columns); + if (matrix->row_data[i].func.interleaved == NULL) { + return _gf_false; + } + } + } else { + matrix->rows = list->rows; + ec_method_matrix_normal(matrix->code->gf, matrix->values, + matrix->columns, rows, matrix->rows); + for (i = 0; i < matrix->rows; i++) { + matrix->row_data[i].values = matrix->values + i * matrix->columns; + matrix->row_data[i].func.linear = + ec_code_build_linear(matrix->code, EC_METHOD_WORD_SIZE, + matrix->row_data[i].values, + matrix->columns); + if (matrix->row_data[i].func.linear == NULL) { + return _gf_false; + } + } + } + + return _gf_true; +} + +static void +ec_method_matrix_release(ec_matrix_t *matrix) +{ + uint32_t i; + + for (i = 0; i < matrix->rows; i++) { + if (matrix->row_data[i].func.linear != NULL) { + ec_code_release(matrix->code, &matrix->row_data[i].func); + matrix->row_data[i].func.linear = NULL; } - GfPow[i + EC_GF_SIZE - 1] = GfPow[i]; - GfLog[GfPow[i] + EC_GF_SIZE - 1] = GfLog[GfPow[i]] = i; } } -static uint32_t ec_method_mul(uint32_t a, 
uint32_t b) +static void +ec_method_matrix_destroy(ec_matrix_list_t *list, ec_matrix_t *matrix) +{ + list_del_init(&matrix->lru); + + ec_method_matrix_release(matrix); + + mem_put(matrix); + + list->count--; +} + +static void +ec_method_matrix_unref(ec_matrix_list_t *list, ec_matrix_t *matrix) { - if (a && b) - { - return GfPow[GfLog[a] + GfLog[b]]; + if (--matrix->refs == 0) { + list_add_tail(&matrix->lru, &list->lru); + if (list->count > list->max) { + matrix = list_first_entry(&list->lru, ec_matrix_t, lru); + ec_method_matrix_destroy(list, matrix); + } } - return 0; } -static uint32_t ec_method_div(uint32_t a, uint32_t b) +static ec_matrix_t * +ec_method_matrix_lookup(ec_matrix_list_t *list, uintptr_t mask, uint32_t *pos) { - if (b) - { - if (a) - { - return GfPow[EC_GF_SIZE - 1 + GfLog[a] - GfLog[b]]; + ec_matrix_t *matrix; + uint32_t i, j, k; + + i = 0; + j = list->count; + while (i < j) { + k = (i + j) >> 1; + matrix = list->objects[k]; + if (matrix->mask == mask) { + *pos = k; + return matrix; + } + if (matrix->mask < mask) { + i = k + 1; + } else { + j = k; } - return 0; } - return EC_GF_SIZE; + *pos = i; + + return NULL; } -size_t ec_method_encode(size_t size, uint32_t columns, uint32_t row, - uint8_t * in, uint8_t * out) +static void +ec_method_matrix_remove(ec_matrix_list_t *list, uintptr_t mask) { - uint32_t i, j; + uint32_t pos; - size /= EC_METHOD_CHUNK_SIZE * columns; - row++; - for (j = 0; j < size; j++) - { - ec_gf_muladd[0](out, in, EC_METHOD_WIDTH); - in += EC_METHOD_CHUNK_SIZE; - for (i = 1; i < columns; i++) - { - ec_gf_muladd[row](out, in, EC_METHOD_WIDTH); - in += EC_METHOD_CHUNK_SIZE; + if (ec_method_matrix_lookup(list, mask, &pos) != NULL) { + list->count--; + if (pos < list->count) { + memmove(list->objects + pos, list->objects + pos + 1, + sizeof(ec_matrix_t *) * (list->count - pos)); } - out += EC_METHOD_CHUNK_SIZE; } +} + +static void +ec_method_matrix_insert(ec_matrix_list_t *list, ec_matrix_t *matrix) +{ + uint32_t pos; + + 
GF_ASSERT(ec_method_matrix_lookup(list, matrix->mask, &pos) == NULL); - return size * EC_METHOD_CHUNK_SIZE; + if (pos < list->count) { + memmove(list->objects + pos + 1, list->objects + pos, + sizeof(ec_matrix_t *) * (list->count - pos)); + } + list->objects[pos] = matrix; + list->count++; } -size_t ec_method_decode(size_t size, uint32_t columns, uint32_t * rows, - uint8_t ** in, uint8_t * out) +static ec_matrix_t * +ec_method_matrix_get(ec_matrix_list_t *list, uintptr_t mask, uint32_t *rows) { - uint32_t i, j, k, off, last, value; - uint32_t f; - uint8_t inv[EC_METHOD_MAX_FRAGMENTS][EC_METHOD_MAX_FRAGMENTS + 1]; - uint8_t mtx[EC_METHOD_MAX_FRAGMENTS][EC_METHOD_MAX_FRAGMENTS]; - uint8_t dummy[EC_METHOD_CHUNK_SIZE]; + ec_matrix_t *matrix; + uint32_t pos; + + LOCK(&list->lock); - size /= EC_METHOD_CHUNK_SIZE; + matrix = ec_method_matrix_lookup(list, mask, &pos); + if (matrix != NULL) { + list_del_init(&matrix->lru); + matrix->refs++; - memset(inv, 0, sizeof(inv)); - memset(mtx, 0, sizeof(mtx)); - memset(dummy, 0, sizeof(dummy)); - for (i = 0; i < columns; i++) - { - inv[i][i] = 1; - inv[i][columns] = 1; + goto out; } - for (i = 0; i < columns; i++) - { - mtx[i][columns - 1] = 1; - for (j = columns - 1; j > 0; j--) - { - mtx[i][j - 1] = ec_method_mul(mtx[i][j], rows[i] + 1); + + if ((list->count >= list->max) && !list_empty(&list->lru)) { + matrix = list_first_entry(&list->lru, ec_matrix_t, lru); + list_del_init(&matrix->lru); + + ec_method_matrix_remove(list, matrix->mask); + + ec_method_matrix_release(matrix); + } else { + matrix = mem_get0(list->pool); + if (matrix == NULL) { + goto out; } + matrix->values = (uint32_t *)((uintptr_t)matrix + sizeof(ec_matrix_t) + + sizeof(ec_matrix_row_t) * list->columns); } - for (i = 0; i < columns; i++) - { - f = mtx[i][i]; - for (j = 0; j < columns; j++) - { - mtx[i][j] = ec_method_div(mtx[i][j], f); - inv[i][j] = ec_method_div(inv[i][j], f); - } - for (j = 0; j < columns; j++) - { - if (i != j) - { - f = mtx[j][i]; - for (k = 
0; k < columns; k++) - { - mtx[j][k] ^= ec_method_mul(mtx[i][k], f); - inv[j][k] ^= ec_method_mul(inv[i][k], f); - } - } + if (!ec_method_matrix_init(list, matrix, mask, rows, _gf_true)) { + ec_method_matrix_unref(list, matrix); + + matrix = NULL; + + goto out; + } + + if (list->count < list->max) { + ec_method_matrix_insert(list, matrix); + } else { + matrix->mask = 0; + } + +out: + UNLOCK(&list->lock); + + return matrix; +} + +static void +ec_method_matrix_put(ec_matrix_list_t *list, ec_matrix_t *matrix) +{ + LOCK(&list->lock); + + ec_method_matrix_unref(list, matrix); + + UNLOCK(&list->lock); +} + +static gf_boolean_t +ec_method_setup(xlator_t *xl, ec_matrix_list_t *list, const char *gen) +{ + ec_matrix_t *matrix; + uint32_t values[list->rows]; + uint32_t i; + + matrix = GF_MALLOC(sizeof(ec_matrix_t) + + sizeof(ec_matrix_row_t) * list->rows + + sizeof(uint32_t) * list->columns * list->rows, + ec_mt_ec_matrix_t); + if (matrix == NULL) { + goto failed; + } + memset(matrix, 0, sizeof(ec_matrix_t)); + matrix->values = (uint32_t *)((uintptr_t)matrix + sizeof(ec_matrix_t) + + sizeof(ec_matrix_row_t) * list->rows); + + list->code = ec_code_create(list->gf, ec_code_detect(xl, gen)); + if (list->code == NULL) { + goto failed_matrix; + } + list->width = list->code->width; + + for (i = 0; i < list->rows; i++) { + values[i] = i + 1; + } + if (!ec_method_matrix_init(list, matrix, 0, values, _gf_false)) { + goto failed_code; + } + + list->encode = matrix; + + return _gf_true; + +failed_code: + ec_code_destroy(list->code); +failed_matrix: + GF_FREE(matrix); +failed: + return _gf_false; +} + +gf_boolean_t +ec_method_init(xlator_t *xl, ec_matrix_list_t *list, uint32_t columns, + uint32_t rows, uint32_t max, const char *gen) +{ + list->columns = columns; + list->rows = rows; + list->max = max; + list->stripe = EC_METHOD_CHUNK_SIZE * list->columns; + INIT_LIST_HEAD(&list->lru); + + list->pool = mem_pool_new_fn(sizeof(ec_matrix_t) + + sizeof(ec_matrix_row_t) * columns + + 
sizeof(uint32_t) * columns * columns, + 128, "ec_matrix_t"); + if (list->pool == NULL) { + goto failed; + } + + list->objects = GF_MALLOC(sizeof(ec_matrix_t *) * max, ec_mt_ec_matrix_t); + if (list->objects == NULL) { + goto failed_pool; + } + + list->gf = ec_gf_prepare(EC_GF_BITS, EC_GF_MOD); + if (list->gf == NULL) { + goto failed_objects; + } + + if (!ec_method_setup(xl, list, gen)) { + goto failed_gf; + } + + LOCK_INIT(&list->lock); + + return _gf_true; + +failed_gf: + ec_gf_destroy(list->gf); +failed_objects: + GF_FREE(list->objects); +failed_pool: + mem_pool_destroy(list->pool); +failed: + list->pool = NULL; + list->objects = NULL; + list->gf = NULL; + return _gf_false; +} + +void +ec_method_fini(ec_matrix_list_t *list) +{ + ec_matrix_t *matrix; + + if (list->encode == NULL) { + return; + } + + while (!list_empty(&list->lru)) { + matrix = list_first_entry(&list->lru, ec_matrix_t, lru); + ec_method_matrix_destroy(list, matrix); + } + + GF_ASSERT(list->count == 0); + + if (list->pool)/*Init was successful*/ + LOCK_DESTROY(&list->lock); + + ec_method_matrix_release(list->encode); + GF_FREE(list->encode); + + ec_code_destroy(list->code); + ec_gf_destroy(list->gf); + GF_FREE(list->objects); + mem_pool_destroy(list->pool); +} + +gf_boolean_t +ec_method_update(xlator_t *xl, ec_matrix_list_t *list, const char *gen) +{ + /* TODO: Allow changing code generator */ + + return _gf_true; +} + +void +ec_method_encode(ec_matrix_list_t *list, size_t size, void *in, void **out) +{ + ec_matrix_t *matrix; + size_t pos; + uint32_t i; + + matrix = list->encode; + for (pos = 0; pos < size; pos += list->stripe) { + for (i = 0; i < matrix->rows; i++) { + matrix->row_data[i].func.linear(out[i], in, pos, + matrix->row_data[i].values, + list->columns); + out[i] += EC_METHOD_CHUNK_SIZE; } } - off = 0; - for (f = 0; f < size; f++) - { - for (i = 0; i < columns; i++) - { - last = 0; - j = 0; - do - { - while (inv[i][j] == 0) - { - j++; - } - if (j < columns) - { - value = 
ec_method_div(last, inv[i][j]); - last = inv[i][j]; - ec_gf_muladd[value](out, in[j] + off, EC_METHOD_WIDTH); - j++; - } - } while (j < columns); - ec_gf_muladd[last](out, dummy, EC_METHOD_WIDTH); +} + +gf_boolean_t +ec_method_decode(ec_matrix_list_t *list, size_t size, uintptr_t mask, + uint32_t *rows, void **in, void *out) +{ + ec_matrix_t *matrix; + size_t pos; + uint32_t i; + + matrix = ec_method_matrix_get(list, mask, rows); + if (matrix == NULL) { + return _gf_false; + } + for (pos = 0; pos < size; pos += EC_METHOD_CHUNK_SIZE) { + for (i = 0; i < matrix->rows; i++) { + matrix->row_data[i].func.interleaved(out, in, pos, + matrix->row_data[i].values, + list->columns); out += EC_METHOD_CHUNK_SIZE; } - off += EC_METHOD_CHUNK_SIZE; } - return size * EC_METHOD_CHUNK_SIZE * columns; + ec_method_matrix_put(list, matrix); + + return _gf_true; } diff --git a/xlators/cluster/ec/src/ec-method.h b/xlators/cluster/ec/src/ec-method.h index 29b46e10443..818b54de872 100644 --- a/xlators/cluster/ec/src/ec-method.h +++ b/xlators/cluster/ec/src/ec-method.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es> This file is part of GlusterFS. 
This file is licensed to you under your choice of the GNU Lesser @@ -11,7 +11,15 @@ #ifndef __EC_METHOD_H__ #define __EC_METHOD_H__ -#include "ec-gf.h" +#include "xlator.h" + +#include "ec-types.h" +#include "ec-galois.h" + +#define EC_GF_BITS 8 +#define EC_GF_MOD 0x11D + +#define EC_GF_SIZE (1 << EC_GF_BITS) /* Determines the maximum size of the matrix used to encode/decode data */ #define EC_METHOD_MAX_FRAGMENTS 16 @@ -21,12 +29,18 @@ #define EC_METHOD_WORD_SIZE 64 #define EC_METHOD_CHUNK_SIZE (EC_METHOD_WORD_SIZE * EC_GF_BITS) -#define EC_METHOD_WIDTH (EC_METHOD_WORD_SIZE / EC_GF_WORD_SIZE) -void ec_method_initialize(void); -size_t ec_method_encode(size_t size, uint32_t columns, uint32_t row, - uint8_t * in, uint8_t * out); -size_t ec_method_decode(size_t size, uint32_t columns, uint32_t * rows, - uint8_t ** in, uint8_t * out); +gf_boolean_t ec_method_init(xlator_t *xl, ec_matrix_list_t *list, + uint32_t columns, uint32_t rows, uint32_t max, + const char *gen); +void ec_method_fini(ec_matrix_list_t *list); +gf_boolean_t ec_method_update(xlator_t *xl, ec_matrix_list_t *list, + const char *gen); + +void ec_method_encode(ec_matrix_list_t *list, size_t size, void *in, + void **out); +gf_boolean_t ec_method_decode(ec_matrix_list_t *list, size_t size, + uintptr_t mask, uint32_t *rows, void **in, + void *out); #endif /* __EC_METHOD_H__ */ diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h new file mode 100644 index 00000000000..29f892f01be --- /dev/null +++ b/xlators/cluster/ec/src/ec-types.h @@ -0,0 +1,580 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __EC_TYPES_H__ +#define __EC_TYPES_H__ + +#include "xlator.h" +#include "timer.h" +#include "libxlator.h" + +#define EC_GF_MAX_REGS 16 + +#define EC_CODE_SIZE (1024 * 64) + +enum _ec_read_policy; +typedef enum _ec_read_policy ec_read_policy_t; + +struct _ec_config; +typedef struct _ec_config ec_config_t; + +struct _ec_fd; +typedef struct _ec_fd ec_fd_t; + +struct _ec_inode; +typedef struct _ec_inode ec_inode_t; + +union _ec_cbk; +typedef union _ec_cbk ec_cbk_t; + +struct _ec_lock; +typedef struct _ec_lock ec_lock_t; + +struct _ec_lock_link; +typedef struct _ec_lock_link ec_lock_link_t; + +struct _ec_fop_data; +typedef struct _ec_fop_data ec_fop_data_t; + +struct _ec_cbk_data; +typedef struct _ec_cbk_data ec_cbk_data_t; + +enum _ec_gf_opcode; +typedef enum _ec_gf_opcode ec_gf_opcode_t; + +struct _ec_gf_op; +typedef struct _ec_gf_op ec_gf_op_t; + +struct _ec_gf_mul; +typedef struct _ec_gf_mul ec_gf_mul_t; + +struct _ec_gf; +typedef struct _ec_gf ec_gf_t; + +struct _ec_code_gen; +typedef struct _ec_code_gen ec_code_gen_t; + +struct _ec_code; +typedef struct _ec_code ec_code_t; + +struct _ec_code_arg; +typedef struct _ec_code_arg ec_code_arg_t; + +struct _ec_code_op; +typedef struct _ec_code_op ec_code_op_t; + +struct _ec_code_builder; +typedef struct _ec_code_builder ec_code_builder_t; + +struct _ec_code_chunk; +typedef struct _ec_code_chunk ec_code_chunk_t; + +struct _ec_code_space; +typedef struct _ec_code_space ec_code_space_t; + +typedef void (*ec_code_func_linear_t)(void *dst, void *src, uint64_t offset, + uint32_t *values, uint32_t count); + +typedef void (*ec_code_func_interleaved_t)(void *dst, void **src, + uint64_t offset, uint32_t *values, + uint32_t count); + +union _ec_code_func; +typedef union _ec_code_func ec_code_func_t; + +struct _ec_matrix_row; +typedef struct _ec_matrix_row ec_matrix_row_t; + +struct _ec_matrix; +typedef struct _ec_matrix ec_matrix_t; + +struct _ec_matrix_list; +typedef struct _ec_matrix_list ec_matrix_list_t; + 
+struct _ec_heal; +typedef struct _ec_heal ec_heal_t; + +struct _ec_self_heald; +typedef struct _ec_self_heald ec_self_heald_t; + +struct _ec; +typedef struct _ec ec_t; + +typedef void (*ec_wind_f)(ec_t *, ec_fop_data_t *, int32_t); +typedef int32_t (*ec_handler_f)(ec_fop_data_t *, int32_t); +typedef void (*ec_resume_f)(ec_fop_data_t *, int32_t); + +enum _ec_read_policy { + EC_ROUND_ROBIN, + EC_GFID_HASH, + EC_READ_POLICY_MAX +}; + +struct _ec_config { + uint32_t version; + uint8_t algorithm; + uint8_t gf_word_size; + uint8_t bricks; + uint8_t redundancy; + uint32_t chunk_size; +}; + +struct _ec_fd { + loc_t loc; + uintptr_t open; + int32_t flags; +}; + +struct _ec_inode { + ec_lock_t *inode_lock; + gf_boolean_t have_info; + gf_boolean_t have_config; + gf_boolean_t have_version; + gf_boolean_t have_size; + ec_config_t config; + uint64_t pre_version[2]; + uint64_t post_version[2]; + uint64_t pre_size; + uint64_t post_size; + uint64_t dirty[2]; + struct list_head heal; +}; + +typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t, + int32_t, uintptr_t, uintptr_t, uintptr_t, + dict_t *); +typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t, + int32_t, uintptr_t, uintptr_t, uintptr_t, + dict_t *); + +union _ec_cbk { + fop_access_cbk_t access; + fop_create_cbk_t create; + fop_discard_cbk_t discard; + fop_entrylk_cbk_t entrylk; + fop_fentrylk_cbk_t fentrylk; + fop_fallocate_cbk_t fallocate; + fop_flush_cbk_t flush; + fop_fsync_cbk_t fsync; + fop_fsyncdir_cbk_t fsyncdir; + fop_getxattr_cbk_t getxattr; + fop_fgetxattr_cbk_t fgetxattr; + fop_heal_cbk_t heal; + fop_fheal_cbk_t fheal; + fop_inodelk_cbk_t inodelk; + fop_finodelk_cbk_t finodelk; + fop_link_cbk_t link; + fop_lk_cbk_t lk; + fop_lookup_cbk_t lookup; + fop_mkdir_cbk_t mkdir; + fop_mknod_cbk_t mknod; + fop_open_cbk_t open; + fop_opendir_cbk_t opendir; + fop_readdir_cbk_t readdir; + fop_readdirp_cbk_t readdirp; + fop_readlink_cbk_t readlink; + fop_readv_cbk_t readv; + 
fop_removexattr_cbk_t removexattr; + fop_fremovexattr_cbk_t fremovexattr; + fop_rename_cbk_t rename; + fop_rmdir_cbk_t rmdir; + fop_setattr_cbk_t setattr; + fop_fsetattr_cbk_t fsetattr; + fop_setxattr_cbk_t setxattr; + fop_fsetxattr_cbk_t fsetxattr; + fop_stat_cbk_t stat; + fop_fstat_cbk_t fstat; + fop_statfs_cbk_t statfs; + fop_symlink_cbk_t symlink; + fop_truncate_cbk_t truncate; + fop_ftruncate_cbk_t ftruncate; + fop_unlink_cbk_t unlink; + fop_writev_cbk_t writev; + fop_xattrop_cbk_t xattrop; + fop_fxattrop_cbk_t fxattrop; + fop_zerofill_cbk_t zerofill; + fop_seek_cbk_t seek; +}; + +struct _ec_lock { + ec_inode_t *ctx; + gf_timer_t *timer; + + /* List of owners of this lock. All fops added to this list are running + * concurrently. */ + struct list_head owners; + + /* List of fops waiting to be an owner of the lock. Fops are added to this + * list when the current owner has an incompatible access (shared vs + * exclusive) or the lock is not acquired yet. */ + struct list_head waiting; + + /* List of fops that will wait until the next unlock/lock cycle. This + * happens when the currently acquired lock is decided to be released as + * soon as possible. In this case, all frozen fops will be continued only + * after the lock is reacquired. 
*/ + struct list_head frozen; + + int32_t exclusive; + uintptr_t mask; + uintptr_t good_mask; + uintptr_t healing; + uint32_t refs_owners; /* Refs for fops owning the lock */ + uint32_t refs_pending; /* Refs assigned to fops being prepared */ + gf_boolean_t acquired; + gf_boolean_t getting_size; + gf_boolean_t release; + gf_boolean_t query; + fd_t *fd; + loc_t loc; + union { + entrylk_type type; + struct gf_flock flock; + }; +}; + +struct _ec_lock_link { + ec_lock_t *lock; + ec_fop_data_t *fop; + struct list_head owner_list; + struct list_head wait_list; + gf_boolean_t update[2]; + loc_t *base; + uint64_t size; +}; + +struct _ec_fop_data { + int32_t id; + int32_t refs; + int32_t state; + int32_t minimum; + int32_t expected; + int32_t winds; + int32_t jobs; + int32_t error; + ec_fop_data_t *parent; + xlator_t *xl; + call_frame_t *req_frame; /* frame of the calling xlator */ + call_frame_t *frame; /* frame used by this fop */ + struct list_head cbk_list; /* sorted list of groups of answers */ + struct list_head answer_list; /* list of answers */ + struct list_head pending_list; /* member of ec_t.pending_fops */ + ec_cbk_data_t *answer; /* accepted answer */ + int32_t lock_count; + int32_t locked; + ec_lock_link_t locks[2]; + int32_t first_lock; + gf_lock_t lock; + + uint32_t flags; + uint32_t first; + uintptr_t mask; + uintptr_t healing; /*Dispatch is done but call is successful only + if fop->minimum number of subvolumes succeed + which are not healing*/ + uintptr_t remaining; + uintptr_t received; /* Mask of responses */ + uintptr_t good; + + uid_t uid; + gid_t gid; + + ec_wind_f wind; + ec_handler_f handler; + ec_resume_f resume; + ec_cbk_t cbks; + void *data; + ec_heal_t *heal; + struct list_head healer; + + uint64_t user_size; + uint32_t head; + + int32_t use_fd; + + dict_t *xdata; + dict_t *dict; + int32_t int32; + uint32_t uint32; + uint64_t size; + off_t offset; + mode_t mode[2]; + entrylk_cmd entrylk_cmd; + entrylk_type entrylk_type; + gf_xattrop_flags_t 
xattrop_flags; + dev_t dev; + inode_t *inode; + fd_t *fd; + struct iatt iatt; + char *str[2]; + loc_t loc[2]; + struct gf_flock flock; + struct iovec *vector; + struct iobref *buffers; + gf_seek_what_t seek; +}; + +struct _ec_cbk_data { + struct list_head list; /* item in the sorted list of groups */ + struct list_head answer_list; /* item in the list of answers */ + ec_fop_data_t *fop; + ec_cbk_data_t *next; /* next answer in the same group */ + int32_t idx; + int32_t op_ret; + int32_t op_errno; + int32_t count; + uintptr_t mask; + uint64_t dirty[2]; + + dict_t *xdata; + dict_t *dict; + int32_t int32; + uintptr_t uintptr[3]; + uint64_t size; + uint64_t version[2]; + inode_t *inode; + fd_t *fd; + struct statvfs statvfs; + struct iatt iatt[5]; + struct gf_flock flock; + struct iovec *vector; + struct iobref *buffers; + char *str; + gf_dirent_t entries; + off_t offset; + gf_seek_what_t what; +}; + +enum _ec_gf_opcode { + EC_GF_OP_LOAD, + EC_GF_OP_STORE, + EC_GF_OP_COPY, + EC_GF_OP_XOR2, + EC_GF_OP_XOR3, + EC_GF_OP_XORM, + EC_GF_OP_END +}; + +struct _ec_gf_op { + ec_gf_opcode_t op; + uint32_t arg1; + uint32_t arg2; + uint32_t arg3; +}; + +struct _ec_gf_mul { + uint32_t regs; + uint32_t map[EC_GF_MAX_REGS]; + ec_gf_op_t *ops; +}; + +struct _ec_gf { + uint32_t bits; + uint32_t size; + uint32_t mod; + uint32_t min_ops; + uint32_t max_ops; + uint32_t avg_ops; + uint32_t *log; + uint32_t *pow; + ec_gf_mul_t **table; +}; + +struct _ec_code_gen { + char *name; + char **flags; + uint32_t width; + + void (*prolog)(ec_code_builder_t *builder); + void (*epilog)(ec_code_builder_t *builder); + void (*load)(ec_code_builder_t *builder, uint32_t reg, uint32_t offset, + uint32_t bit); + void (*store)(ec_code_builder_t *builder, uint32_t reg, uint32_t bit); + void (*copy)(ec_code_builder_t *builder, uint32_t dst, uint32_t src); + void (*xor2)(ec_code_builder_t *builder, uint32_t dst, uint32_t src); + void (*xor3)(ec_code_builder_t *builder, uint32_t dst, uint32_t src1, + uint32_t 
src2); + void (*xorm)(ec_code_builder_t *builder, uint32_t dst, uint32_t offset, + uint32_t bit); +}; + +struct _ec_code { + gf_lock_t lock; + struct list_head spaces; + uint32_t width; + ec_gf_t *gf; + ec_code_gen_t *gen; +}; + +struct _ec_code_arg { + uint32_t value; +}; + +struct _ec_code_op { + ec_gf_opcode_t op; + ec_code_arg_t arg1; + ec_code_arg_t arg2; + ec_code_arg_t arg3; +}; + +struct _ec_code_builder { + ec_code_t *code; + uint64_t address; + uint8_t *data; + uint32_t size; + int32_t error; + uint32_t regs; + uint32_t bits; + uint32_t width; + uint32_t count; + uint32_t base; + uint32_t map[EC_GF_MAX_REGS]; + gf_boolean_t linear; + uint64_t loop; + ec_code_op_t ops[0]; +}; + +struct _ec_code_chunk { + struct list_head list; + size_t size; + ec_code_space_t *space; +}; + +struct _ec_code_space { + struct list_head list; + struct list_head chunks; + ec_code_t *code; + size_t size; +}; + + +union _ec_code_func { + ec_code_func_linear_t linear; + ec_code_func_interleaved_t interleaved; +}; + +struct _ec_matrix_row { + ec_code_func_t func; + uint32_t *values; +}; + +struct _ec_matrix { + struct list_head lru; + uint32_t refs; + uint32_t columns; + uint32_t rows; + uintptr_t mask; + ec_code_t *code; + uint32_t *values; + ec_matrix_row_t row_data[0]; +}; + +struct _ec_matrix_list { + struct list_head lru; + gf_lock_t lock; + uint32_t columns; + uint32_t rows; + uint32_t max; + uint32_t count; + uint32_t width; + uint32_t stripe; + struct mem_pool *pool; + ec_gf_t *gf; + ec_code_t *code; + ec_matrix_t *encode; + ec_matrix_t **objects; +}; + +struct _ec_heal { + struct list_head list; + gf_lock_t lock; + xlator_t *xl; + ec_fop_data_t *fop; + void *data; + ec_fop_data_t *lookup; + loc_t loc; + struct iatt iatt; + char *symlink; + fd_t *fd; + int32_t partial; + int32_t done; + int32_t error; + gf_boolean_t nameheal; + uintptr_t available; + uintptr_t good; + uintptr_t bad; + uintptr_t open; + uintptr_t fixed; + uint64_t offset; + uint64_t size; + uint64_t 
total_size; + uint64_t version[2]; + uint64_t raw_size; +}; + +struct subvol_healer { + xlator_t *this; + int subvol; + gf_boolean_t local; + gf_boolean_t running; + gf_boolean_t rerun; + pthread_mutex_t mutex; + pthread_cond_t cond; + pthread_t thread; +}; + +struct _ec_self_heald { + gf_boolean_t iamshd; + gf_boolean_t enabled; + int timeout; + uint32_t max_threads; + uint32_t wait_qlength; + struct subvol_healer *index_healers; + struct subvol_healer *full_healers; +}; + +struct _ec { + xlator_t *xl; + int32_t healers; + int32_t heal_waiters; + int32_t nodes; + int32_t bits_for_nodes; + int32_t fragments; + int32_t redundancy; + uint32_t fragment_size; + uint32_t stripe_size; + int32_t up; + uint32_t idx; + uint32_t xl_up_count; + uintptr_t xl_up; + uint32_t xl_notify_count; + uintptr_t xl_notify; + uintptr_t node_mask; + xlator_t **xl_list; + gf_lock_t lock; + gf_timer_t *timer; + gf_boolean_t shutdown; + gf_boolean_t eager_lock; + uint32_t background_heals; + uint32_t heal_wait_qlen; + struct list_head pending_fops; + struct list_head heal_waiting; + struct list_head healing; + struct mem_pool *fop_pool; + struct mem_pool *cbk_pool; + struct mem_pool *lock_pool; + ec_self_heald_t shd; + char vol_uuid[UUID_SIZE + 1]; + dict_t *leaf_to_subvolid; + ec_read_policy_t read_policy; + ec_matrix_list_t matrix; +}; + +#endif /* __EC_TYPES_H__ */ diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 659e3fd8108..2aff4374b82 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es> This file is part of GlusterFS. 
This file is licensed to you under your choice of the GNU Lesser @@ -12,13 +12,15 @@ #include "statedump.h" #include "compat-errno.h" +#include "ec.h" +#include "ec-messages.h" #include "ec-mem-types.h" +#include "ec-types.h" #include "ec-helpers.h" #include "ec-common.h" #include "ec-fops.h" #include "ec-method.h" -#include "ec.h" -#include "ec-messages.h" +#include "ec-code.h" #include "ec-heald.h" #include "events.h" @@ -27,6 +29,7 @@ static char *ec_read_policies[EC_READ_POLICY_MAX + 1] = { [EC_GFID_HASH] = "gfid-hash", [EC_READ_POLICY_MAX] = NULL }; + #define EC_MAX_FRAGMENTS EC_METHOD_MAX_FRAGMENTS /* The maximum number of nodes is derived from the maximum allowed fragments * using the rule that redundancy cannot be equal or greater than the number @@ -207,6 +210,9 @@ void __ec_destroy_private(xlator_t * this) if (ec->leaf_to_subvolid) dict_unref (ec->leaf_to_subvolid); + + ec_method_fini(&ec->matrix); + GF_FREE(ec); } } @@ -255,8 +261,12 @@ reconfigure (xlator_t *this, dict_t *options) { ec_t *ec = this->private; char *read_policy = NULL; + char *extensions = NULL; uint32_t heal_wait_qlen = 0; uint32_t background_heals = 0; + int32_t ret = -1; + + GF_OPTION_RECONF ("cpu-extensions", extensions, options, str, failed); GF_OPTION_RECONF ("self-heal-daemon", ec->shd.enabled, options, bool, failed); @@ -272,17 +282,24 @@ reconfigure (xlator_t *this, dict_t *options) int32, failed); ec_configure_background_heal_opts (ec, background_heals, heal_wait_qlen); - GF_OPTION_RECONF ("read-policy", read_policy, options, str, failed); - if (ec_assign_read_policy (ec, read_policy)) - goto failed; GF_OPTION_RECONF ("shd-max-threads", ec->shd.max_threads, options, uint32, failed); GF_OPTION_RECONF ("shd-wait-qlength", ec->shd.wait_qlength, options, uint32, failed); - return 0; + GF_OPTION_RECONF ("read-policy", read_policy, options, str, failed); + + ret = 0; + if (ec_assign_read_policy (ec, read_policy)) { + ret = -1; + } + + if (!ec_method_update(this, &ec->matrix, 
extensions)) { + ret = -1; + } + failed: - return -1; + return ret; } glusterfs_event_t @@ -554,6 +571,7 @@ init (xlator_t *this) { ec_t *ec = NULL; char *read_policy = NULL; + char *extensions = NULL; if (this->parents == NULL) { @@ -608,7 +626,16 @@ init (xlator_t *this) goto failed; } - ec_method_initialize(); + GF_OPTION_INIT("cpu-extensions", extensions, str, failed); + + if (!ec_method_init(this, &ec->matrix, ec->fragments, ec->nodes, + ec->nodes * 2, extensions)) { + gf_msg (this->name, GF_LOG_ERROR, 0, EC_MSG_MATRIX_FAILED, + "Failed to initialize matrix management"); + + goto failed; + } + GF_OPTION_INIT ("self-heal-daemon", ec->shd.enabled, bool, failed); GF_OPTION_INIT ("iam-self-heal-daemon", ec->shd.iamshd, bool, failed); GF_OPTION_INIT ("eager-lock", ec->eager_lock, bool, failed); @@ -1402,5 +1429,13 @@ struct volume_options options[] = .description = "This option can be used to control number of heals" " that can wait in SHD per subvolume" }, + { + .key = { "cpu-extensions" }, + .type = GF_OPTION_TYPE_STR, + .value = { "none", "auto", "x64", "sse", "avx" }, + .default_value = "auto", + .description = "force the cpu extensions to be used to accelerate the " + "galois field computations." + }, { } }; diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h index 49af5c2daf2..648d444f595 100644 --- a/xlators/cluster/ec/src/ec.h +++ b/xlators/cluster/ec/src/ec.h @@ -11,11 +11,6 @@ #ifndef __EC_H__ #define __EC_H__ -#include "xlator.h" -#include "timer.h" -#include "ec-heald.h" -#include "libxlator.h" - #define EC_XATTR_PREFIX "trusted.ec." 
#define EC_XATTR_CONFIG EC_XATTR_PREFIX"config" #define EC_XATTR_SIZE EC_XATTR_PREFIX"size" @@ -26,49 +21,4 @@ #define EC_VERSION_SIZE 2 #define EC_SHD_INODE_LRU_LIMIT 10 -typedef enum { - EC_ROUND_ROBIN, - EC_GFID_HASH, - EC_READ_POLICY_MAX -} ec_read_policy_t; - -struct _ec -{ - xlator_t * xl; - int32_t healers; - int32_t heal_waiters; - int32_t nodes; - int32_t bits_for_nodes; - int32_t fragments; - int32_t redundancy; - uint32_t fragment_size; - uint32_t stripe_size; - int32_t up; - uint32_t idx; - uint32_t xl_up_count; - uintptr_t xl_up; - uint32_t xl_notify_count; - uintptr_t xl_notify; - uintptr_t node_mask; - xlator_t ** xl_list; - gf_lock_t lock; - gf_timer_t * timer; - gf_boolean_t shutdown; - gf_boolean_t eager_lock; - uint32_t background_heals; - uint32_t heal_wait_qlen; - struct list_head pending_fops; - struct list_head heal_waiting; - struct list_head healing; - struct mem_pool * fop_pool; - struct mem_pool * cbk_pool; - struct mem_pool * lock_pool; - ec_self_heald_t shd; - char vol_uuid[UUID_SIZE + 1]; - dict_t *leaf_to_subvolid; - ec_read_policy_t read_policy; -}; - -void ec_pending_fops_completed(ec_t *ec); - #endif /* __EC_H__ */ diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index d87082e9e89..53e8a441a2f 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -3064,6 +3064,11 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_9_0, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "disperse.cpu-extensions", + .voltype = "cluster/disperse", + .op_version = GD_OP_VERSION_3_9_0, + .flags = OPT_FLAG_CLIENT_OPT + }, { .key = "cluster.use-compound-fops", .voltype = "cluster/replicate", .value = "off", |