From 16693dfaa7bb8508534e27e85794f348dcba0a4b Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Fri, 16 Jan 2015 16:23:53 -0800 Subject: [PATCH 01/15] port simplifying changes from decaf branch; going to make a few more as well --- src/ec_point.c | 145 +++++++++++++++++------------------------------ src/p448/field.h | 123 ---------------------------------------- 2 files changed, 52 insertions(+), 216 deletions(-) delete mode 100644 src/p448/field.h diff --git a/src/ec_point.c b/src/ec_point.c index c13279e..a625641 100644 --- a/src/ec_point.c +++ b/src/ec_point.c @@ -53,27 +53,42 @@ field_mulw_scc_wr ( field_weak_reduce(out); } +static __inline__ void +field_subx ( + struct field_t *d, + const struct field_t *a, + const struct field_t *b +) { + field_sub ( d, a, b ); + field_bias( d, 2 ); + IF32( field_weak_reduce ( d ) ); +} + +static __inline__ void +field_negx ( + struct field_t *d, + const struct field_t *a +) { + field_neg ( d, a ); + field_bias( d, 2 ); + IF32( field_weak_reduce ( d ) ); +} + void add_tw_niels_to_tw_extensible ( struct tw_extensible_t* d, const struct tw_niels_t* e ) { struct field_t L0, L1; - field_sub ( &L1, &d->y, &d->x ); - field_bias ( &L1, 2 ); - IF32( field_weak_reduce( &L1 ) ); + field_subx ( &L1, &d->y, &d->x ); field_mul ( &L0, &e->a, &L1 ); field_add ( &L1, &d->x, &d->y ); field_mul ( &d->y, &e->b, &L1 ); field_mul ( &L1, &d->u, &d->t ); field_mul ( &d->x, &e->c, &L1 ); field_add ( &d->u, &L0, &d->y ); - field_sub ( &d->t, &d->y, &L0 ); - field_bias ( &d->t, 2 ); - IF32( field_weak_reduce( &d->t ) ); - field_sub ( &d->y, &d->z, &d->x ); - field_bias ( &d->y, 2 ); - IF32( field_weak_reduce( &d->y ) ); + field_subx ( &d->t, &d->y, &L0 ); + field_subx ( &d->y, &d->z, &d->x ); field_add ( &L0, &d->x, &d->z ); field_mul ( &d->z, &L0, &d->y ); field_mul ( &d->x, &d->y, &d->t ); @@ -86,22 +101,16 @@ sub_tw_niels_from_tw_extensible ( const struct tw_niels_t* e ) { struct field_t L0, L1; - field_sub ( &L1, &d->y, &d->x ); - field_bias ( &L1, 2 ); - IF32( field_weak_reduce( &L1 ) ); + field_subx ( &L1, &d->y, &d->x ); field_mul ( &L0, &e->b, &L1 ); field_add ( &L1, &d->x, &d->y ); field_mul ( &d->y, &e->a, &L1 ); field_mul ( &L1, &d->u, &d->t ); field_mul ( &d->x, &e->c, &L1 ); field_add ( &d->u, &L0, &d->y ); - field_sub ( &d->t, &d->y, &L0 ); - field_bias ( &d->t, 2 ); - IF32( field_weak_reduce( &d->t ) ); + field_subx ( &d->t, &d->y, &L0 ); field_add ( &d->y, &d->x, &d->z ); - field_sub ( &L0, &d->z, &d->x ); - field_bias ( &L0, 2 ); - IF32( field_weak_reduce( &L0 ) ); + field_subx ( &L0, &d->z, &d->x ); field_mul ( &d->z, &L0, &d->y ); field_mul ( &d->x, &d->y, &d->t ); field_mul ( &d->y, &L0, &d->u ); @@ -142,9 +151,7 @@ double_tw_extensible ( field_sub ( &a->t, &L1, &a->u ); field_bias ( &a->t, 3 ); IF32( field_weak_reduce( &a->t ) ); - field_sub ( &L1, &L0, &L2 ); - field_bias ( &L1, 2 ); - IF32( field_weak_reduce( &L1 ) ); + field_subx ( &L1, &L0, &L2 ); field_sqr ( &a->x, &a->z ); field_bias ( &a->x, 2-is32 /*is32 ? 
1 : 2*/ ); field_add ( &a->z, &a->x, &a->x ); @@ -168,9 +175,7 @@ double_extensible ( field_sub ( &a->t, &a->u, &L1 ); field_bias ( &a->t, 3 ); IF32( field_weak_reduce( &a->t ) ); - field_sub ( &a->u, &L0, &L2 ); - field_bias ( &a->u, 2 ); - IF32( field_weak_reduce( &a->u ) ); + field_subx ( &a->u, &L0, &L2 ); field_sqr ( &a->x, &a->z ); field_bias ( &a->x, 2 ); field_add ( &a->z, &a->x, &a->x ); @@ -195,9 +200,7 @@ twist_and_double ( field_sub ( &b->t, &L0, &b->u ); field_bias ( &b->t, 3 ); IF32( field_weak_reduce( &b->t ) ); - field_sub ( &L0, &b->z, &b->x ); - field_bias ( &L0, 2 ); - IF32( field_weak_reduce( &L0 ) ); + field_subx ( &L0, &b->z, &b->x ); field_sqr ( &b->x, &a->z ); field_bias ( &b->x, 2 ); field_add ( &b->z, &b->x, &b->x ); @@ -222,9 +225,7 @@ untwist_and_double ( field_sub ( &b->t, &b->u, &L0 ); field_bias ( &b->t, 3 ); IF32( field_weak_reduce( &b->t ) ); - field_sub ( &b->u, &b->z, &b->x ); - field_bias ( &b->u, 2 ); - IF32( field_weak_reduce( &b->u ) ); + field_subx ( &b->u, &b->z, &b->x ); field_sqr ( &b->x, &a->z ); field_bias ( &b->x, 2-is32 /*is32 ? 1 : 2*/ ); field_add ( &b->z, &b->x, &b->x ); @@ -296,9 +297,7 @@ convert_tw_pniels_to_tw_extensible ( const struct tw_pniels_t* d ) { field_add ( &e->u, &d->n.b, &d->n.a ); - field_sub ( &e->t, &d->n.b, &d->n.a ); - field_bias ( &e->t, 2 ); - IF32( field_weak_reduce( &e->t ) ); + field_subx ( &e->t, &d->n.b, &d->n.a ); field_mul ( &e->x, &d->z, &e->t ); field_mul ( &e->y, &d->z, &e->u ); field_sqr ( &e->z, &d->z ); @@ -325,28 +324,20 @@ montgomery_step ( ) { struct field_t L0, L1; field_add ( &L0, &a->zd, &a->xd ); - field_sub ( &L1, &a->xd, &a->zd ); - field_bias ( &L1, 2 ); - IF32( field_weak_reduce( &L1 ) ); - field_sub ( &a->zd, &a->xa, &a->za ); - field_bias ( &a->zd, 2 ); - IF32( field_weak_reduce( &a->zd ) ); + field_subx ( &L1, &a->xd, &a->zd ); + field_subx ( &a->zd, &a->xa, &a->za ); field_mul ( &a->xd, &L0, &a->zd ); field_add ( &a->zd, &a->za, &a->xa ); field_mul ( &a->za, &L1, &a->zd ); field_add ( &a->xa, &a->za, &a->xd ); field_sqr ( &a->zd, &a->xa ); field_mul ( &a->xa, &a->z0, &a->zd ); - field_sub ( &a->zd, &a->xd, &a->za ); - field_bias ( &a->zd, 2 ); - IF32( field_weak_reduce( &a->zd ) ); + field_subx ( &a->zd, &a->xd, &a->za ); field_sqr ( &a->za, &a->zd ); field_sqr ( &a->xd, &L0 ); field_sqr ( &L0, &L1 ); field_mulw_scc ( &a->zd, &a->xd, 1-EDWARDS_D ); /* FIXME PERF MULW */ - field_sub ( &L1, &a->xd, &L0 ); - field_bias ( &L1, 2 ); - IF32( field_weak_reduce( &L1 ) ); + field_subx ( &L1, &a->xd, &L0 ); field_mul ( &a->xd, &L0, &a->zd ); field_sub ( &L0, &a->zd, &L1 ); field_bias ( &L0, 4 - 2*is32 /*is32 ? 
2 : 4*/ ); @@ -375,19 +366,13 @@ serialize_montgomery ( mask_t L4, L5, L6; struct field_t L0, L1, L2, L3; field_mul ( &L3, &a->z0, &a->zd ); - field_sub ( &L1, &L3, &a->xd ); - field_bias ( &L1, 2 ); - IF32( field_weak_reduce( &L1 ) ); + field_subx ( &L1, &L3, &a->xd ); field_mul ( &L3, &a->za, &L1 ); field_mul ( &L2, &a->z0, &a->xd ); - field_sub ( &L1, &L2, &a->zd ); - field_bias ( &L1, 2 ); - IF32( field_weak_reduce( &L1 ) ); + field_subx ( &L1, &L2, &a->zd ); field_mul ( &L0, &a->xa, &L1 ); field_add ( &L2, &L0, &L3 ); - field_sub ( &L1, &L3, &L0 ); - field_bias ( &L1, 2 ); - IF32( field_weak_reduce( &L1 ) ); + field_subx ( &L1, &L3, &L0 ); field_mul ( &L3, &L1, &L2 ); field_copy ( &L2, &a->z0 ); field_addw ( &L2, 1 ); @@ -427,9 +412,7 @@ serialize_extensible ( const struct extensible_t* a ) { struct field_t L0, L1, L2; - field_sub ( &L0, &a->y, &a->z ); - field_bias ( &L0, 2 ); - IF32( field_weak_reduce( &L0 ) ); + field_subx ( &L0, &a->y, &a->z ); field_add ( b, &a->z, &a->y ); field_mul ( &L1, &a->z, &a->x ); field_mul ( &L2, &L0, &L1 ); @@ -477,16 +460,10 @@ twist_even ( mask_t L0, L1; field_sqr ( &b->y, &a->z ); field_sqr ( &b->z, &a->x ); - field_sub ( &b->u, &b->y, &b->z ); - field_bias ( &b->u, 2 ); - IF32( field_weak_reduce( &b->u ) ); - field_sub ( &b->z, &a->z, &a->x ); - field_bias ( &b->z, 2 ); - IF32( field_weak_reduce( &b->z ) ); + field_subx ( &b->u, &b->y, &b->z ); + field_subx ( &b->z, &a->z, &a->x ); field_mul ( &b->y, &b->z, &a->y ); - field_sub ( &b->z, &a->z, &a->y ); - field_bias ( &b->z, 2 ); - IF32( field_weak_reduce( &b->z ) ); + field_subx ( &b->z, &a->z, &a->y ); field_mul ( &b->x, &b->z, &b->y ); field_mul ( &b->t, &b->x, &b->u ); field_mul ( &b->y, &b->x, &b->t ); @@ -519,13 +496,9 @@ test_only_twist ( field_add ( &b->y, &b->z, &b->z ); field_add ( &b->u, &b->y, &b->y ); IF32( field_weak_reduce( &b->u ) ); - field_sub ( &b->y, &a->z, &a->x ); - field_bias ( &b->y, 2 ); - IF32( field_weak_reduce( &b->y ) ); + field_subx ( &b->y, &a->z, &a->x ); field_mul ( &b->x, &b->y, &a->y ); - field_sub ( &b->z, &a->z, &a->y ); - field_bias ( &b->z, 2 ); - IF32( field_weak_reduce( &b->z ) ); + field_subx ( &b->z, &a->z, &a->y ); field_mul ( &b->t, &b->z, &b->x ); field_mul ( &L1, &b->t, &b->u ); field_mul ( &b->x, &b->t, &L1 ); @@ -535,14 +508,10 @@ test_only_twist ( field_mul ( &b->t, &b->x, &L1 ); field_add ( &L1, &a->y, &a->x ); IF32( field_weak_reduce( &L1 ) ); - field_sub ( &L0, &a->x, &a->y ); - field_bias ( &L0, 2 ); - IF32( field_weak_reduce( &L0 ) ); + field_subx ( &L0, &a->x, &a->y ); field_mul ( &b->x, &b->t, &L0 ); field_add ( &L0, &b->x, &L1 ); - field_sub ( &b->t, &L1, &b->x ); - field_bias ( &b->t, 2 ); - IF32( field_weak_reduce( &b->t ) ); + field_subx ( &b->t, &L1, &b->x ); field_mul ( &b->x, &L0, &b->u ); L2 = field_is_zero( &b->y ); L3 = - L2; @@ -567,9 +536,7 @@ is_even_pt ( struct field_t L0, L1, L2; field_sqr ( &L2, &a->z ); field_sqr ( &L1, &a->x ); - field_sub ( &L0, &L2, &L1 ); - field_bias ( &L0, 2 ); - field_weak_reduce( &L0 ); + field_subx ( &L0, &L2, &L1 ); return field_is_square ( &L0 ); } @@ -602,9 +569,7 @@ deserialize_affine ( IF32( field_weak_reduce( &L3 ) ); field_copy ( &a->y, &L1 ); field_subw ( &a->y, 1 ); - field_neg ( &a->x, &a->y ); - field_bias ( &a->x, 2 ); - IF32( field_weak_reduce( &a->x ) ); + field_negx ( &a->x, &a->y ); field_mul ( &a->y, &a->x, &L3 ); field_sqr ( &L2, &a->x ); field_mul ( &L0, &L2, &a->y ); @@ -641,9 +606,7 @@ deserialize_and_twist_approx ( IF32( field_weak_reduce( &a->y ) ); field_sqr ( &a->x, &a->z ); 
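/* note: here a->x = a->z^2; the subw/negx pair below computes
 * a->u = 1 - a->z^2, with field_negx (defined above in this patch)
 * folding in the bias-by-2 and the 32-bit weak reduce. */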
field_subw ( &a->x, 1 ); - field_neg ( &a->u, &a->x ); - field_bias ( &a->u, 2 ); - IF32( field_weak_reduce( &a->u ) ); + field_negx ( &a->u, &a->x ); field_mul ( &a->x, sdm1, &a->u ); field_mul ( &L0, &a->x, &a->y ); field_mul ( &a->t, &L0, &a->y ); @@ -659,9 +622,7 @@ deserialize_and_twist_approx ( field_mul ( &L0, &a->u, &a->x ); field_copy ( &a->x, &a->z ); field_subw ( &a->x, 1 ); - field_neg ( &L1, &a->x ); - field_bias ( &L1, 2 ); - IF32( field_weak_reduce( &L1 ) ); + field_negx ( &L1, &a->x ); field_mul ( &a->x, &L1, &L0 ); field_mul ( &L0, &a->u, &a->y ); field_addw ( &a->z, 1 ); @@ -772,9 +733,7 @@ elligator_2s_inject ( field_sqr ( &L3, &a->x ); field_copy ( &a->y, &L3 ); field_subw ( &a->y, 1 ); - field_neg ( &L4, &a->y ); - field_bias ( &L4, 2 ); - IF32( field_weak_reduce( &L4 ) ); + field_negx ( &L4, &a->y ); field_sqr ( &L2, &L4 ); field_mulw ( &L7, &L2, (EDWARDS_D-1)*(EDWARDS_D-1) ); field_mulw ( &L8, &L3, 4*(EDWARDS_D+1)*(EDWARDS_D+1) ); diff --git a/src/p448/field.h b/src/p448/field.h deleted file mode 100644 index bf36e95..0000000 --- a/src/p448/field.h +++ /dev/null @@ -1,123 +0,0 @@ -/** - * @file field.h - * @brief Field switch code. - * @copyright - * Copyright (c) 2014 Cryptography Research, Inc. \n - * Released under the MIT License. See LICENSE.txt for license information. - * @author Mike Hamburg - */ -#ifndef __FIELD_H__ -#define __FIELD_H__ - -#include -#include "constant_time.h" - -#include "p448.h" -#define FIELD_BITS 448 -#define field_t p448_t -#define field_mul p448_mul -#define field_sqr p448_sqr -#define field_add p448_add -#define field_sub p448_sub -#define field_mulw p448_mulw -#define field_addw p448_addw -#define field_subw p448_subw -#define field_neg p448_neg -#define field_set_ui p448_set_ui -#define field_bias p448_bias -#define field_cond_neg p448_cond_neg -#define field_inverse p448_inverse -#define field_eq p448_eq -#define field_isr p448_isr -#define field_simultaneous_invert p448_simultaneous_invert -#define field_weak_reduce p448_weak_reduce -#define field_strong_reduce p448_strong_reduce -#define field_serialize p448_serialize -#define field_deserialize p448_deserialize -#define field_is_zero p448_is_zero - -/** @brief Bytes in a field element */ -#define FIELD_BYTES (1+(FIELD_BITS-1)/8) - -/** @brief Words in a field element */ -#define FIELD_WORDS (1+(FIELD_BITS-1)/sizeof(word_t)) - -/** - * @brief For GMP tests: little-endian representation of the field modulus. - */ -extern const uint8_t FIELD_MODULUS[FIELD_BYTES]; - -/** - * Copy one field element to another. - */ -static inline void -__attribute__((unused,always_inline)) -field_copy ( - struct field_t *__restrict__ a, - const struct field_t *__restrict__ b -) { - memcpy(a,b,sizeof(*a)); -} - -/** - * Negate a in place if doNegate. - */ -static inline void -__attribute__((unused,always_inline)) -field_cond_neg( - field_t *a, - mask_t doNegate -) { - struct field_t negated; - field_neg(&negated, a); - field_bias(&negated, 2); - constant_time_select(a, &negated, a, sizeof(negated), doNegate); -} - -/** - * Returns 1/sqrt(+- x). - * - * The Legendre symbol of the result is the same as that of the - * input. - * - * If x=0, returns 0. - */ -void -field_isr ( - struct field_t* a, - const struct field_t* x -); - -/** - * Batch inverts out[i] = 1/in[i] - * - * If any input is zero, all the outputs will be zero. - */ -void -field_simultaneous_invert ( - struct field_t *__restrict__ out, - const struct field_t *in, - unsigned int n -); - -/** - * Returns 1/x. - * - * If x=0, returns 0. 
- */ -void -field_inverse ( - struct field_t* a, - const struct field_t* x -); - -/** - * Returns -1 if a==b, 0 otherwise. - */ -mask_t -field_eq ( - const struct field_t *a, - const struct field_t *b -); - -#endif /* __FIELD_H__ */ From 942066a16dcb76ae01c5d1187f033022c5e28ae8 Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Thu, 22 Jan 2015 14:37:45 -0800 Subject: [PATCH 02/15] remove probably-unnecessary optimizations (still needs benching to make sure) --- src/arithmetic.c | 5 +- src/ec_point.c | 398 +++++++++---------------- src/include/field.h | 123 +++++++- src/p448/arch_32/p448.h | 12 +- src/p448/arch_arm_32/p448.h | 12 +- src/p448/arch_neon/p448.h | 12 +- src/p448/arch_neon_experimental/p448.h | 12 +- src/p448/arch_ref64/p448.h | 12 +- src/p448/arch_x86_64/p448.h | 12 +- src/p448/f_field.h | 11 +- src/p480/arch_x86_64/p480.h | 12 +- src/p480/f_field.h | 11 +- src/p521/arch_ref64/p521.h | 12 +- src/p521/arch_x86_64_r12/p521.h | 12 +- src/p521/f_field.h | 9 +- test/bench.c | 5 - test/test_arithmetic.c | 10 +- 17 files changed, 321 insertions(+), 359 deletions(-) diff --git a/src/arithmetic.c b/src/arithmetic.c index add3b49..4530aa3 100644 --- a/src/arithmetic.c +++ b/src/arithmetic.c @@ -21,7 +21,7 @@ field_eq ( field_copy(&rb, b); field_weak_reduce(&ra); field_weak_reduce(&rb); - field_sub(&ra, &ra, &rb); + field_sub_RAW(&ra, &ra, &rb); field_bias(&ra, 2); return field_is_zero(&ra); } @@ -47,8 +47,7 @@ field_is_square ( field_isr ( &L0, x ); field_sqr ( &L1, &L0 ); field_mul ( &L0, x, &L1 ); - field_subw ( &L0, 1 ); - field_bias ( &L0, 1 ); + field_subw( &L0, 1 ); L3 = field_is_zero( &L0 ); L2 = field_is_zero( x ); return L3 | L2; diff --git a/src/ec_point.c b/src/ec_point.c index a625641..a486df1 100644 --- a/src/ec_point.c +++ b/src/ec_point.c @@ -12,84 +12,23 @@ #include "ec_point.h" #include "magic.h" -#define is32 (GOLDI_BITS == 32 || FIELD_BITS != 448) -/* TODO XXX PERF FIXME: better detection of overflow conditions */ - -/* I wanted to just use if (is32) - * But clang's -Wunreachable-code flags it. - * I wanted to keep that warning on. 
- */ -#if (is32) -#define IF32(s) (s) -#else -#define IF32(s) -#endif - -/* Multiply by signed curve constant */ -static __inline__ void -field_mulw_scc ( - struct field_t* __restrict__ out, - const struct field_t *a, - int64_t scc -) { - if (scc >= 0) { - field_mulw(out, a, scc); - } else { - field_mulw(out, a, -scc); - field_neg(out,out); - field_bias(out,2); - } -} - -/* Multiply by signed curve constant and weak reduce if biased */ -static __inline__ void -field_mulw_scc_wr ( - struct field_t* __restrict__ out, - const struct field_t *a, - int64_t scc -) { - field_mulw_scc(out, a, scc); - if (scc < 0) - field_weak_reduce(out); -} - -static __inline__ void -field_subx ( - struct field_t *d, - const struct field_t *a, - const struct field_t *b -) { - field_sub ( d, a, b ); - field_bias( d, 2 ); - IF32( field_weak_reduce ( d ) ); -} - -static __inline__ void -field_negx ( - struct field_t *d, - const struct field_t *a -) { - field_neg ( d, a ); - field_bias( d, 2 ); - IF32( field_weak_reduce ( d ) ); -} - void add_tw_niels_to_tw_extensible ( struct tw_extensible_t* d, const struct tw_niels_t* e ) { + ANALYZE_THIS_ROUTINE_CAREFULLY; struct field_t L0, L1; - field_subx ( &L1, &d->y, &d->x ); + field_sub ( &L1, &d->y, &d->x ); field_mul ( &L0, &e->a, &L1 ); - field_add ( &L1, &d->x, &d->y ); + field_add_nr ( &L1, &d->x, &d->y ); field_mul ( &d->y, &e->b, &L1 ); field_mul ( &L1, &d->u, &d->t ); field_mul ( &d->x, &e->c, &L1 ); - field_add ( &d->u, &L0, &d->y ); - field_subx ( &d->t, &d->y, &L0 ); - field_subx ( &d->y, &d->z, &d->x ); - field_add ( &L0, &d->x, &d->z ); + field_add_nr ( &d->u, &L0, &d->y ); + field_subx_nr ( &d->t, &d->y, &L0 ); + field_subx_nr ( &d->y, &d->z, &d->x ); + field_add_nr ( &L0, &d->x, &d->z ); field_mul ( &d->z, &L0, &d->y ); field_mul ( &d->x, &d->y, &d->t ); field_mul ( &d->y, &L0, &d->u ); @@ -100,17 +39,18 @@ sub_tw_niels_from_tw_extensible ( struct tw_extensible_t* d, const struct tw_niels_t* e ) { + ANALYZE_THIS_ROUTINE_CAREFULLY; struct field_t L0, L1; - field_subx ( &L1, &d->y, &d->x ); + field_subx_nr ( &L1, &d->y, &d->x ); field_mul ( &L0, &e->b, &L1 ); - field_add ( &L1, &d->x, &d->y ); + field_add_nr ( &L1, &d->x, &d->y ); field_mul ( &d->y, &e->a, &L1 ); field_mul ( &L1, &d->u, &d->t ); field_mul ( &d->x, &e->c, &L1 ); - field_add ( &d->u, &L0, &d->y ); - field_subx ( &d->t, &d->y, &L0 ); - field_add ( &d->y, &d->x, &d->z ); - field_subx ( &L0, &d->z, &d->x ); + field_add_nr ( &d->u, &L0, &d->y ); + field_subx_nr ( &d->t, &d->y, &L0 ); + field_add_nr ( &d->y, &d->x, &d->z ); + field_subx_nr ( &L0, &d->z, &d->x ); field_mul ( &d->z, &L0, &d->y ); field_mul ( &d->x, &d->y, &d->t ); field_mul ( &d->y, &L0, &d->u ); @@ -142,20 +82,21 @@ void double_tw_extensible ( struct tw_extensible_t* a ) { + ANALYZE_THIS_ROUTINE_CAREFULLY; struct field_t L0, L1, L2; field_sqr ( &L2, &a->x ); field_sqr ( &L0, &a->y ); - field_add ( &a->u, &L2, &L0 ); - field_add ( &a->t, &a->y, &a->x ); + field_add_nr ( &a->u, &L2, &L0 ); + field_add_nr ( &a->t, &a->y, &a->x ); field_sqr ( &L1, &a->t ); - field_sub ( &a->t, &L1, &a->u ); + field_sub_nr ( &a->t, &L1, &a->u ); field_bias ( &a->t, 3 ); IF32( field_weak_reduce( &a->t ) ); - field_subx ( &L1, &L0, &L2 ); + field_subx_nr ( &L1, &L0, &L2 ); field_sqr ( &a->x, &a->z ); field_bias ( &a->x, 2-is32 /*is32 ? 
1 : 2*/ ); - field_add ( &a->z, &a->x, &a->x ); - field_sub ( &L0, &a->z, &L1 ); + field_add_nr ( &a->z, &a->x, &a->x ); + field_sub_nr ( &L0, &a->z, &L1 ); IF32( field_weak_reduce( &L0 ) ); field_mul ( &a->z, &L1, &L0 ); field_mul ( &a->x, &L0, &a->t ); @@ -166,20 +107,21 @@ void double_extensible ( struct extensible_t* a ) { + ANALYZE_THIS_ROUTINE_CAREFULLY; struct field_t L0, L1, L2; field_sqr ( &L2, &a->x ); field_sqr ( &L0, &a->y ); - field_add ( &L1, &L2, &L0 ); - field_add ( &a->t, &a->y, &a->x ); + field_add_nr ( &L1, &L2, &L0 ); + field_add_nr ( &a->t, &a->y, &a->x ); field_sqr ( &a->u, &a->t ); - field_sub ( &a->t, &a->u, &L1 ); + field_sub_nr ( &a->t, &a->u, &L1 ); field_bias ( &a->t, 3 ); IF32( field_weak_reduce( &a->t ) ); - field_subx ( &a->u, &L0, &L2 ); + field_subx_nr ( &a->u, &L0, &L2 ); field_sqr ( &a->x, &a->z ); field_bias ( &a->x, 2 ); - field_add ( &a->z, &a->x, &a->x ); - field_sub ( &L0, &a->z, &L1 ); + field_add_nr ( &a->z, &a->x, &a->x ); + field_sub_nr ( &L0, &a->z, &L1 ); IF32( field_weak_reduce( &L0 ) ); field_mul ( &a->z, &L1, &L0 ); field_mul ( &a->x, &L0, &a->t ); @@ -194,18 +136,14 @@ twist_and_double ( struct field_t L0; field_sqr ( &b->x, &a->x ); field_sqr ( &b->z, &a->y ); - field_add ( &b->u, &b->x, &b->z ); - field_add ( &b->t, &a->y, &a->x ); + field_add ( &b->u, &b->x, &b->z ); + field_add ( &b->t, &a->y, &a->x ); field_sqr ( &L0, &b->t ); - field_sub ( &b->t, &L0, &b->u ); - field_bias ( &b->t, 3 ); - IF32( field_weak_reduce( &b->t ) ); - field_subx ( &L0, &b->z, &b->x ); + field_sub ( &b->t, &L0, &b->u ); + field_sub ( &L0, &b->z, &b->x ); field_sqr ( &b->x, &a->z ); - field_bias ( &b->x, 2 ); - field_add ( &b->z, &b->x, &b->x ); - field_sub ( &b->y, &b->z, &b->u ); - IF32( field_weak_reduce( &b->y ) ); + field_add ( &b->z, &b->x, &b->x ); + field_sub ( &b->y, &b->z, &b->u ); field_mul ( &b->z, &L0, &b->y ); field_mul ( &b->x, &b->y, &b->t ); field_mul ( &b->y, &L0, &b->u ); @@ -219,18 +157,14 @@ untwist_and_double ( struct field_t L0; field_sqr ( &b->x, &a->x ); field_sqr ( &b->z, &a->y ); - field_add ( &L0, &b->x, &b->z ); - field_add ( &b->t, &a->y, &a->x ); + field_add ( &L0, &b->x, &b->z ); + field_add ( &b->t, &a->y, &a->x ); field_sqr ( &b->u, &b->t ); - field_sub ( &b->t, &b->u, &L0 ); - field_bias ( &b->t, 3 ); - IF32( field_weak_reduce( &b->t ) ); - field_subx ( &b->u, &b->z, &b->x ); + field_sub ( &b->t, &b->u, &L0 ); + field_sub ( &b->u, &b->z, &b->x ); field_sqr ( &b->x, &a->z ); - field_bias ( &b->x, 2-is32 /*is32 ? 
1 : 2*/ ); - field_add ( &b->z, &b->x, &b->x ); - field_sub ( &b->y, &b->z, &b->u ); - IF32( field_weak_reduce( &b->y ) ); + field_add ( &b->z, &b->x, &b->x ); + field_sub ( &b->y, &b->z, &b->u ); field_mul ( &b->z, &L0, &b->y ); field_mul ( &b->x, &b->y, &b->t ); field_mul ( &b->y, &L0, &b->u ); @@ -241,11 +175,8 @@ convert_tw_affine_to_tw_pniels ( struct tw_pniels_t* b, const struct tw_affine_t* a ) { - field_sub ( &b->n.a, &a->y, &a->x ); - field_bias ( &b->n.a, 2 ); - field_weak_reduce( &b->n.a ); - field_add ( &b->n.b, &a->x, &a->y ); - field_weak_reduce( &b->n.b ); + field_sub ( &b->n.a, &a->y, &a->x ); + field_add ( &b->n.b, &a->x, &a->y ); field_mul ( &b->z, &a->y, &a->x ); field_mulw_scc_wr ( &b->n.c, &b->z, 2*EDWARDS_D-2 ); field_set_ui( &b->z, 2 ); @@ -280,15 +211,11 @@ convert_tw_extensible_to_tw_pniels ( struct tw_pniels_t* b, const struct tw_extensible_t* a ) { - field_sub ( &b->n.a, &a->y, &a->x ); - field_bias ( &b->n.a, 2 ); - field_weak_reduce( &b->n.a ); - field_add ( &b->n.b, &a->x, &a->y ); - field_weak_reduce( &b->n.b ); + field_sub ( &b->n.a, &a->y, &a->x ); + field_add ( &b->n.b, &a->x, &a->y ); field_mul ( &b->z, &a->u, &a->t ); field_mulw_scc_wr ( &b->n.c, &b->z, 2*EDWARDS_D-2 ); - field_add ( &b->z, &a->z, &a->z ); - field_weak_reduce( &b->z ); + field_add ( &b->z, &a->z, &a->z ); } void @@ -296,8 +223,8 @@ convert_tw_pniels_to_tw_extensible ( struct tw_extensible_t* e, const struct tw_pniels_t* d ) { - field_add ( &e->u, &d->n.b, &d->n.a ); - field_subx ( &e->t, &d->n.b, &d->n.a ); + field_add ( &e->u, &d->n.b, &d->n.a ); + field_sub ( &e->t, &d->n.b, &d->n.a ); field_mul ( &e->x, &d->z, &e->t ); field_mul ( &e->y, &d->z, &e->u ); field_sqr ( &e->z, &d->z ); @@ -308,11 +235,8 @@ convert_tw_niels_to_tw_extensible ( struct tw_extensible_t* e, const struct tw_niels_t* d ) { - field_add ( &e->y, &d->b, &d->a ); - field_weak_reduce( &e->y ); - field_sub ( &e->x, &d->b, &d->a ); - field_bias ( &e->x, 2 ); - field_weak_reduce( &e->x ); + field_add ( &e->y, &d->b, &d->a ); + field_sub ( &e->x, &d->b, &d->a ); field_set_ui( &e->z, 1 ); field_copy ( &e->t, &e->x ); field_copy ( &e->u, &e->y ); @@ -322,24 +246,25 @@ void montgomery_step ( struct montgomery_t* a ) { + ANALYZE_THIS_ROUTINE_CAREFULLY; struct field_t L0, L1; - field_add ( &L0, &a->zd, &a->xd ); - field_subx ( &L1, &a->xd, &a->zd ); - field_subx ( &a->zd, &a->xa, &a->za ); + field_add_nr ( &L0, &a->zd, &a->xd ); + field_sub ( &L1, &a->xd, &a->zd ); + field_sub ( &a->zd, &a->xa, &a->za ); field_mul ( &a->xd, &L0, &a->zd ); - field_add ( &a->zd, &a->za, &a->xa ); + field_add_nr ( &a->zd, &a->za, &a->xa ); field_mul ( &a->za, &L1, &a->zd ); - field_add ( &a->xa, &a->za, &a->xd ); + field_add_nr ( &a->xa, &a->za, &a->xd ); field_sqr ( &a->zd, &a->xa ); field_mul ( &a->xa, &a->z0, &a->zd ); - field_subx ( &a->zd, &a->xd, &a->za ); + field_sub ( &a->zd, &a->xd, &a->za ); field_sqr ( &a->za, &a->zd ); field_sqr ( &a->xd, &L0 ); field_sqr ( &L0, &L1 ); field_mulw_scc ( &a->zd, &a->xd, 1-EDWARDS_D ); /* FIXME PERF MULW */ - field_subx ( &L1, &a->xd, &L0 ); + field_sub ( &L1, &a->xd, &L0 ); field_mul ( &a->xd, &L0, &a->zd ); - field_sub ( &L0, &a->zd, &L1 ); + field_sub_nr ( &L0, &a->zd, &L1 ); field_bias ( &L0, 4 - 2*is32 /*is32 ? 
2 : 4*/ ); IF32( field_weak_reduce( &L0 ) ); field_mul ( &a->zd, &L0, &L1 ); @@ -366,27 +291,26 @@ serialize_montgomery ( mask_t L4, L5, L6; struct field_t L0, L1, L2, L3; field_mul ( &L3, &a->z0, &a->zd ); - field_subx ( &L1, &L3, &a->xd ); + field_sub ( &L1, &L3, &a->xd ); field_mul ( &L3, &a->za, &L1 ); field_mul ( &L2, &a->z0, &a->xd ); - field_subx ( &L1, &L2, &a->zd ); + field_sub ( &L1, &L2, &a->zd ); field_mul ( &L0, &a->xa, &L1 ); - field_add ( &L2, &L0, &L3 ); - field_subx ( &L1, &L3, &L0 ); + field_add ( &L2, &L0, &L3 ); + field_sub ( &L1, &L3, &L0 ); field_mul ( &L3, &L1, &L2 ); field_copy ( &L2, &a->z0 ); field_addw ( &L2, 1 ); field_sqr ( &L0, &L2 ); field_mulw_scc_wr ( &L1, &L0, EDWARDS_D-1 ); - field_add ( &L2, &a->z0, &a->z0 ); - field_add ( &L0, &L2, &L2 ); - field_add ( &L2, &L0, &L1 ); - IF32( field_weak_reduce( &L2 ) ); + field_add ( &L2, &a->z0, &a->z0 ); + field_add ( &L0, &L2, &L2 ); + field_add ( &L2, &L0, &L1 ); field_mul ( &L0, &a->xd, &L2 ); L5 = field_is_zero( &a->zd ); L6 = - L5; constant_time_mask ( &L1, &L0, sizeof(L1), L5 ); - field_add ( &L2, &L1, &a->zd ); + field_add ( &L2, &L1, &a->zd ); L4 = ~ L5; field_mul ( &L1, sbz, &L3 ); field_addw ( &L1, L6 ); @@ -399,8 +323,7 @@ serialize_montgomery ( field_sqr ( &L1, &L0 ); field_mul ( &L0, &L3, &L1 ); constant_time_mask ( b, &L2, sizeof(L1), L4 ); - field_subw ( &L0, 1 ); - field_bias ( &L0, 1 ); + field_subw( &L0, 1 ); L5 = field_is_zero( &L0 ); L4 = field_is_zero( sbz ); return L5 | L4; @@ -412,8 +335,8 @@ serialize_extensible ( const struct extensible_t* a ) { struct field_t L0, L1, L2; - field_subx ( &L0, &a->y, &a->z ); - field_add ( b, &a->z, &a->y ); + field_sub ( &L0, &a->y, &a->z ); + field_add ( b, &a->z, &a->y ); field_mul ( &L1, &a->z, &a->x ); field_mul ( &L2, &L0, &L1 ); field_mul ( &L1, &L2, &L0 ); @@ -432,15 +355,13 @@ untwist_and_double_and_serialize ( ) { struct field_t L0, L1, L2, L3; field_mul ( &L3, &a->y, &a->x ); - field_add ( b, &a->y, &a->x ); + field_add ( b, &a->y, &a->x ); field_sqr ( &L1, b ); - field_add ( &L2, &L3, &L3 ); - field_sub ( b, &L1, &L2 ); - field_bias ( b, 3 ); - IF32( field_weak_reduce( b ) ); + field_add ( &L2, &L3, &L3 ); + field_sub ( b, &L1, &L2 ); field_sqr ( &L2, &a->z ); field_sqr ( &L1, &L2 ); - field_add ( b, b, b ); + field_add ( b, b, b ); field_mulw_scc ( &L2, b, EDWARDS_D-1 ); field_mulw_scc ( b, &L2, EDWARDS_D-1 ); field_mul ( &L0, &L2, &L1 ); @@ -457,13 +378,12 @@ twist_even ( struct tw_extensible_t* b, const struct extensible_t* a ) { - mask_t L0, L1; field_sqr ( &b->y, &a->z ); field_sqr ( &b->z, &a->x ); - field_subx ( &b->u, &b->y, &b->z ); - field_subx ( &b->z, &a->z, &a->x ); + field_sub ( &b->u, &b->y, &b->z ); + field_sub ( &b->z, &a->z, &a->x ); field_mul ( &b->y, &b->z, &a->y ); - field_subx ( &b->z, &a->z, &a->y ); + field_sub ( &b->z, &a->z, &a->y ); field_mul ( &b->x, &b->z, &b->y ); field_mul ( &b->t, &b->x, &b->u ); field_mul ( &b->y, &b->x, &b->t ); @@ -473,10 +393,7 @@ twist_even ( field_mul ( &b->t, &b->y, &b->x ); field_mul ( &b->x, &a->x, &b->u ); field_mul ( &b->y, &a->y, &b->u ); - L1 = field_is_zero( &b->z ); - L0 = - L1; - field_addw ( &b->y, L0 ); - field_weak_reduce( &b->y ); + field_addw ( &b->y, -field_is_zero( &b->z ) ); field_set_ui( &b->z, 1 ); field_copy ( &b->t, &b->x ); field_copy ( &b->u, &b->y ); @@ -487,18 +404,15 @@ test_only_twist ( struct tw_extensible_t* b, const struct extensible_t* a ) { - mask_t L2, L3; struct field_t L0, L1; field_sqr ( &b->u, &a->z ); field_sqr ( &b->y, &a->x ); - field_sub ( &b->z, &b->u, 
&b->y ); - field_bias ( &b->z, 2 ); - field_add ( &b->y, &b->z, &b->z ); - field_add ( &b->u, &b->y, &b->y ); - IF32( field_weak_reduce( &b->u ) ); - field_subx ( &b->y, &a->z, &a->x ); + field_sub ( &b->z, &b->u, &b->y ); + field_add ( &b->y, &b->z, &b->z ); + field_add ( &b->u, &b->y, &b->y ); + field_sub ( &b->y, &a->z, &a->x ); field_mul ( &b->x, &b->y, &a->y ); - field_subx ( &b->z, &a->z, &a->y ); + field_sub ( &b->z, &a->z, &a->y ); field_mul ( &b->t, &b->z, &b->x ); field_mul ( &L1, &b->t, &b->u ); field_mul ( &b->x, &b->t, &L1 ); @@ -506,25 +420,16 @@ test_only_twist ( field_mul ( &b->u, &b->t, &L0 ); field_sqr ( &L1, &L0 ); field_mul ( &b->t, &b->x, &L1 ); - field_add ( &L1, &a->y, &a->x ); - IF32( field_weak_reduce( &L1 ) ); - field_subx ( &L0, &a->x, &a->y ); + field_add ( &L1, &a->y, &a->x ); + field_sub ( &L0, &a->x, &a->y ); field_mul ( &b->x, &b->t, &L0 ); - field_add ( &L0, &b->x, &L1 ); - field_subx ( &b->t, &L1, &b->x ); + field_add ( &L0, &b->x, &L1 ); + field_sub ( &b->t, &L1, &b->x ); field_mul ( &b->x, &L0, &b->u ); - L2 = field_is_zero( &b->y ); - L3 = - L2; - field_addw ( &b->x, L3 ); - field_weak_reduce( &b->x ); + field_addw ( &b->x, -field_is_zero( &b->y ) ); field_mul ( &b->y, &b->t, &b->u ); - L2 = field_is_zero( &b->z ); - L3 = - L2; - field_addw ( &b->y, L3 ); - field_weak_reduce( &b->y ); - L3 = field_is_zero( &a->y ); - L2 = L3 + 1; - field_set_ui( &b->z, L2 ); + field_addw ( &b->y, -field_is_zero( &b->z ) ); + field_set_ui( &b->z, 1+field_is_zero( &a->y ) ); field_copy ( &b->t, &b->x ); field_copy ( &b->u, &b->y ); } @@ -536,7 +441,7 @@ is_even_pt ( struct field_t L0, L1, L2; field_sqr ( &L2, &a->z ); field_sqr ( &L1, &a->x ); - field_subx ( &L0, &L2, &L1 ); + field_sub ( &L0, &L2, &L1 ); return field_is_square ( &L0 ); } @@ -547,8 +452,7 @@ is_even_tw ( struct field_t L0, L1, L2; field_sqr ( &L2, &a->z ); field_sqr ( &L1, &a->x ); - field_add ( &L0, &L1, &L2 ); - field_weak_reduce( &L0 ); + field_add ( &L0, &L1, &L2 ); return field_is_square ( &L0 ); } @@ -563,13 +467,12 @@ deserialize_affine ( field_addw ( &L3, 1 ); field_sqr ( &L2, &L3 ); field_mulw_scc ( &a->x, &L2, EDWARDS_D-1 ); /* PERF MULW */ - field_add ( &L3, &L1, &L1 ); /* FIXME: i adjusted the bias here, was it right? */ - field_add ( &a->y, &L3, &L3 ); - field_add ( &L3, &a->y, &a->x ); - IF32( field_weak_reduce( &L3 ) ); + field_add ( &L3, &L1, &L1 ); /* FIXME: i adjusted the bias here, was it right? 
*/ + field_add ( &a->y, &L3, &L3 ); + field_add ( &L3, &a->y, &a->x ); field_copy ( &a->y, &L1 ); - field_subw ( &a->y, 1 ); field_negx ( &a->x, &a->y ); + field_addw ( &a->x, 1 ); field_mul ( &a->y, &a->x, &L3 ); field_sqr ( &L2, &a->x ); field_mul ( &L0, &L2, &a->y ); @@ -579,12 +482,11 @@ deserialize_affine ( field_sqr ( &L2, &L3 ); field_mul ( &L3, &L0, &L2 ); field_mul ( &L0, &a->x, &L3 ); - field_add ( &L2, &a->y, &a->y ); + field_add ( &L2, &a->y, &a->y ); field_mul ( &a->x, sz, &L2 ); field_addw ( &L1, 1 ); field_mul ( &a->y, &L1, &L3 ); - field_subw ( &L0, 1 ); - field_bias ( &L0, 1 ); + field_subw( &L0, 1 ); return field_is_zero( &L0 ); } @@ -600,13 +502,12 @@ deserialize_and_twist_approx ( field_addw ( &a->y, 1 ); field_sqr ( &L0, &a->y ); field_mulw_scc ( &a->x, &L0, EDWARDS_D-1 ); - field_add ( &a->y, &a->z, &a->z ); - field_add ( &a->u, &a->y, &a->y ); - field_add ( &a->y, &a->u, &a->x ); - IF32( field_weak_reduce( &a->y ) ); + field_add ( &a->y, &a->z, &a->z ); + field_add ( &a->u, &a->y, &a->y ); + field_add ( &a->y, &a->u, &a->x ); field_sqr ( &a->x, &a->z ); - field_subw ( &a->x, 1 ); field_negx ( &a->u, &a->x ); + field_addw ( &a->u, 1 ); field_mul ( &a->x, sdm1, &a->u ); field_mul ( &L0, &a->x, &a->y ); field_mul ( &a->t, &L0, &a->y ); @@ -618,17 +519,16 @@ deserialize_and_twist_approx ( field_sqr ( &L1, &L0 ); field_mul ( &a->u, &a->t, &L1 ); field_mul ( &a->t, &a->x, &a->u ); - field_add ( &a->x, sz, sz ); + field_add ( &a->x, sz, sz ); field_mul ( &L0, &a->u, &a->x ); field_copy ( &a->x, &a->z ); - field_subw ( &a->x, 1 ); field_negx ( &L1, &a->x ); + field_addw ( &L1, 1 ); field_mul ( &a->x, &L1, &L0 ); field_mul ( &L0, &a->u, &a->y ); field_addw ( &a->z, 1 ); field_mul ( &a->y, &a->z, &L0 ); - field_subw ( &a->t, 1 ); - field_bias ( &a->t, 1 ); + field_subw( &a->t, 1 ); mask_t ret = field_is_zero( &a->t ); field_set_ui( &a->z, 1 ); field_copy ( &a->t, &a->x ); @@ -673,11 +573,9 @@ eq_affine ( ) { mask_t L1, L2; struct field_t L0; - field_sub ( &L0, &a->x, &b->x ); - field_bias ( &L0, 2 ); + field_sub ( &L0, &a->x, &b->x ); L2 = field_is_zero( &L0 ); - field_sub ( &L0, &a->y, &b->y ); - field_bias ( &L0, 2 ); + field_sub ( &L0, &a->y, &b->y ); L1 = field_is_zero( &L0 ); return L2 & L1; } @@ -691,13 +589,11 @@ eq_extensible ( struct field_t L0, L1, L2; field_mul ( &L2, &b->z, &a->x ); field_mul ( &L1, &a->z, &b->x ); - field_sub ( &L0, &L2, &L1 ); - field_bias ( &L0, 2 ); + field_sub ( &L0, &L2, &L1 ); L4 = field_is_zero( &L0 ); field_mul ( &L2, &b->z, &a->y ); field_mul ( &L1, &a->z, &b->y ); - field_sub ( &L0, &L2, &L1 ); - field_bias ( &L0, 2 ); + field_sub ( &L0, &L2, &L1 ); L3 = field_is_zero( &L0 ); return L4 & L3; } @@ -711,13 +607,11 @@ eq_tw_extensible ( struct field_t L0, L1, L2; field_mul ( &L2, &b->z, &a->x ); field_mul ( &L1, &a->z, &b->x ); - field_sub ( &L0, &L2, &L1 ); - field_bias ( &L0, 2 ); + field_sub ( &L0, &L2, &L1 ); L4 = field_is_zero( &L0 ); field_mul ( &L2, &b->z, &a->y ); field_mul ( &L1, &a->z, &b->y ); - field_sub ( &L0, &L2, &L1 ); - field_bias ( &L0, 2 ); + field_sub ( &L0, &L2, &L1 ); L3 = field_is_zero( &L0 ); return L4 & L3; } @@ -727,22 +621,18 @@ elligator_2s_inject ( struct affine_t* a, const struct field_t* r ) { - mask_t L0, L1; struct field_t L2, L3, L4, L5, L6, L7, L8; field_sqr ( &a->x, r ); field_sqr ( &L3, &a->x ); field_copy ( &a->y, &L3 ); - field_subw ( &a->y, 1 ); field_negx ( &L4, &a->y ); + field_addw ( &L4, 1 ); field_sqr ( &L2, &L4 ); field_mulw ( &L7, &L2, (EDWARDS_D-1)*(EDWARDS_D-1) ); field_mulw ( &L8, &L3, 
4*(EDWARDS_D+1)*(EDWARDS_D+1) ); - field_add ( &a->y, &L8, &L7 ); - IF32( field_weak_reduce( &a->y ) ); + field_add ( &a->y, &L8, &L7 ); field_mulw ( &L8, &L2, 4*(EDWARDS_D)*(EDWARDS_D-1) ); - field_sub ( &L7, &a->y, &L8 ); - field_bias ( &L7, 2 ); - IF32( field_weak_reduce( &L7 ) ); + field_sub ( &L7, &a->y, &L8 ); field_mulw_scc ( &L6, &a->y, -2-2*EDWARDS_D ); field_mul ( &L5, &L7, &L6 ); /* FIXME Stability problem (API stability, not crash) / possible bug. @@ -769,27 +659,20 @@ elligator_2s_inject ( field_mul ( &L8, &L7, &L6 ); field_mul ( &L7, &L8, &L6 ); field_copy ( &L6, &a->x ); - field_subw ( &L6, 1 ); field_addw ( &a->x, 1 ); field_mul ( &L5, &a->x, &L8 ); - field_sub ( &a->x, &L6, &L5 ); - field_bias ( &a->x, 3 ); - IF32( field_weak_reduce( &a->x ) ); + field_addw ( &L5, 1 ); + field_sub ( &a->x, &L6, &L5 ); field_mul ( &L5, &L4, &a->x ); field_mulw_scc_wr ( &a->x, &L5, -2-2*EDWARDS_D ); - field_add ( &L4, &L3, &L3 ); - field_add ( &L3, &L4, &L2 ); - field_subw ( &L3, 2 ); - field_bias ( &L3, 1 ); - IF32( field_weak_reduce( &L3 ) ); + field_add ( &L4, &L3, &L3 ); + field_add ( &L3, &L4, &L2 ); + field_subw( &L3, 2 ); field_mul ( &L2, &L3, &L8 ); field_mulw ( &L3, &L2, 2*(EDWARDS_D+1)*(EDWARDS_D-1) ); - field_add ( &L2, &L3, &a->y ); + field_add ( &L2, &L3, &a->y ); field_mul ( &a->y, &L7, &L2 ); - L1 = field_is_zero( &L8 ); - L0 = - L1; - field_addw ( &a->y, L0 ); - field_weak_reduce( &a->y ); + field_addw ( &a->y, -field_is_zero( &L8 ) ); } mask_t @@ -799,12 +682,11 @@ validate_affine ( struct field_t L0, L1, L2, L3; field_sqr ( &L0, &a->y ); field_sqr ( &L1, &a->x ); - field_add ( &L3, &L1, &L0 ); - field_subw ( &L3, 1 ); + field_add ( &L3, &L1, &L0 ); field_mulw_scc ( &L2, &L1, EDWARDS_D ); field_mul ( &L1, &L0, &L2 ); - field_sub ( &L0, &L3, &L1 ); - field_bias ( &L0, 3 ); + field_addw ( &L1, 1 ); + field_sub ( &L0, &L3, &L1 ); return field_is_zero( &L0 ); } @@ -821,28 +703,26 @@ validate_tw_extensible ( field_mul ( &L1, &ext->t, &ext->u ); field_mul ( &L2, &ext->z, &L1 ); field_mul ( &L0, &ext->x, &ext->y ); - field_neg ( &L1, &L0 ); - field_add ( &L0, &L1, &L2 ); - field_bias ( &L0, 2 ); + field_negx ( &L1, &L0 ); + field_add ( &L0, &L1, &L2 ); L5 = field_is_zero( &L0 ); /* * Check invariant: * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 */ field_sqr ( &L2, &ext->y ); - field_neg ( &L1, &L2 ); + field_negx ( &L1, &L2 ); field_sqr ( &L0, &ext->x ); - field_add ( &L2, &L0, &L1 ); + field_add ( &L2, &L0, &L1 ); field_sqr ( &L3, &ext->u ); field_sqr ( &L0, &ext->t ); field_mul ( &L1, &L0, &L3 ); field_mulw_scc ( &L3, &L1, EDWARDS_D ); - field_add ( &L0, &L3, &L2 ); - field_neg ( &L3, &L1 ); - field_add ( &L2, &L3, &L0 ); + field_add ( &L0, &L3, &L2 ); + field_negx ( &L3, &L1 ); + field_add ( &L2, &L3, &L0 ); field_sqr ( &L1, &ext->z ); - field_add ( &L0, &L1, &L2 ); - field_bias ( &L0, 2 ); + field_add ( &L0, &L1, &L2 ); L4 = field_is_zero( &L0 ); return L5 & L4 &~ field_is_zero(&ext->z); } @@ -858,18 +738,17 @@ validate_extensible ( * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 */ field_sqr ( &L2, &ext->y ); - field_neg ( &L1, &L2 ); + field_negx ( &L1, &L2 ); field_sqr ( &L0, &ext->z ); - field_add ( &L2, &L0, &L1 ); + field_add ( &L2, &L0, &L1 ); field_sqr ( &L3, &ext->u ); field_sqr ( &L0, &ext->t ); field_mul ( &L1, &L0, &L3 ); field_mulw_scc ( &L0, &L1, EDWARDS_D ); - field_add ( &L1, &L0, &L2 ); + field_add ( &L1, &L0, &L2 ); field_sqr ( &L0, &ext->x ); - field_neg ( &L2, &L0 ); - field_add ( &L0, &L2, &L1 ); - field_bias ( &L0, 2 ); + field_negx ( &L2, &L0 ); + field_add ( &L0, &L2, 
&L1 ); L5 = field_is_zero( &L0 ); /* * Check invariant: @@ -878,9 +757,8 @@ validate_extensible ( field_mul ( &L1, &ext->t, &ext->u ); field_mul ( &L2, &ext->z, &L1 ); field_mul ( &L0, &ext->x, &ext->y ); - field_neg ( &L1, &L0 ); - field_add ( &L0, &L1, &L2 ); - field_bias ( &L0, 2 ); + field_negx ( &L1, &L0 ); + field_add ( &L0, &L1, &L2 ); L4 = field_is_zero( &L0 ); return L5 & L4 &~ field_is_zero(&ext->z); } diff --git a/src/include/field.h b/src/include/field.h index 6a9b0e7..d375c09 100644 --- a/src/include/field.h +++ b/src/include/field.h @@ -14,6 +14,13 @@ #include "f_field.h" #include +#define is32 (GOLDI_BITS == 32 || FIELD_BITS != 448) +#if (is32) +#define IF32(s) (s) +#else +#define IF32(s) +#endif + /** @brief Bytes in a field element */ #define FIELD_BYTES (1+(FIELD_BITS-1)/8) @@ -53,21 +60,6 @@ field_copy ( memcpy(a,b,sizeof(*a)); } -/** - * Negate a in place if doNegate. - */ -static inline void -__attribute__((unused,always_inline)) -field_cond_neg( - field_t *a, - mask_t doNegate -) { - struct field_t negated; - field_neg(&negated, a); - field_bias(&negated, 2); - constant_time_select(a, &negated, a, sizeof(negated), doNegate); -} - /** * Returns 1/sqrt(+- x). * @@ -140,4 +132,105 @@ field_sqrn ( } } +/* Multiply by signed curve constant */ +static __inline__ void +field_mulw_scc ( + struct field_t* __restrict__ out, + const struct field_t *a, + int64_t scc +) { + if (scc >= 0) { + field_mulw(out, a, scc); + } else { + field_mulw(out, a, -scc); + field_neg_RAW(out,out); + field_bias(out,2); + } +} + +/* Multiply by signed curve constant and weak reduce if biased */ +static __inline__ void +field_mulw_scc_wr ( + struct field_t* __restrict__ out, + const struct field_t *a, + int64_t scc +) { + field_mulw_scc(out, a, scc); + if (scc < 0) + field_weak_reduce(out); +} + +static __inline__ void +field_subx_RAW ( + struct field_t *d, + const struct field_t *a, + const struct field_t *b +) { + field_sub_RAW ( d, a, b ); + field_bias( d, 2 ); + IF32( field_weak_reduce ( d ) ); +} + +static __inline__ void +field_sub ( + struct field_t *d, + const struct field_t *a, + const struct field_t *b +) { + field_sub_RAW ( d, a, b ); + field_bias( d, 2 ); + field_weak_reduce ( d ); +} + +static __inline__ void +field_add ( + struct field_t *d, + const struct field_t *a, + const struct field_t *b +) { + field_add_RAW ( d, a, b ); + field_weak_reduce ( d ); +} + +static __inline__ void +field_subw ( + struct field_t *d, + word_t c +) { + field_subw_RAW ( d, c ); + field_bias( d, 1 ); + field_weak_reduce ( d ); +} + +static __inline__ void +field_negx ( + struct field_t *d, + const struct field_t *a +) { + field_neg_RAW ( d, a ); + field_bias( d, 2 ); + field_weak_reduce ( d ); +} + +/** + * Negate a in place if doNegate. 
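+ * Runs in constant time: the negation is always computed into a
+ * temporary, and constant_time_select picks the original or negated
+ * value according to the mask.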
+ */ +static inline void +__attribute__((unused,always_inline)) +field_cond_neg ( + field_t *a, + mask_t doNegate +) { + struct field_t negated; + field_negx(&negated, a); + constant_time_select(a, &negated, a, sizeof(negated), doNegate); +} + +/** Require the warning annotation on raw routines */ +#define ANALYZE_THIS_ROUTINE_CAREFULLY const int ANNOTATE___ANALYZE_THIS_ROUTINE_CAREFULLY = 0; +#define MUST_BE_CAREFUL (void) ANNOTATE___ANALYZE_THIS_ROUTINE_CAREFULLY +#define field_add_nr(a,b,c) { MUST_BE_CAREFUL; field_add_RAW(a,b,c); } +#define field_sub_nr(a,b,c) { MUST_BE_CAREFUL; field_sub_RAW(a,b,c); } +#define field_subx_nr(a,b,c) { MUST_BE_CAREFUL; field_subx_RAW(a,b,c); } + #endif // __FIELD_H__ diff --git a/src/p448/arch_32/p448.h b/src/p448/arch_32/p448.h index cf90611..f0406cd 100644 --- a/src/p448/arch_32/p448.h +++ b/src/p448/arch_32/p448.h @@ -24,21 +24,21 @@ p448_set_ui ( ) __attribute__((unused,always_inline)); static __inline__ void -p448_add ( +p448_add_RAW ( p448_t *out, const p448_t *a, const p448_t *b ) __attribute__((unused,always_inline)); static __inline__ void -p448_sub ( +p448_sub_RAW ( p448_t *out, const p448_t *a, const p448_t *b ) __attribute__((unused,always_inline)); static __inline__ void -p448_neg ( +p448_neg_RAW ( p448_t *out, const p448_t *a ) __attribute__((unused,always_inline)); @@ -130,7 +130,7 @@ p448_set_ui ( } void -p448_add ( +p448_add_RAW ( p448_t *out, const p448_t *a, const p448_t *b @@ -148,7 +148,7 @@ p448_add ( } void -p448_sub ( +p448_sub_RAW ( p448_t *out, const p448_t *a, const p448_t *b @@ -166,7 +166,7 @@ p448_sub ( } void -p448_neg ( +p448_neg_RAW ( p448_t *out, const p448_t *a ) { diff --git a/src/p448/arch_arm_32/p448.h b/src/p448/arch_arm_32/p448.h index cf90611..f0406cd 100644 --- a/src/p448/arch_arm_32/p448.h +++ b/src/p448/arch_arm_32/p448.h @@ -24,21 +24,21 @@ p448_set_ui ( ) __attribute__((unused,always_inline)); static __inline__ void -p448_add ( +p448_add_RAW ( p448_t *out, const p448_t *a, const p448_t *b ) __attribute__((unused,always_inline)); static __inline__ void -p448_sub ( +p448_sub_RAW ( p448_t *out, const p448_t *a, const p448_t *b ) __attribute__((unused,always_inline)); static __inline__ void -p448_neg ( +p448_neg_RAW ( p448_t *out, const p448_t *a ) __attribute__((unused,always_inline)); @@ -130,7 +130,7 @@ p448_set_ui ( } void -p448_add ( +p448_add_RAW ( p448_t *out, const p448_t *a, const p448_t *b @@ -148,7 +148,7 @@ p448_add ( } void -p448_sub ( +p448_sub_RAW ( p448_t *out, const p448_t *a, const p448_t *b @@ -166,7 +166,7 @@ p448_sub ( } void -p448_neg ( +p448_neg_RAW ( p448_t *out, const p448_t *a ) { diff --git a/src/p448/arch_neon/p448.h b/src/p448/arch_neon/p448.h index cf90611..f0406cd 100644 --- a/src/p448/arch_neon/p448.h +++ b/src/p448/arch_neon/p448.h @@ -24,21 +24,21 @@ p448_set_ui ( ) __attribute__((unused,always_inline)); static __inline__ void -p448_add ( +p448_add_RAW ( p448_t *out, const p448_t *a, const p448_t *b ) __attribute__((unused,always_inline)); static __inline__ void -p448_sub ( +p448_sub_RAW ( p448_t *out, const p448_t *a, const p448_t *b ) __attribute__((unused,always_inline)); static __inline__ void -p448_neg ( +p448_neg_RAW ( p448_t *out, const p448_t *a ) __attribute__((unused,always_inline)); @@ -130,7 +130,7 @@ p448_set_ui ( } void -p448_add ( +p448_add_RAW ( p448_t *out, const p448_t *a, const p448_t *b @@ -148,7 +148,7 @@ p448_add ( } void -p448_sub ( +p448_sub_RAW ( p448_t *out, const p448_t *a, const p448_t *b @@ -166,7 +166,7 @@ p448_sub ( } void -p448_neg ( +p448_neg_RAW 
( p448_t *out, const p448_t *a ) { diff --git a/src/p448/arch_neon_experimental/p448.h b/src/p448/arch_neon_experimental/p448.h index 144d86c..f7d338a 100644 --- a/src/p448/arch_neon_experimental/p448.h +++ b/src/p448/arch_neon_experimental/p448.h @@ -27,21 +27,21 @@ p448_set_ui ( ) __attribute__((unused,always_inline)); static __inline__ void -p448_add ( +p448_add_RAW ( p448_t *out, const p448_t *a, const p448_t *b ) __attribute__((unused,always_inline)); static __inline__ void -p448_sub ( +p448_sub_RAW ( p448_t *out, const p448_t *a, const p448_t *b ) __attribute__((unused,always_inline)); static __inline__ void -p448_neg ( +p448_neg_RAW ( p448_t *out, const p448_t *a ) __attribute__((unused,always_inline)); @@ -133,7 +133,7 @@ p448_set_ui ( } void -p448_add ( +p448_add_RAW ( p448_t *out, const p448_t *a, const p448_t *b @@ -145,7 +145,7 @@ p448_add ( } void -p448_sub ( +p448_sub_RAW ( p448_t *out, const p448_t *a, const p448_t *b @@ -163,7 +163,7 @@ p448_sub ( } void -p448_neg ( +p448_neg_RAW ( p448_t *out, const p448_t *a ) { diff --git a/src/p448/arch_ref64/p448.h b/src/p448/arch_ref64/p448.h index bf43b79..d6670c3 100644 --- a/src/p448/arch_ref64/p448.h +++ b/src/p448/arch_ref64/p448.h @@ -25,21 +25,21 @@ p448_set_ui ( ) __attribute__((unused)); static __inline__ void -p448_add ( +p448_add_RAW ( p448_t *out, const p448_t *a, const p448_t *b ) __attribute__((unused)); static __inline__ void -p448_sub ( +p448_sub_RAW ( p448_t *out, const p448_t *a, const p448_t *b ) __attribute__((unused)); static __inline__ void -p448_neg ( +p448_neg_RAW ( p448_t *out, const p448_t *a ) __attribute__((unused)); @@ -136,7 +136,7 @@ p448_set_ui ( } void -p448_add ( +p448_add_RAW ( p448_t *out, const p448_t *a, const p448_t *b @@ -149,7 +149,7 @@ p448_add ( } void -p448_sub ( +p448_sub_RAW ( p448_t *out, const p448_t *a, const p448_t *b @@ -163,7 +163,7 @@ p448_sub ( } void -p448_neg ( +p448_neg_RAW ( struct p448_t *out, const p448_t *a ) { diff --git a/src/p448/arch_x86_64/p448.h b/src/p448/arch_x86_64/p448.h index 0772d23..20b7597 100644 --- a/src/p448/arch_x86_64/p448.h +++ b/src/p448/arch_x86_64/p448.h @@ -24,21 +24,21 @@ p448_set_ui ( ) __attribute__((unused,always_inline)); static __inline__ void -p448_add ( +p448_add_RAW ( p448_t *out, const p448_t *a, const p448_t *b ) __attribute__((unused,always_inline)); static __inline__ void -p448_sub ( +p448_sub_RAW ( p448_t *out, const p448_t *a, const p448_t *b ) __attribute__((unused,always_inline)); static __inline__ void -p448_neg ( +p448_neg_RAW ( p448_t *out, const p448_t *a ) __attribute__((unused,always_inline)); @@ -129,7 +129,7 @@ p448_set_ui ( } void -p448_add ( +p448_add_RAW ( p448_t *out, const p448_t *a, const p448_t *b @@ -147,7 +147,7 @@ p448_add ( } void -p448_sub ( +p448_sub_RAW ( p448_t *out, const p448_t *a, const p448_t *b @@ -165,7 +165,7 @@ p448_sub ( } void -p448_neg ( +p448_neg_RAW ( struct p448_t *out, const p448_t *a ) { diff --git a/src/p448/f_field.h b/src/p448/f_field.h index c743c8d..7284194 100644 --- a/src/p448/f_field.h +++ b/src/p448/f_field.h @@ -9,23 +9,22 @@ #ifndef __F_FIELD_H__ #define __F_FIELD_H__ 1 -#include #include "constant_time.h" +#include #include "p448.h" #define FIELD_BITS 448 #define field_t p448_t #define field_mul p448_mul #define field_sqr p448_sqr -#define field_add p448_add -#define field_sub p448_sub +#define field_add_RAW p448_add_RAW +#define field_sub_RAW p448_sub_RAW #define field_mulw p448_mulw #define field_addw p448_addw -#define field_subw p448_subw -#define field_neg p448_neg +#define 
field_subw_RAW p448_subw +#define field_neg_RAW p448_neg_RAW #define field_set_ui p448_set_ui #define field_bias p448_bias -#define field_cond_neg p448_cond_neg #define field_inverse p448_inverse #define field_eq p448_eq #define field_isr p448_isr diff --git a/src/p480/arch_x86_64/p480.h b/src/p480/arch_x86_64/p480.h index a49c6d0..ea841aa 100644 --- a/src/p480/arch_x86_64/p480.h +++ b/src/p480/arch_x86_64/p480.h @@ -24,21 +24,21 @@ p480_set_ui ( ) __attribute__((unused,always_inline)); static __inline__ void -p480_add ( +p480_add_RAW ( p480_t *out, const p480_t *a, const p480_t *b ) __attribute__((unused,always_inline)); static __inline__ void -p480_sub ( +p480_sub_RAW ( p480_t *out, const p480_t *a, const p480_t *b ) __attribute__((unused,always_inline)); static __inline__ void -p480_neg ( +p480_neg_RAW ( p480_t *out, const p480_t *a ) __attribute__((unused,always_inline)); @@ -129,7 +129,7 @@ p480_set_ui ( } void -p480_add ( +p480_add_RAW ( p480_t *out, const p480_t *a, const p480_t *b @@ -147,7 +147,7 @@ p480_add ( } void -p480_sub ( +p480_sub_RAW ( p480_t *out, const p480_t *a, const p480_t *b @@ -165,7 +165,7 @@ p480_sub ( } void -p480_neg ( +p480_neg_RAW ( struct p480_t *out, const p480_t *a ) { diff --git a/src/p480/f_field.h b/src/p480/f_field.h index 397f83d..c681bd3 100644 --- a/src/p480/f_field.h +++ b/src/p480/f_field.h @@ -9,23 +9,22 @@ #ifndef __F_FIELD_H__ #define __F_FIELD_H__ 1 -#include #include "constant_time.h" +#include #include "p480.h" #define FIELD_BITS 480 #define field_t p480_t #define field_mul p480_mul #define field_sqr p480_sqr -#define field_add p480_add -#define field_sub p480_sub +#define field_add_RAW p480_add_RAW +#define field_sub_RAW p480_sub_RAW #define field_mulw p480_mulw #define field_addw p480_addw -#define field_subw p480_subw -#define field_neg p480_neg +#define field_subw_RAW p480_subw +#define field_neg_RAW p480_neg_RAW #define field_set_ui p480_set_ui #define field_bias p480_bias -#define field_cond_neg p480_cond_neg #define field_inverse p480_inverse #define field_eq p480_eq #define field_isr p480_isr diff --git a/src/p521/arch_ref64/p521.h b/src/p521/arch_ref64/p521.h index c4dbf69..ff458a6 100644 --- a/src/p521/arch_ref64/p521.h +++ b/src/p521/arch_ref64/p521.h @@ -25,21 +25,21 @@ p521_set_ui ( ) __attribute__((unused)); static __inline__ void -p521_add ( +p521_add_RAW ( p521_t *out, const p521_t *a, const p521_t *b ) __attribute__((unused)); static __inline__ void -p521_sub ( +p521_sub_RAW ( p521_t *out, const p521_t *a, const p521_t *b ) __attribute__((unused)); static __inline__ void -p521_neg ( +p521_neg_RAW ( p521_t *out, const p521_t *a ) __attribute__((unused)); @@ -136,7 +136,7 @@ p521_set_ui ( } void -p521_add ( +p521_add_RAW ( p521_t *out, const p521_t *a, const p521_t *b @@ -149,7 +149,7 @@ p521_add ( } void -p521_sub ( +p521_sub_RAW ( p521_t *out, const p521_t *a, const p521_t *b @@ -163,7 +163,7 @@ p521_sub ( } void -p521_neg ( +p521_neg_RAW ( struct p521_t *out, const p521_t *a ) { diff --git a/src/p521/arch_x86_64_r12/p521.h b/src/p521/arch_x86_64_r12/p521.h index f51e91b..568784b 100644 --- a/src/p521/arch_x86_64_r12/p521.h +++ b/src/p521/arch_x86_64_r12/p521.h @@ -29,21 +29,21 @@ p521_set_ui ( ) __attribute__((unused)); static __inline__ void -p521_add ( +p521_add_RAW ( p521_t *out, const p521_t *a, const p521_t *b ) __attribute__((unused)); static __inline__ void -p521_sub ( +p521_sub_RAW ( p521_t *out, const p521_t *a, const p521_t *b ) __attribute__((unused)); static __inline__ void -p521_neg ( +p521_neg_RAW ( p521_t 
*out, const p521_t *a ) __attribute__((unused)); @@ -147,7 +147,7 @@ p521_set_ui ( } void -p521_add ( +p521_add_RAW ( p521_t *out, const p521_t *a, const p521_t *b @@ -159,7 +159,7 @@ p521_add ( } void -p521_sub ( +p521_sub_RAW ( p521_t *out, const p521_t *a, const p521_t *b @@ -171,7 +171,7 @@ p521_sub ( } void -p521_neg ( +p521_neg_RAW ( struct p521_t *out, const p521_t *a ) { diff --git a/src/p521/f_field.h b/src/p521/f_field.h index f17fe3d..6331072 100644 --- a/src/p521/f_field.h +++ b/src/p521/f_field.h @@ -17,15 +17,14 @@ #define field_t p521_t #define field_mul p521_mul #define field_sqr p521_sqr -#define field_add p521_add -#define field_sub p521_sub +#define field_add_RAW p521_add_RAW +#define field_sub_RAW p521_sub_RAW #define field_mulw p521_mulw #define field_addw p521_addw -#define field_subw p521_subw -#define field_neg p521_neg +#define field_subw_RAW p521_subw +#define field_neg_RAW p521_neg_RAW #define field_set_ui p521_set_ui #define field_bias p521_bias -#define field_cond_neg p521_cond_neg #define field_inverse p521_inverse #define field_eq p521_eq #define field_isr p521_isr diff --git a/test/bench.c b/test/bench.c index ddf8097..31fd9eb 100644 --- a/test/bench.c +++ b/test/bench.c @@ -177,7 +177,6 @@ int main(int argc, char **argv) { field_mul(&c,&b,&a); field_sqr(&b,&c); field_subw(&b,1); - field_bias(&b,1); if (!field_is_zero(&b)) { printf("ISR validation failure!\n"); field_print("a", &a); @@ -232,7 +231,6 @@ int main(int argc, char **argv) { convert_affine_to_extensible(&exta,&affine); serialize_extensible(&b, &exta); field_sub(&c,&b,&a); - field_bias(&c,2); if (!field_is_zero(&c)) { printf("Reserialize validation failure!\n"); field_print("a", &a); @@ -635,7 +633,6 @@ int main(int argc, char **argv) { ignore_result(montgomery_ladder(&b,&a,&z,WORD_BITS,0)); field_sub(&d,&b,&c); - field_bias(&d,2); if (!field_is_zero(&d)) { printf("Odd ladder validation failure %d!\n", ++failures); field_print("a", &a); @@ -661,7 +658,6 @@ int main(int argc, char **argv) { untwist_and_double_and_serialize(&c, &ext); field_sub(&d,&b,&c); - field_bias(&d,2); if (good && !field_is_zero(&d)){ printf("Iso+serial validation failure %d!\n", ++failures); @@ -717,7 +713,6 @@ int main(int argc, char **argv) { serialize_extensible(&c, &exta); field_sub(&d,&b,&c); - field_bias(&d,2); if (!field_is_zero(&d)){ printf("PreWNAF combo validation failure %d!\n", ++failures); diff --git a/test/test_arithmetic.c b/test/test_arithmetic.c index bbdbf43..7c45407 100644 --- a/test/test_arithmetic.c +++ b/test/test_arithmetic.c @@ -83,7 +83,7 @@ static mask_t field_assert_eq_gmp( return MASK_SUCCESS; } -static mask_t test_add_sub ( +static mask_t test_add_sub_RAW ( const mpz_t x, const mpz_t y, word_t word @@ -95,11 +95,11 @@ static mask_t test_add_sub ( succ &= mpz_to_field(&yy,y); mpz_init(t); - field_add(&tt,&xx,&yy); + field_add_RAW(&tt,&xx,&yy); mpz_add(t,x,y); succ &= field_assert_eq_gmp("add",&xx,&yy,&tt,t,0,2.1); - field_sub(&tt,&xx,&yy); + field_sub_RAW(&tt,&xx,&yy); field_bias(&tt,2); mpz_sub(t,x,y); succ &= field_assert_eq_gmp("sub",&xx,&yy,&tt,t,0,3.1); @@ -232,13 +232,13 @@ int test_arithmetic (void) { word_t word = gmp_urandomm_ui (state, 1ull< Date: Thu, 22 Jan 2015 15:35:58 -0800 Subject: [PATCH 03/15] going to GMP-style element[1] types --- src/arithmetic.c | 77 ++-- src/ec_point.c | 955 ++++++++++++++++++++-------------------- src/goldilocks.c | 70 +-- src/include/ec_point.h | 83 ++-- src/include/field.h | 79 ++-- src/include/magic.h | 2 +- src/include/scalarmul.h | 4 +- 
src/p448/f_arithmetic.c | 54 +-- src/p448/magic.c | 16 +- src/p480/f_arithmetic.c | 54 +-- src/p480/magic.c | 12 +- src/p521/f_arithmetic.c | 54 +-- src/p521/magic.c | 12 +- src/scalarmul.c | 70 +-- test/bench.c | 172 ++++---- test/test.c | 2 +- test/test.h | 2 +- test/test_arithmetic.c | 94 ++-- test/test_pointops.c | 48 +- test/test_scalarmul.c | 90 ++-- 20 files changed, 974 insertions(+), 976 deletions(-) diff --git a/src/arithmetic.c b/src/arithmetic.c index 4530aa3..89be5c4 100644 --- a/src/arithmetic.c +++ b/src/arithmetic.c @@ -13,78 +13,75 @@ mask_t field_eq ( - const struct field_t *a, - const struct field_t *b + const field_a_t a, + const field_a_t b ) { - struct field_t ra, rb; - field_copy(&ra, a); - field_copy(&rb, b); - field_weak_reduce(&ra); - field_weak_reduce(&rb); - field_sub_RAW(&ra, &ra, &rb); - field_bias(&ra, 2); - return field_is_zero(&ra); + field_a_t ra, rb; + field_copy(ra, a); + field_copy(rb, b); + field_weak_reduce(ra); + field_weak_reduce(rb); + field_sub_RAW(ra, ra, rb); + field_bias(ra, 2); + return field_is_zero(ra); } void field_inverse ( - struct field_t* a, - const struct field_t* x + field_a_t a, + const field_a_t x ) { - struct field_t L0, L1; - field_isr ( &L0, x ); - field_sqr ( &L1, &L0 ); - field_sqr ( &L0, &L1 ); - field_mul ( a, x, &L0 ); + field_a_t L0, L1; + field_isr ( L0, x ); + field_sqr ( L1, L0 ); + field_sqr ( L0, L1 ); + field_mul ( a, x, L0 ); } mask_t field_is_square ( - const struct field_t* x + const field_a_t x ) { - mask_t L2, L3; - struct field_t L0, L1; - field_isr ( &L0, x ); - field_sqr ( &L1, &L0 ); - field_mul ( &L0, x, &L1 ); - field_subw( &L0, 1 ); - L3 = field_is_zero( &L0 ); - L2 = field_is_zero( x ); - return L3 | L2; + field_a_t L0, L1; + field_isr ( L0, x ); + field_sqr ( L1, L0 ); + field_mul ( L0, x, L1 ); + field_subw( L0, 1 ); + return field_is_zero( L0 ) | field_is_zero( x ); } void field_simultaneous_invert ( - struct field_t *__restrict__ out, - const struct field_t *in, + field_a_t *__restrict__ out, + const field_a_t *in, unsigned int n ) { if (n==0) { return; } else if (n==1) { - field_inverse(out,in); + field_inverse(out[0],in[0]); return; } - field_copy(&out[1], &in[0]); + field_copy(out[1], in[0]); int i; for (i=1; i<(int) (n-1); i++) { - field_mul(&out[i+1], &out[i], &in[i]); + field_mul(out[i+1], out[i], in[i]); } - field_mul(&out[0], &out[n-1], &in[n-1]); + field_mul(out[0], out[n-1], in[n-1]); - struct field_t tmp; - field_inverse(&tmp, &out[0]); - field_copy(&out[0], &tmp); + field_a_t tmp; + field_inverse(tmp, out[0]); + field_copy(out[0], tmp); /* at this point, out[0] = product(in[i]) ^ -1 * out[i] = product(in[0]..in[i-1]) if i != 0 */ for (i=n-1; i>0; i--) { - field_mul(&tmp, &out[i], &out[0]); - field_copy(&out[i], &tmp); + field_mul(tmp, out[i], out[0]); + field_copy(out[i], tmp); - field_mul(&tmp, &out[0], &in[i]); - field_copy(&out[0], &tmp); + field_mul(tmp, out[0], in[i]); + field_copy(out[0], tmp); } } diff --git a/src/ec_point.c b/src/ec_point.c index a486df1..905ba60 100644 --- a/src/ec_point.c +++ b/src/ec_point.c @@ -18,20 +18,20 @@ add_tw_niels_to_tw_extensible ( const struct tw_niels_t* e ) { ANALYZE_THIS_ROUTINE_CAREFULLY; - struct field_t L0, L1; - field_sub ( &L1, &d->y, &d->x ); - field_mul ( &L0, &e->a, &L1 ); - field_add_nr ( &L1, &d->x, &d->y ); - field_mul ( &d->y, &e->b, &L1 ); - field_mul ( &L1, &d->u, &d->t ); - field_mul ( &d->x, &e->c, &L1 ); - field_add_nr ( &d->u, &L0, &d->y ); - field_subx_nr ( &d->t, &d->y, &L0 ); - field_subx_nr ( &d->y, &d->z, &d->x ); - 
field_add_nr ( &L0, &d->x, &d->z ); - field_mul ( &d->z, &L0, &d->y ); - field_mul ( &d->x, &d->y, &d->t ); - field_mul ( &d->y, &L0, &d->u ); + field_a_t L0, L1; + field_sub ( L1, d->y, d->x ); + field_mul ( L0, e->a, L1 ); + field_add_nr ( L1, d->x, d->y ); + field_mul ( d->y, e->b, L1 ); + field_mul ( L1, d->u, d->t ); + field_mul ( d->x, e->c, L1 ); + field_add_nr ( d->u, L0, d->y ); + field_subx_nr ( d->t, d->y, L0 ); + field_subx_nr ( d->y, d->z, d->x ); + field_add_nr ( L0, d->x, d->z ); + field_mul ( d->z, L0, d->y ); + field_mul ( d->x, d->y, d->t ); + field_mul ( d->y, L0, d->u ); } void @@ -40,20 +40,20 @@ sub_tw_niels_from_tw_extensible ( const struct tw_niels_t* e ) { ANALYZE_THIS_ROUTINE_CAREFULLY; - struct field_t L0, L1; - field_subx_nr ( &L1, &d->y, &d->x ); - field_mul ( &L0, &e->b, &L1 ); - field_add_nr ( &L1, &d->x, &d->y ); - field_mul ( &d->y, &e->a, &L1 ); - field_mul ( &L1, &d->u, &d->t ); - field_mul ( &d->x, &e->c, &L1 ); - field_add_nr ( &d->u, &L0, &d->y ); - field_subx_nr ( &d->t, &d->y, &L0 ); - field_add_nr ( &d->y, &d->x, &d->z ); - field_subx_nr ( &L0, &d->z, &d->x ); - field_mul ( &d->z, &L0, &d->y ); - field_mul ( &d->x, &d->y, &d->t ); - field_mul ( &d->y, &L0, &d->u ); + field_a_t L0, L1; + field_subx_nr ( L1, d->y, d->x ); + field_mul ( L0, e->b, L1 ); + field_add_nr ( L1, d->x, d->y ); + field_mul ( d->y, e->a, L1 ); + field_mul ( L1, d->u, d->t ); + field_mul ( d->x, e->c, L1 ); + field_add_nr ( d->u, L0, d->y ); + field_subx_nr ( d->t, d->y, L0 ); + field_add_nr ( d->y, d->x, d->z ); + field_subx_nr ( L0, d->z, d->x ); + field_mul ( d->z, L0, d->y ); + field_mul ( d->x, d->y, d->t ); + field_mul ( d->y, L0, d->u ); } void @@ -61,9 +61,9 @@ add_tw_pniels_to_tw_extensible ( struct tw_extensible_t* e, const struct tw_pniels_t* a ) { - struct field_t L0; - field_mul ( &L0, &e->z, &a->z ); - field_copy ( &e->z, &L0 ); + field_a_t L0; + field_mul ( L0, e->z, a->z ); + field_copy ( e->z, L0 ); add_tw_niels_to_tw_extensible( e, &a->n ); } @@ -72,9 +72,9 @@ sub_tw_pniels_from_tw_extensible ( struct tw_extensible_t* e, const struct tw_pniels_t* a ) { - struct field_t L0; - field_mul ( &L0, &e->z, &a->z ); - field_copy ( &e->z, &L0 ); + field_a_t L0; + field_mul ( L0, e->z, a->z ); + field_copy ( e->z, L0 ); sub_tw_niels_from_tw_extensible( e, &a->n ); } @@ -83,24 +83,24 @@ double_tw_extensible ( struct tw_extensible_t* a ) { ANALYZE_THIS_ROUTINE_CAREFULLY; - struct field_t L0, L1, L2; - field_sqr ( &L2, &a->x ); - field_sqr ( &L0, &a->y ); - field_add_nr ( &a->u, &L2, &L0 ); - field_add_nr ( &a->t, &a->y, &a->x ); - field_sqr ( &L1, &a->t ); - field_sub_nr ( &a->t, &L1, &a->u ); - field_bias ( &a->t, 3 ); - IF32( field_weak_reduce( &a->t ) ); - field_subx_nr ( &L1, &L0, &L2 ); - field_sqr ( &a->x, &a->z ); - field_bias ( &a->x, 2-is32 /*is32 ? 1 : 2*/ ); - field_add_nr ( &a->z, &a->x, &a->x ); - field_sub_nr ( &L0, &a->z, &L1 ); - IF32( field_weak_reduce( &L0 ) ); - field_mul ( &a->z, &L1, &L0 ); - field_mul ( &a->x, &L0, &a->t ); - field_mul ( &a->y, &L1, &a->u ); + field_a_t L0, L1, L2; + field_sqr ( L2, a->x ); + field_sqr ( L0, a->y ); + field_add_nr ( a->u, L2, L0 ); + field_add_nr ( a->t, a->y, a->x ); + field_sqr ( L1, a->t ); + field_sub_nr ( a->t, L1, a->u ); + field_bias ( a->t, 3 ); + IF32( field_weak_reduce( a->t ) ); + field_subx_nr ( L1, L0, L2 ); + field_sqr ( a->x, a->z ); + field_bias ( a->x, 2-is32 /*is32 ? 
1 : 2*/ ); + field_add_nr ( a->z, a->x, a->x ); + field_sub_nr ( L0, a->z, L1 ); + IF32( field_weak_reduce( L0 ) ); + field_mul ( a->z, L1, L0 ); + field_mul ( a->x, L0, a->t ); + field_mul ( a->y, L1, a->u ); } void @@ -108,24 +108,24 @@ double_extensible ( struct extensible_t* a ) { ANALYZE_THIS_ROUTINE_CAREFULLY; - struct field_t L0, L1, L2; - field_sqr ( &L2, &a->x ); - field_sqr ( &L0, &a->y ); - field_add_nr ( &L1, &L2, &L0 ); - field_add_nr ( &a->t, &a->y, &a->x ); - field_sqr ( &a->u, &a->t ); - field_sub_nr ( &a->t, &a->u, &L1 ); - field_bias ( &a->t, 3 ); - IF32( field_weak_reduce( &a->t ) ); - field_subx_nr ( &a->u, &L0, &L2 ); - field_sqr ( &a->x, &a->z ); - field_bias ( &a->x, 2 ); - field_add_nr ( &a->z, &a->x, &a->x ); - field_sub_nr ( &L0, &a->z, &L1 ); - IF32( field_weak_reduce( &L0 ) ); - field_mul ( &a->z, &L1, &L0 ); - field_mul ( &a->x, &L0, &a->t ); - field_mul ( &a->y, &L1, &a->u ); + field_a_t L0, L1, L2; + field_sqr ( L2, a->x ); + field_sqr ( L0, a->y ); + field_add_nr ( L1, L2, L0 ); + field_add_nr ( a->t, a->y, a->x ); + field_sqr ( a->u, a->t ); + field_sub_nr ( a->t, a->u, L1 ); + field_bias ( a->t, 3 ); + IF32( field_weak_reduce( a->t ) ); + field_subx_nr ( a->u, L0, L2 ); + field_sqr ( a->x, a->z ); + field_bias ( a->x, 2 ); + field_add_nr ( a->z, a->x, a->x ); + field_sub_nr ( L0, a->z, L1 ); + IF32( field_weak_reduce( L0 ) ); + field_mul ( a->z, L1, L0 ); + field_mul ( a->x, L0, a->t ); + field_mul ( a->y, L1, a->u ); } void @@ -133,20 +133,20 @@ twist_and_double ( struct tw_extensible_t* b, const struct extensible_t* a ) { - struct field_t L0; - field_sqr ( &b->x, &a->x ); - field_sqr ( &b->z, &a->y ); - field_add ( &b->u, &b->x, &b->z ); - field_add ( &b->t, &a->y, &a->x ); - field_sqr ( &L0, &b->t ); - field_sub ( &b->t, &L0, &b->u ); - field_sub ( &L0, &b->z, &b->x ); - field_sqr ( &b->x, &a->z ); - field_add ( &b->z, &b->x, &b->x ); - field_sub ( &b->y, &b->z, &b->u ); - field_mul ( &b->z, &L0, &b->y ); - field_mul ( &b->x, &b->y, &b->t ); - field_mul ( &b->y, &L0, &b->u ); + field_a_t L0; + field_sqr ( b->x, a->x ); + field_sqr ( b->z, a->y ); + field_add ( b->u, b->x, b->z ); + field_add ( b->t, a->y, a->x ); + field_sqr ( L0, b->t ); + field_sub ( b->t, L0, b->u ); + field_sub ( L0, b->z, b->x ); + field_sqr ( b->x, a->z ); + field_add ( b->z, b->x, b->x ); + field_sub ( b->y, b->z, b->u ); + field_mul ( b->z, L0, b->y ); + field_mul ( b->x, b->y, b->t ); + field_mul ( b->y, L0, b->u ); } void @@ -154,20 +154,20 @@ untwist_and_double ( struct extensible_t* b, const struct tw_extensible_t* a ) { - struct field_t L0; - field_sqr ( &b->x, &a->x ); - field_sqr ( &b->z, &a->y ); - field_add ( &L0, &b->x, &b->z ); - field_add ( &b->t, &a->y, &a->x ); - field_sqr ( &b->u, &b->t ); - field_sub ( &b->t, &b->u, &L0 ); - field_sub ( &b->u, &b->z, &b->x ); - field_sqr ( &b->x, &a->z ); - field_add ( &b->z, &b->x, &b->x ); - field_sub ( &b->y, &b->z, &b->u ); - field_mul ( &b->z, &L0, &b->y ); - field_mul ( &b->x, &b->y, &b->t ); - field_mul ( &b->y, &L0, &b->u ); + field_a_t L0; + field_sqr ( b->x, a->x ); + field_sqr ( b->z, a->y ); + field_add ( L0, b->x, b->z ); + field_add ( b->t, a->y, a->x ); + field_sqr ( b->u, b->t ); + field_sub ( b->t, b->u, L0 ); + field_sub ( b->u, b->z, b->x ); + field_sqr ( b->x, a->z ); + field_add ( b->z, b->x, b->x ); + field_sub ( b->y, b->z, b->u ); + field_mul ( b->z, L0, b->y ); + field_mul ( b->x, b->y, b->t ); + field_mul ( b->y, L0, b->u ); } void @@ -175,11 +175,11 @@ convert_tw_affine_to_tw_pniels ( struct 
tw_pniels_t* b, const struct tw_affine_t* a ) { - field_sub ( &b->n.a, &a->y, &a->x ); - field_add ( &b->n.b, &a->x, &a->y ); - field_mul ( &b->z, &a->y, &a->x ); - field_mulw_scc_wr ( &b->n.c, &b->z, 2*EDWARDS_D-2 ); - field_set_ui( &b->z, 2 ); + field_sub ( b->n.a, a->y, a->x ); + field_add ( b->n.b, a->x, a->y ); + field_mul ( b->z, a->y, a->x ); + field_mulw_scc_wr ( b->n.c, b->z, 2*EDWARDS_D-2 ); + field_set_ui( b->z, 2 ); } void @@ -187,11 +187,11 @@ convert_tw_affine_to_tw_extensible ( struct tw_extensible_t* b, const struct tw_affine_t* a ) { - field_copy ( &b->x, &a->x ); - field_copy ( &b->y, &a->y ); - field_set_ui( &b->z, 1 ); - field_copy ( &b->t, &a->x ); - field_copy ( &b->u, &a->y ); + field_copy ( b->x, a->x ); + field_copy ( b->y, a->y ); + field_set_ui( b->z, 1 ); + field_copy ( b->t, a->x ); + field_copy ( b->u, a->y ); } void @@ -199,11 +199,11 @@ convert_affine_to_extensible ( struct extensible_t* b, const struct affine_t* a ) { - field_copy ( &b->x, &a->x ); - field_copy ( &b->y, &a->y ); - field_set_ui( &b->z, 1 ); - field_copy ( &b->t, &a->x ); - field_copy ( &b->u, &a->y ); + field_copy ( b->x, a->x ); + field_copy ( b->y, a->y ); + field_set_ui( b->z, 1 ); + field_copy ( b->t, a->x ); + field_copy ( b->u, a->y ); } void @@ -211,11 +211,11 @@ convert_tw_extensible_to_tw_pniels ( struct tw_pniels_t* b, const struct tw_extensible_t* a ) { - field_sub ( &b->n.a, &a->y, &a->x ); - field_add ( &b->n.b, &a->x, &a->y ); - field_mul ( &b->z, &a->u, &a->t ); - field_mulw_scc_wr ( &b->n.c, &b->z, 2*EDWARDS_D-2 ); - field_add ( &b->z, &a->z, &a->z ); + field_sub ( b->n.a, a->y, a->x ); + field_add ( b->n.b, a->x, a->y ); + field_mul ( b->z, a->u, a->t ); + field_mulw_scc_wr ( b->n.c, b->z, 2*EDWARDS_D-2 ); + field_add ( b->z, a->z, a->z ); } void @@ -223,11 +223,11 @@ convert_tw_pniels_to_tw_extensible ( struct tw_extensible_t* e, const struct tw_pniels_t* d ) { - field_add ( &e->u, &d->n.b, &d->n.a ); - field_sub ( &e->t, &d->n.b, &d->n.a ); - field_mul ( &e->x, &d->z, &e->t ); - field_mul ( &e->y, &d->z, &e->u ); - field_sqr ( &e->z, &d->z ); + field_add ( e->u, d->n.b, d->n.a ); + field_sub ( e->t, d->n.b, d->n.a ); + field_mul ( e->x, d->z, e->t ); + field_mul ( e->y, d->z, e->u ); + field_sqr ( e->z, d->z ); } void @@ -235,11 +235,11 @@ convert_tw_niels_to_tw_extensible ( struct tw_extensible_t* e, const struct tw_niels_t* d ) { - field_add ( &e->y, &d->b, &d->a ); - field_sub ( &e->x, &d->b, &d->a ); - field_set_ui( &e->z, 1 ); - field_copy ( &e->t, &e->x ); - field_copy ( &e->u, &e->y ); + field_add ( e->y, d->b, d->a ); + field_sub ( e->x, d->b, d->a ); + field_set_ui( e->z, 1 ); + field_copy ( e->t, e->x ); + field_copy ( e->u, e->y ); } void @@ -247,130 +247,130 @@ montgomery_step ( struct montgomery_t* a ) { ANALYZE_THIS_ROUTINE_CAREFULLY; - struct field_t L0, L1; - field_add_nr ( &L0, &a->zd, &a->xd ); - field_sub ( &L1, &a->xd, &a->zd ); - field_sub ( &a->zd, &a->xa, &a->za ); - field_mul ( &a->xd, &L0, &a->zd ); - field_add_nr ( &a->zd, &a->za, &a->xa ); - field_mul ( &a->za, &L1, &a->zd ); - field_add_nr ( &a->xa, &a->za, &a->xd ); - field_sqr ( &a->zd, &a->xa ); - field_mul ( &a->xa, &a->z0, &a->zd ); - field_sub ( &a->zd, &a->xd, &a->za ); - field_sqr ( &a->za, &a->zd ); - field_sqr ( &a->xd, &L0 ); - field_sqr ( &L0, &L1 ); - field_mulw_scc ( &a->zd, &a->xd, 1-EDWARDS_D ); /* FIXME PERF MULW */ - field_sub ( &L1, &a->xd, &L0 ); - field_mul ( &a->xd, &L0, &a->zd ); - field_sub_nr ( &L0, &a->zd, &L1 ); - field_bias ( &L0, 4 - 2*is32 /*is32 ? 
2 : 4*/ ); - IF32( field_weak_reduce( &L0 ) ); - field_mul ( &a->zd, &L0, &L1 ); + field_a_t L0, L1; + field_add_nr ( L0, a->zd, a->xd ); + field_sub ( L1, a->xd, a->zd ); + field_sub ( a->zd, a->xa, a->za ); + field_mul ( a->xd, L0, a->zd ); + field_add_nr ( a->zd, a->za, a->xa ); + field_mul ( a->za, L1, a->zd ); + field_add_nr ( a->xa, a->za, a->xd ); + field_sqr ( a->zd, a->xa ); + field_mul ( a->xa, a->z0, a->zd ); + field_sub ( a->zd, a->xd, a->za ); + field_sqr ( a->za, a->zd ); + field_sqr ( a->xd, L0 ); + field_sqr ( L0, L1 ); + field_mulw_scc ( a->zd, a->xd, 1-EDWARDS_D ); /* FIXME PERF MULW */ + field_sub ( L1, a->xd, L0 ); + field_mul ( a->xd, L0, a->zd ); + field_sub_nr ( L0, a->zd, L1 ); + field_bias ( L0, 4 - 2*is32 /*is32 ? 2 : 4*/ ); + IF32( field_weak_reduce( L0 ) ); + field_mul ( a->zd, L0, L1 ); } void deserialize_montgomery ( struct montgomery_t* a, - const struct field_t* sbz + const field_a_t sbz ) { - field_sqr ( &a->z0, sbz ); - field_set_ui( &a->xd, 1 ); - field_set_ui( &a->zd, 0 ); - field_set_ui( &a->xa, 1 ); - field_copy ( &a->za, &a->z0 ); + field_sqr ( a->z0, sbz ); + field_set_ui( a->xd, 1 ); + field_set_ui( a->zd, 0 ); + field_set_ui( a->xa, 1 ); + field_copy ( a->za, a->z0 ); } mask_t serialize_montgomery ( - struct field_t* b, + field_a_t b, const struct montgomery_t* a, - const struct field_t* sbz + const field_a_t sbz ) { mask_t L4, L5, L6; - struct field_t L0, L1, L2, L3; - field_mul ( &L3, &a->z0, &a->zd ); - field_sub ( &L1, &L3, &a->xd ); - field_mul ( &L3, &a->za, &L1 ); - field_mul ( &L2, &a->z0, &a->xd ); - field_sub ( &L1, &L2, &a->zd ); - field_mul ( &L0, &a->xa, &L1 ); - field_add ( &L2, &L0, &L3 ); - field_sub ( &L1, &L3, &L0 ); - field_mul ( &L3, &L1, &L2 ); - field_copy ( &L2, &a->z0 ); - field_addw ( &L2, 1 ); - field_sqr ( &L0, &L2 ); - field_mulw_scc_wr ( &L1, &L0, EDWARDS_D-1 ); - field_add ( &L2, &a->z0, &a->z0 ); - field_add ( &L0, &L2, &L2 ); - field_add ( &L2, &L0, &L1 ); - field_mul ( &L0, &a->xd, &L2 ); - L5 = field_is_zero( &a->zd ); + field_a_t L0, L1, L2, L3; + field_mul ( L3, a->z0, a->zd ); + field_sub ( L1, L3, a->xd ); + field_mul ( L3, a->za, L1 ); + field_mul ( L2, a->z0, a->xd ); + field_sub ( L1, L2, a->zd ); + field_mul ( L0, a->xa, L1 ); + field_add ( L2, L0, L3 ); + field_sub ( L1, L3, L0 ); + field_mul ( L3, L1, L2 ); + field_copy ( L2, a->z0 ); + field_addw ( L2, 1 ); + field_sqr ( L0, L2 ); + field_mulw_scc_wr ( L1, L0, EDWARDS_D-1 ); + field_add ( L2, a->z0, a->z0 ); + field_add ( L0, L2, L2 ); + field_add ( L2, L0, L1 ); + field_mul ( L0, a->xd, L2 ); + L5 = field_is_zero( a->zd ); L6 = - L5; - constant_time_mask ( &L1, &L0, sizeof(L1), L5 ); - field_add ( &L2, &L1, &a->zd ); + constant_time_mask ( L1, L0, sizeof(L1), L5 ); + field_add ( L2, L1, a->zd ); L4 = ~ L5; - field_mul ( &L1, sbz, &L3 ); - field_addw ( &L1, L6 ); - field_mul ( &L3, &L2, &L1 ); - field_mul ( &L1, &L3, &L2 ); - field_mul ( &L2, &L3, &a->xd ); - field_mul ( &L3, &L1, &L2 ); - field_isr ( &L0, &L3 ); - field_mul ( &L2, &L1, &L0 ); - field_sqr ( &L1, &L0 ); - field_mul ( &L0, &L3, &L1 ); - constant_time_mask ( b, &L2, sizeof(L1), L4 ); - field_subw( &L0, 1 ); - L5 = field_is_zero( &L0 ); + field_mul ( L1, sbz, L3 ); + field_addw ( L1, L6 ); + field_mul ( L3, L2, L1 ); + field_mul ( L1, L3, L2 ); + field_mul ( L2, L3, a->xd ); + field_mul ( L3, L1, L2 ); + field_isr ( L0, L3 ); + field_mul ( L2, L1, L0 ); + field_sqr ( L1, L0 ); + field_mul ( L0, L3, L1 ); + constant_time_mask ( b, L2, sizeof(L1), L4 ); + field_subw( L0, 1 ); + L5 = 
field_is_zero( L0 ); L4 = field_is_zero( sbz ); return L5 | L4; } void serialize_extensible ( - struct field_t* b, + field_a_t b, const struct extensible_t* a ) { - struct field_t L0, L1, L2; - field_sub ( &L0, &a->y, &a->z ); - field_add ( b, &a->z, &a->y ); - field_mul ( &L1, &a->z, &a->x ); - field_mul ( &L2, &L0, &L1 ); - field_mul ( &L1, &L2, &L0 ); - field_mul ( &L0, &L2, b ); - field_mul ( &L2, &L1, &L0 ); - field_isr ( &L0, &L2 ); - field_mul ( b, &L1, &L0 ); - field_sqr ( &L1, &L0 ); - field_mul ( &L0, &L2, &L1 ); + field_a_t L0, L1, L2; + field_sub ( L0, a->y, a->z ); + field_add ( b, a->z, a->y ); + field_mul ( L1, a->z, a->x ); + field_mul ( L2, L0, L1 ); + field_mul ( L1, L2, L0 ); + field_mul ( L0, L2, b ); + field_mul ( L2, L1, L0 ); + field_isr ( L0, L2 ); + field_mul ( b, L1, L0 ); + field_sqr ( L1, L0 ); + field_mul ( L0, L2, L1 ); } void untwist_and_double_and_serialize ( - struct field_t* b, + field_a_t b, const struct tw_extensible_t* a ) { - struct field_t L0, L1, L2, L3; - field_mul ( &L3, &a->y, &a->x ); - field_add ( b, &a->y, &a->x ); - field_sqr ( &L1, b ); - field_add ( &L2, &L3, &L3 ); - field_sub ( b, &L1, &L2 ); - field_sqr ( &L2, &a->z ); - field_sqr ( &L1, &L2 ); + field_a_t L0, L1, L2, L3; + field_mul ( L3, a->y, a->x ); + field_add ( b, a->y, a->x ); + field_sqr ( L1, b ); + field_add ( L2, L3, L3 ); + field_sub ( b, L1, L2 ); + field_sqr ( L2, a->z ); + field_sqr ( L1, L2 ); field_add ( b, b, b ); - field_mulw_scc ( &L2, b, EDWARDS_D-1 ); - field_mulw_scc ( b, &L2, EDWARDS_D-1 ); - field_mul ( &L0, &L2, &L1 ); - field_mul ( &L2, b, &L0 ); - field_isr ( &L0, &L2 ); - field_mul ( &L1, b, &L0 ); - field_sqr ( b, &L0 ); - field_mul ( &L0, &L2, b ); - field_mul ( b, &L1, &L3 ); + field_mulw_scc ( L2, b, EDWARDS_D-1 ); + field_mulw_scc ( b, L2, EDWARDS_D-1 ); + field_mul ( L0, L2, L1 ); + field_mul ( L2, b, L0 ); + field_isr ( L0, L2 ); + field_mul ( L1, b, L0 ); + field_sqr ( b, L0 ); + field_mul ( L0, L2, b ); + field_mul ( b, L1, L3 ); } void @@ -378,25 +378,25 @@ twist_even ( struct tw_extensible_t* b, const struct extensible_t* a ) { - field_sqr ( &b->y, &a->z ); - field_sqr ( &b->z, &a->x ); - field_sub ( &b->u, &b->y, &b->z ); - field_sub ( &b->z, &a->z, &a->x ); - field_mul ( &b->y, &b->z, &a->y ); - field_sub ( &b->z, &a->z, &a->y ); - field_mul ( &b->x, &b->z, &b->y ); - field_mul ( &b->t, &b->x, &b->u ); - field_mul ( &b->y, &b->x, &b->t ); - field_isr ( &b->t, &b->y ); - field_mul ( &b->u, &b->x, &b->t ); - field_sqr ( &b->x, &b->t ); - field_mul ( &b->t, &b->y, &b->x ); - field_mul ( &b->x, &a->x, &b->u ); - field_mul ( &b->y, &a->y, &b->u ); - field_addw ( &b->y, -field_is_zero( &b->z ) ); - field_set_ui( &b->z, 1 ); - field_copy ( &b->t, &b->x ); - field_copy ( &b->u, &b->y ); + field_sqr ( b->y, a->z ); + field_sqr ( b->z, a->x ); + field_sub ( b->u, b->y, b->z ); + field_sub ( b->z, a->z, a->x ); + field_mul ( b->y, b->z, a->y ); + field_sub ( b->z, a->z, a->y ); + field_mul ( b->x, b->z, b->y ); + field_mul ( b->t, b->x, b->u ); + field_mul ( b->y, b->x, b->t ); + field_isr ( b->t, b->y ); + field_mul ( b->u, b->x, b->t ); + field_sqr ( b->x, b->t ); + field_mul ( b->t, b->y, b->x ); + field_mul ( b->x, a->x, b->u ); + field_mul ( b->y, a->y, b->u ); + field_addw ( b->y, -field_is_zero( b->z ) ); + field_set_ui( b->z, 1 ); + field_copy ( b->t, b->x ); + field_copy ( b->u, b->y ); } void @@ -404,135 +404,134 @@ test_only_twist ( struct tw_extensible_t* b, const struct extensible_t* a ) { - struct field_t L0, L1; - field_sqr ( &b->u, &a->z ); - 
field_sqr ( &b->y, &a->x ); - field_sub ( &b->z, &b->u, &b->y ); - field_add ( &b->y, &b->z, &b->z ); - field_add ( &b->u, &b->y, &b->y ); - field_sub ( &b->y, &a->z, &a->x ); - field_mul ( &b->x, &b->y, &a->y ); - field_sub ( &b->z, &a->z, &a->y ); - field_mul ( &b->t, &b->z, &b->x ); - field_mul ( &L1, &b->t, &b->u ); - field_mul ( &b->x, &b->t, &L1 ); - field_isr ( &L0, &b->x ); - field_mul ( &b->u, &b->t, &L0 ); - field_sqr ( &L1, &L0 ); - field_mul ( &b->t, &b->x, &L1 ); - field_add ( &L1, &a->y, &a->x ); - field_sub ( &L0, &a->x, &a->y ); - field_mul ( &b->x, &b->t, &L0 ); - field_add ( &L0, &b->x, &L1 ); - field_sub ( &b->t, &L1, &b->x ); - field_mul ( &b->x, &L0, &b->u ); - field_addw ( &b->x, -field_is_zero( &b->y ) ); - field_mul ( &b->y, &b->t, &b->u ); - field_addw ( &b->y, -field_is_zero( &b->z ) ); - field_set_ui( &b->z, 1+field_is_zero( &a->y ) ); - field_copy ( &b->t, &b->x ); - field_copy ( &b->u, &b->y ); + field_a_t L0, L1; + field_sqr ( b->u, a->z ); + field_sqr ( b->y, a->x ); + field_sub ( b->z, b->u, b->y ); + field_add ( b->y, b->z, b->z ); + field_add ( b->u, b->y, b->y ); + field_sub ( b->y, a->z, a->x ); + field_mul ( b->x, b->y, a->y ); + field_sub ( b->z, a->z, a->y ); + field_mul ( b->t, b->z, b->x ); + field_mul ( L1, b->t, b->u ); + field_mul ( b->x, b->t, L1 ); + field_isr ( L0, b->x ); + field_mul ( b->u, b->t, L0 ); + field_sqr ( L1, L0 ); + field_mul ( b->t, b->x, L1 ); + field_add ( L1, a->y, a->x ); + field_sub ( L0, a->x, a->y ); + field_mul ( b->x, b->t, L0 ); + field_add ( L0, b->x, L1 ); + field_sub ( b->t, L1, b->x ); + field_mul ( b->x, L0, b->u ); + field_addw ( b->x, -field_is_zero( b->y ) ); + field_mul ( b->y, b->t, b->u ); + field_addw ( b->y, -field_is_zero( b->z ) ); + field_set_ui( b->z, 1+field_is_zero( a->y ) ); + field_copy ( b->t, b->x ); + field_copy ( b->u, b->y ); } mask_t is_even_pt ( const struct extensible_t* a ) { - struct field_t L0, L1, L2; - field_sqr ( &L2, &a->z ); - field_sqr ( &L1, &a->x ); - field_sub ( &L0, &L2, &L1 ); - return field_is_square ( &L0 ); + field_a_t L0, L1, L2; + field_sqr ( L2, a->z ); + field_sqr ( L1, a->x ); + field_sub ( L0, L2, L1 ); + return field_is_square ( L0 ); } mask_t is_even_tw ( const struct tw_extensible_t* a ) { - struct field_t L0, L1, L2; - field_sqr ( &L2, &a->z ); - field_sqr ( &L1, &a->x ); - field_add ( &L0, &L1, &L2 ); - return field_is_square ( &L0 ); + field_a_t L0, L1, L2; + field_sqr ( L2, a->z ); + field_sqr ( L1, a->x ); + field_add ( L0, L1, L2 ); + return field_is_square ( L0 ); } mask_t deserialize_affine ( struct affine_t* a, - const struct field_t* sz -) { - struct field_t L0, L1, L2, L3; - field_sqr ( &L1, sz ); - field_copy ( &L3, &L1 ); - field_addw ( &L3, 1 ); - field_sqr ( &L2, &L3 ); - field_mulw_scc ( &a->x, &L2, EDWARDS_D-1 ); /* PERF MULW */ - field_add ( &L3, &L1, &L1 ); /* FIXME: i adjusted the bias here, was it right? 
*/ - field_add ( &a->y, &L3, &L3 ); - field_add ( &L3, &a->y, &a->x ); - field_copy ( &a->y, &L1 ); - field_negx ( &a->x, &a->y ); - field_addw ( &a->x, 1 ); - field_mul ( &a->y, &a->x, &L3 ); - field_sqr ( &L2, &a->x ); - field_mul ( &L0, &L2, &a->y ); - field_mul ( &a->y, &a->x, &L0 ); - field_isr ( &L3, &a->y ); - field_mul ( &a->y, &L2, &L3 ); - field_sqr ( &L2, &L3 ); - field_mul ( &L3, &L0, &L2 ); - field_mul ( &L0, &a->x, &L3 ); - field_add ( &L2, &a->y, &a->y ); - field_mul ( &a->x, sz, &L2 ); - field_addw ( &L1, 1 ); - field_mul ( &a->y, &L1, &L3 ); - field_subw( &L0, 1 ); - return field_is_zero( &L0 ); + const field_a_t sz +) { + field_a_t L0, L1, L2, L3; + field_sqr ( L1, sz ); + field_copy ( L3, L1 ); + field_addw ( L3, 1 ); + field_sqr ( L2, L3 ); + field_mulw_scc ( a->x, L2, EDWARDS_D-1 ); /* PERF MULW */ + field_add ( L3, L1, L1 ); /* FIXME: i adjusted the bias here, was it right? */ + field_add ( a->y, L3, L3 ); + field_add ( L3, a->y, a->x ); + field_copy ( a->y, L1 ); + field_neg ( a->x, a->y ); + field_addw ( a->x, 1 ); + field_mul ( a->y, a->x, L3 ); + field_sqr ( L2, a->x ); + field_mul ( L0, L2, a->y ); + field_mul ( a->y, a->x, L0 ); + field_isr ( L3, a->y ); + field_mul ( a->y, L2, L3 ); + field_sqr ( L2, L3 ); + field_mul ( L3, L0, L2 ); + field_mul ( L0, a->x, L3 ); + field_add ( L2, a->y, a->y ); + field_mul ( a->x, sz, L2 ); + field_addw ( L1, 1 ); + field_mul ( a->y, L1, L3 ); + field_subw( L0, 1 ); + return field_is_zero( L0 ); } mask_t deserialize_and_twist_approx ( struct tw_extensible_t* a, - const struct field_t* sdm1, - const struct field_t* sz -) { - struct field_t L0, L1; - field_sqr ( &a->z, sz ); - field_copy ( &a->y, &a->z ); - field_addw ( &a->y, 1 ); - field_sqr ( &L0, &a->y ); - field_mulw_scc ( &a->x, &L0, EDWARDS_D-1 ); - field_add ( &a->y, &a->z, &a->z ); - field_add ( &a->u, &a->y, &a->y ); - field_add ( &a->y, &a->u, &a->x ); - field_sqr ( &a->x, &a->z ); - field_negx ( &a->u, &a->x ); - field_addw ( &a->u, 1 ); - field_mul ( &a->x, sdm1, &a->u ); - field_mul ( &L0, &a->x, &a->y ); - field_mul ( &a->t, &L0, &a->y ); - field_mul ( &a->u, &a->x, &a->t ); - field_mul ( &a->t, &a->u, &L0 ); - field_mul ( &a->y, &a->x, &a->t ); - field_isr ( &L0, &a->y ); - field_mul ( &a->y, &a->u, &L0 ); - field_sqr ( &L1, &L0 ); - field_mul ( &a->u, &a->t, &L1 ); - field_mul ( &a->t, &a->x, &a->u ); - field_add ( &a->x, sz, sz ); - field_mul ( &L0, &a->u, &a->x ); - field_copy ( &a->x, &a->z ); - field_negx ( &L1, &a->x ); - field_addw ( &L1, 1 ); - field_mul ( &a->x, &L1, &L0 ); - field_mul ( &L0, &a->u, &a->y ); - field_addw ( &a->z, 1 ); - field_mul ( &a->y, &a->z, &L0 ); - field_subw( &a->t, 1 ); - mask_t ret = field_is_zero( &a->t ); - field_set_ui( &a->z, 1 ); - field_copy ( &a->t, &a->x ); - field_copy ( &a->u, &a->y ); + const field_a_t sz +) { + field_a_t L0, L1; + field_sqr ( a->z, sz ); + field_copy ( a->y, a->z ); + field_addw ( a->y, 1 ); + field_sqr ( L0, a->y ); + field_mulw_scc ( a->x, L0, EDWARDS_D-1 ); + field_add ( a->y, a->z, a->z ); + field_add ( a->u, a->y, a->y ); + field_add ( a->y, a->u, a->x ); + field_sqr ( a->x, a->z ); + field_neg ( a->u, a->x ); + field_addw ( a->u, 1 ); + field_mul ( a->x, sqrt_d_minus_1, a->u ); + field_mul ( L0, a->x, a->y ); + field_mul ( a->t, L0, a->y ); + field_mul ( a->u, a->x, a->t ); + field_mul ( a->t, a->u, L0 ); + field_mul ( a->y, a->x, a->t ); + field_isr ( L0, a->y ); + field_mul ( a->y, a->u, L0 ); + field_sqr ( L1, L0 ); + field_mul ( a->u, a->t, L1 ); + field_mul ( a->t, a->x, a->u ); + field_add 
( a->x, sz, sz ); + field_mul ( L0, a->u, a->x ); + field_copy ( a->x, a->z ); + field_neg ( L1, a->x ); + field_addw ( L1, 1 ); + field_mul ( a->x, L1, L0 ); + field_mul ( L0, a->u, a->y ); + field_addw ( a->z, 1 ); + field_mul ( a->y, a->z, L0 ); + field_subw( a->t, 1 ); + mask_t ret = field_is_zero( a->t ); + field_set_ui( a->z, 1 ); + field_copy ( a->t, a->x ); + field_copy ( a->u, a->y ); return ret; } @@ -540,30 +539,30 @@ void set_identity_extensible ( struct extensible_t* a ) { - field_set_ui( &a->x, 0 ); - field_set_ui( &a->y, 1 ); - field_set_ui( &a->z, 1 ); - field_set_ui( &a->t, 0 ); - field_set_ui( &a->u, 0 ); + field_set_ui( a->x, 0 ); + field_set_ui( a->y, 1 ); + field_set_ui( a->z, 1 ); + field_set_ui( a->t, 0 ); + field_set_ui( a->u, 0 ); } void set_identity_tw_extensible ( struct tw_extensible_t* a ) { - field_set_ui( &a->x, 0 ); - field_set_ui( &a->y, 1 ); - field_set_ui( &a->z, 1 ); - field_set_ui( &a->t, 0 ); - field_set_ui( &a->u, 0 ); + field_set_ui( a->x, 0 ); + field_set_ui( a->y, 1 ); + field_set_ui( a->z, 1 ); + field_set_ui( a->t, 0 ); + field_set_ui( a->u, 0 ); } void set_identity_affine ( struct affine_t* a ) { - field_set_ui( &a->x, 0 ); - field_set_ui( &a->y, 1 ); + field_set_ui( a->x, 0 ); + field_set_ui( a->y, 1 ); } mask_t @@ -572,11 +571,11 @@ eq_affine ( const struct affine_t* b ) { mask_t L1, L2; - struct field_t L0; - field_sub ( &L0, &a->x, &b->x ); - L2 = field_is_zero( &L0 ); - field_sub ( &L0, &a->y, &b->y ); - L1 = field_is_zero( &L0 ); + field_a_t L0; + field_sub ( L0, a->x, b->x ); + L2 = field_is_zero( L0 ); + field_sub ( L0, a->y, b->y ); + L1 = field_is_zero( L0 ); return L2 & L1; } @@ -586,15 +585,15 @@ eq_extensible ( const struct extensible_t* b ) { mask_t L3, L4; - struct field_t L0, L1, L2; - field_mul ( &L2, &b->z, &a->x ); - field_mul ( &L1, &a->z, &b->x ); - field_sub ( &L0, &L2, &L1 ); - L4 = field_is_zero( &L0 ); - field_mul ( &L2, &b->z, &a->y ); - field_mul ( &L1, &a->z, &b->y ); - field_sub ( &L0, &L2, &L1 ); - L3 = field_is_zero( &L0 ); + field_a_t L0, L1, L2; + field_mul ( L2, b->z, a->x ); + field_mul ( L1, a->z, b->x ); + field_sub ( L0, L2, L1 ); + L4 = field_is_zero( L0 ); + field_mul ( L2, b->z, a->y ); + field_mul ( L1, a->z, b->y ); + field_sub ( L0, L2, L1 ); + L3 = field_is_zero( L0 ); return L4 & L3; } @@ -604,39 +603,39 @@ eq_tw_extensible ( const struct tw_extensible_t* b ) { mask_t L3, L4; - struct field_t L0, L1, L2; - field_mul ( &L2, &b->z, &a->x ); - field_mul ( &L1, &a->z, &b->x ); - field_sub ( &L0, &L2, &L1 ); - L4 = field_is_zero( &L0 ); - field_mul ( &L2, &b->z, &a->y ); - field_mul ( &L1, &a->z, &b->y ); - field_sub ( &L0, &L2, &L1 ); - L3 = field_is_zero( &L0 ); + field_a_t L0, L1, L2; + field_mul ( L2, b->z, a->x ); + field_mul ( L1, a->z, b->x ); + field_sub ( L0, L2, L1 ); + L4 = field_is_zero( L0 ); + field_mul ( L2, b->z, a->y ); + field_mul ( L1, a->z, b->y ); + field_sub ( L0, L2, L1 ); + L3 = field_is_zero( L0 ); return L4 & L3; } void elligator_2s_inject ( struct affine_t* a, - const struct field_t* r -) { - struct field_t L2, L3, L4, L5, L6, L7, L8; - field_sqr ( &a->x, r ); - field_sqr ( &L3, &a->x ); - field_copy ( &a->y, &L3 ); - field_negx ( &L4, &a->y ); - field_addw ( &L4, 1 ); - field_sqr ( &L2, &L4 ); - field_mulw ( &L7, &L2, (EDWARDS_D-1)*(EDWARDS_D-1) ); - field_mulw ( &L8, &L3, 4*(EDWARDS_D+1)*(EDWARDS_D+1) ); - field_add ( &a->y, &L8, &L7 ); - field_mulw ( &L8, &L2, 4*(EDWARDS_D)*(EDWARDS_D-1) ); - field_sub ( &L7, &a->y, &L8 ); - field_mulw_scc ( &L6, &a->y, -2-2*EDWARDS_D ); - 
field_mul ( &L5, &L7, &L6 ); + const field_a_t r +) { + field_a_t L2, L3, L4, L5, L6, L7, L8; + field_sqr ( a->x, r ); + field_sqr ( L3, a->x ); + field_copy ( a->y, L3 ); + field_neg ( L4, a->y ); + field_addw ( L4, 1 ); + field_sqr ( L2, L4 ); + field_mulw ( L7, L2, (EDWARDS_D-1)*(EDWARDS_D-1) ); + field_mulw ( L8, L3, 4*(EDWARDS_D+1)*(EDWARDS_D+1) ); + field_add ( a->y, L8, L7 ); + field_mulw ( L8, L2, 4*(EDWARDS_D)*(EDWARDS_D-1) ); + field_sub ( L7, a->y, L8 ); + field_mulw_scc ( L6, a->y, -2-2*EDWARDS_D ); + field_mul ( L5, L7, L6 ); /* FIXME Stability problem (API stability, not crash) / possible bug. - * change to: p448_mul ( &L5, &L7, &L4 ); ? + * change to: p448_mul ( L5, L7, L4 ); ? * This isn't a deep change: it's for sign adjustment. * Need to check which one leads to the correct sign, probably by writig * the invert routine. @@ -647,47 +646,47 @@ elligator_2s_inject ( * Could compute be, (be)^2, (be)^3, a b^3 e^3, a b^3 e^4. = 4M+S * instead of 6M. */ - field_mul ( &L8, &L5, &L4 ); - field_mul ( &L4, &L5, &L6 ); - field_mul ( &L5, &L7, &L8 ); - field_mul ( &L8, &L5, &L4 ); - field_mul ( &L4, &L7, &L8 ); - field_isr ( &L6, &L4 ); - field_mul ( &L4, &L5, &L6 ); - field_sqr ( &L5, &L6 ); - field_mul ( &L6, &L8, &L5 ); - field_mul ( &L8, &L7, &L6 ); - field_mul ( &L7, &L8, &L6 ); - field_copy ( &L6, &a->x ); - field_addw ( &a->x, 1 ); - field_mul ( &L5, &a->x, &L8 ); - field_addw ( &L5, 1 ); - field_sub ( &a->x, &L6, &L5 ); - field_mul ( &L5, &L4, &a->x ); - field_mulw_scc_wr ( &a->x, &L5, -2-2*EDWARDS_D ); - field_add ( &L4, &L3, &L3 ); - field_add ( &L3, &L4, &L2 ); - field_subw( &L3, 2 ); - field_mul ( &L2, &L3, &L8 ); - field_mulw ( &L3, &L2, 2*(EDWARDS_D+1)*(EDWARDS_D-1) ); - field_add ( &L2, &L3, &a->y ); - field_mul ( &a->y, &L7, &L2 ); - field_addw ( &a->y, -field_is_zero( &L8 ) ); + field_mul ( L8, L5, L4 ); + field_mul ( L4, L5, L6 ); + field_mul ( L5, L7, L8 ); + field_mul ( L8, L5, L4 ); + field_mul ( L4, L7, L8 ); + field_isr ( L6, L4 ); + field_mul ( L4, L5, L6 ); + field_sqr ( L5, L6 ); + field_mul ( L6, L8, L5 ); + field_mul ( L8, L7, L6 ); + field_mul ( L7, L8, L6 ); + field_copy ( L6, a->x ); + field_addw ( a->x, 1 ); + field_mul ( L5, a->x, L8 ); + field_addw ( L5, 1 ); + field_sub ( a->x, L6, L5 ); + field_mul ( L5, L4, a->x ); + field_mulw_scc_wr ( a->x, L5, -2-2*EDWARDS_D ); + field_add ( L4, L3, L3 ); + field_add ( L3, L4, L2 ); + field_subw( L3, 2 ); + field_mul ( L2, L3, L8 ); + field_mulw ( L3, L2, 2*(EDWARDS_D+1)*(EDWARDS_D-1) ); + field_add ( L2, L3, a->y ); + field_mul ( a->y, L7, L2 ); + field_addw ( a->y, -field_is_zero( L8 ) ); } mask_t validate_affine ( const struct affine_t* a ) { - struct field_t L0, L1, L2, L3; - field_sqr ( &L0, &a->y ); - field_sqr ( &L1, &a->x ); - field_add ( &L3, &L1, &L0 ); - field_mulw_scc ( &L2, &L1, EDWARDS_D ); - field_mul ( &L1, &L0, &L2 ); - field_addw ( &L1, 1 ); - field_sub ( &L0, &L3, &L1 ); - return field_is_zero( &L0 ); + field_a_t L0, L1, L2, L3; + field_sqr ( L0, a->y ); + field_sqr ( L1, a->x ); + field_add ( L3, L1, L0 ); + field_mulw_scc ( L2, L1, EDWARDS_D ); + field_mul ( L1, L0, L2 ); + field_addw ( L1, 1 ); + field_sub ( L0, L3, L1 ); + return field_is_zero( L0 ); } mask_t @@ -695,36 +694,36 @@ validate_tw_extensible ( const struct tw_extensible_t* ext ) { mask_t L4, L5; - struct field_t L0, L1, L2, L3; + field_a_t L0, L1, L2, L3; /* * Check invariant: * 0 = -x*y + z*t*u */ - field_mul ( &L1, &ext->t, &ext->u ); - field_mul ( &L2, &ext->z, &L1 ); - field_mul ( &L0, &ext->x, &ext->y ); - field_negx ( 
&L1, &L0 ); - field_add ( &L0, &L1, &L2 ); - L5 = field_is_zero( &L0 ); + field_mul ( L1, ext->t, ext->u ); + field_mul ( L2, ext->z, L1 ); + field_mul ( L0, ext->x, ext->y ); + field_neg ( L1, L0 ); + field_add ( L0, L1, L2 ); + L5 = field_is_zero( L0 ); /* * Check invariant: * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 */ - field_sqr ( &L2, &ext->y ); - field_negx ( &L1, &L2 ); - field_sqr ( &L0, &ext->x ); - field_add ( &L2, &L0, &L1 ); - field_sqr ( &L3, &ext->u ); - field_sqr ( &L0, &ext->t ); - field_mul ( &L1, &L0, &L3 ); - field_mulw_scc ( &L3, &L1, EDWARDS_D ); - field_add ( &L0, &L3, &L2 ); - field_negx ( &L3, &L1 ); - field_add ( &L2, &L3, &L0 ); - field_sqr ( &L1, &ext->z ); - field_add ( &L0, &L1, &L2 ); - L4 = field_is_zero( &L0 ); - return L5 & L4 &~ field_is_zero(&ext->z); + field_sqr ( L2, ext->y ); + field_neg ( L1, L2 ); + field_sqr ( L0, ext->x ); + field_add ( L2, L0, L1 ); + field_sqr ( L3, ext->u ); + field_sqr ( L0, ext->t ); + field_mul ( L1, L0, L3 ); + field_mulw_scc ( L3, L1, EDWARDS_D ); + field_add ( L0, L3, L2 ); + field_neg ( L3, L1 ); + field_add ( L2, L3, L0 ); + field_sqr ( L1, ext->z ); + field_add ( L0, L1, L2 ); + L4 = field_is_zero( L0 ); + return L5 & L4 &~ field_is_zero(ext->z); } mask_t @@ -732,33 +731,33 @@ validate_extensible ( const struct extensible_t* ext ) { mask_t L4, L5; - struct field_t L0, L1, L2, L3; + field_a_t L0, L1, L2, L3; /* * Check invariant: * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 */ - field_sqr ( &L2, &ext->y ); - field_negx ( &L1, &L2 ); - field_sqr ( &L0, &ext->z ); - field_add ( &L2, &L0, &L1 ); - field_sqr ( &L3, &ext->u ); - field_sqr ( &L0, &ext->t ); - field_mul ( &L1, &L0, &L3 ); - field_mulw_scc ( &L0, &L1, EDWARDS_D ); - field_add ( &L1, &L0, &L2 ); - field_sqr ( &L0, &ext->x ); - field_negx ( &L2, &L0 ); - field_add ( &L0, &L2, &L1 ); - L5 = field_is_zero( &L0 ); + field_sqr ( L2, ext->y ); + field_neg ( L1, L2 ); + field_sqr ( L0, ext->z ); + field_add ( L2, L0, L1 ); + field_sqr ( L3, ext->u ); + field_sqr ( L0, ext->t ); + field_mul ( L1, L0, L3 ); + field_mulw_scc ( L0, L1, EDWARDS_D ); + field_add ( L1, L0, L2 ); + field_sqr ( L0, ext->x ); + field_neg ( L2, L0 ); + field_add ( L0, L2, L1 ); + L5 = field_is_zero( L0 ); /* * Check invariant: * 0 = -x*y + z*t*u */ - field_mul ( &L1, &ext->t, &ext->u ); - field_mul ( &L2, &ext->z, &L1 ); - field_mul ( &L0, &ext->x, &ext->y ); - field_negx ( &L1, &L0 ); - field_add ( &L0, &L1, &L2 ); - L4 = field_is_zero( &L0 ); - return L5 & L4 &~ field_is_zero(&ext->z); + field_mul ( L1, ext->t, ext->u ); + field_mul ( L2, ext->z, L1 ); + field_mul ( L0, ext->x, ext->y ); + field_neg ( L1, L0 ); + field_add ( L0, L1, L2 ); + L4 = field_is_zero( L0 ); + return L5 & L4 &~ field_is_zero(ext->z); } diff --git a/src/goldilocks.c b/src/goldilocks.c index f86e1ab..7cba9c4 100644 --- a/src/goldilocks.c +++ b/src/goldilocks.c @@ -162,7 +162,7 @@ goldilocks_derive_private_key ( struct sha512_ctx_t ctx; struct tw_extensible_t exta; - struct field_t pk; + field_a_t pk; sha512_init(&ctx); sha512_update(&ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES); @@ -173,9 +173,9 @@ goldilocks_derive_private_key ( barrett_serialize(privkey->opaque, sk, GOLDI_FIELD_BYTES); scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); - untwist_and_double_and_serialize(&pk, &exta); + untwist_and_double_and_serialize(pk, &exta); - field_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], &pk); + field_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], pk); return GOLDI_EOK; } @@ -225,11 
+225,11 @@ goldilocks_private_to_public ( struct goldilocks_public_key_t *pubkey, const struct goldilocks_private_key_t *privkey ) { - struct field_t pk; - mask_t msucc = field_deserialize(&pk,&privkey->opaque[GOLDI_FIELD_BYTES]); + field_a_t pk; + mask_t msucc = field_deserialize(pk,&privkey->opaque[GOLDI_FIELD_BYTES]); if (msucc) { - field_serialize(pubkey->opaque, &pk); + field_serialize(pubkey->opaque, pk); return GOLDI_EOK; } else { return GOLDI_ECORRUPT; @@ -252,15 +252,15 @@ goldilocks_shared_secret_core ( assert(GOLDI_SHARED_SECRET_BYTES == SHA512_OUTPUT_BYTES); word_t sk[GOLDI_FIELD_WORDS]; - struct field_t pk; + field_a_t pk; - mask_t succ = field_deserialize(&pk,your_pubkey->opaque), msucc = -1; + mask_t succ = field_deserialize(pk,your_pubkey->opaque), msucc = -1; #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS - struct field_t sum, prod; - msucc &= field_deserialize(&sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]); - field_mul(&prod,&pk,&sum); - field_add(&sum,&pk,&sum); + field_a_t sum, prod; + msucc &= field_deserialize(sum,&my_privkey->opaque[GOLDI_FIELD_BYTES]); + field_mul(prod,pk,sum); + field_add(sum,pk,sum); #endif msucc &= barrett_deserialize(sk,my_privkey->opaque,&curve_prime_order); @@ -269,17 +269,17 @@ goldilocks_shared_secret_core ( if (pre) { struct tw_extensible_t tw; succ &= scalarmul_fixed_base(&tw, sk, GOLDI_SCALAR_BITS, &pre->table); - untwist_and_double_and_serialize(&pk, &tw); + untwist_and_double_and_serialize(pk, &tw); } else { - succ &= montgomery_ladder(&pk,&pk,sk,GOLDI_SCALAR_BITS,1); + succ &= montgomery_ladder(pk,pk,sk,GOLDI_SCALAR_BITS,1); } #else (void)pre; - succ &= montgomery_ladder(&pk,&pk,sk,GOLDI_SCALAR_BITS,1); + succ &= montgomery_ladder(pk,pk,sk,GOLDI_SCALAR_BITS,1); #endif - field_serialize(gxy,&pk); + field_serialize(gxy,pk); /* obliterate records of our failure by adjusting with obliteration key */ struct sha512_ctx_t ctx; @@ -300,9 +300,9 @@ goldilocks_shared_secret_core ( #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS /* stir in the sum and product of the pubkeys. 
*/ uint8_t a_pk[GOLDI_FIELD_BYTES]; - field_serialize(a_pk, &sum); + field_serialize(a_pk, sum); sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES); - field_serialize(a_pk, &prod); + field_serialize(a_pk, prod); sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES); #endif @@ -383,11 +383,11 @@ goldilocks_sign ( /* 4[nonce]G */ uint8_t signature_tmp[GOLDI_FIELD_BYTES]; struct tw_extensible_t exta; - struct field_t gsk; + field_a_t gsk; scalarmul_fixed_base(&exta, tk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); double_tw_extensible(&exta); - untwist_and_double_and_serialize(&gsk, &exta); - field_serialize(signature_tmp, &gsk); + untwist_and_double_and_serialize(gsk, &exta); + field_serialize(signature_tmp, gsk); word_t challenge[GOLDI_FIELD_WORDS]; goldilocks_derive_challenge ( @@ -437,10 +437,10 @@ goldilocks_verify ( return GOLDI_EUNINIT; } - struct field_t pk; + field_a_t pk; word_t s[GOLDI_FIELD_WORDS]; - mask_t succ = field_deserialize(&pk,pubkey->opaque); + mask_t succ = field_deserialize(pk,pubkey->opaque); if (!succ) return GOLDI_EINVAL; succ = barrett_deserialize(s, &signature[GOLDI_FIELD_BYTES], &curve_prime_order); @@ -449,14 +449,14 @@ goldilocks_verify ( word_t challenge[GOLDI_FIELD_WORDS]; goldilocks_derive_challenge(challenge, pubkey->opaque, signature, message, message_len); - struct field_t eph; + field_a_t eph; struct tw_extensible_t pk_text; /* deserialize [nonce]G */ - succ = field_deserialize(&eph, signature); + succ = field_deserialize(eph, signature); if (!succ) return GOLDI_EINVAL; - succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk); + succ = deserialize_and_twist_approx(&pk_text, pk); if (!succ) return GOLDI_EINVAL; linear_combo_var_fixed_vt( &pk_text, @@ -464,9 +464,9 @@ goldilocks_verify ( s, GOLDI_SCALAR_BITS, goldilocks_global.wnafs, WNAF_PRECMP_BITS ); - untwist_and_double_and_serialize( &pk, &pk_text ); + untwist_and_double_and_serialize( pk, &pk_text ); - succ = field_eq(&eph, &pk); + succ = field_eq(eph, pk); return succ ? 0 : GOLDI_EINVAL; } #endif @@ -485,14 +485,14 @@ goldilocks_precompute_public_key ( struct tw_extensible_t pk_text; - struct field_t pk; - mask_t succ = field_deserialize(&pk, pub->opaque); + field_a_t pk; + mask_t succ = field_deserialize(pk, pub->opaque); if (!succ) { free(precom); return NULL; } - succ = deserialize_and_twist_approx(&pk_text, &sqrt_d_minus_1, &pk); + succ = deserialize_and_twist_approx(&pk_text, pk); if (!succ) { free(precom); return NULL; @@ -538,11 +538,11 @@ goldilocks_verify_precomputed ( word_t challenge[GOLDI_FIELD_WORDS]; goldilocks_derive_challenge(challenge, pubkey->pub.opaque, signature, message, message_len); - struct field_t eph, pk; + field_a_t eph, pk; struct tw_extensible_t pk_text; /* deserialize [nonce]G */ - succ = field_deserialize(&eph, signature); + succ = field_deserialize(eph, signature); if (!succ) return GOLDI_EINVAL; succ = linear_combo_combs_vt ( @@ -552,9 +552,9 @@ goldilocks_verify_precomputed ( ); if (!succ) return GOLDI_EINVAL; - untwist_and_double_and_serialize( &pk, &pk_text ); + untwist_and_double_and_serialize( pk, &pk_text ); - succ = field_eq(&eph, &pk); + succ = field_eq(eph, pk); return succ ? 0 : GOLDI_EINVAL; } diff --git a/src/include/ec_point.h b/src/include/ec_point.h index 74bbe91..9d0f4f3 100644 --- a/src/include/ec_point.h +++ b/src/include/ec_point.h @@ -21,21 +21,21 @@ extern "C" { * Affine point on an Edwards curve. */ struct affine_t { - struct field_t x, y; + field_a_t x, y; }; /** * Affine point on a twisted Edwards curve. 
*/ struct tw_affine_t { - struct field_t x, y; + field_a_t x, y; }; /** * Montgomery buffer. */ struct montgomery_t { - struct field_t z0, xd, zd, xa, za; + field_a_t z0, xd, zd, xa, za; }; /** @@ -57,7 +57,7 @@ struct montgomery_t { * instead. */ struct extensible_t { - struct field_t x, y, z, t, u; + field_a_t x, y, z, t, u; }; /** @@ -65,7 +65,7 @@ struct extensible_t { * suitable for accumulators. */ struct tw_extensible_t { - struct field_t x, y, z, t, u; + field_a_t x, y, z, t, u; }; /** @@ -74,7 +74,7 @@ struct tw_extensible_t { * Good for mixed readdition; suitable for fixed tables. */ struct tw_niels_t { - struct field_t a, b, c; + field_a_t a, b, c; }; /** @@ -84,7 +84,7 @@ struct tw_niels_t { */ struct tw_pniels_t { struct tw_niels_t n; - struct field_t z; + field_a_t z; }; @@ -273,14 +273,14 @@ montgomery_step ( void deserialize_montgomery ( struct montgomery_t* a, - const struct field_t* sbz + const field_a_t sbz ); mask_t serialize_montgomery ( - struct field_t* b, + field_a_t b, const struct montgomery_t* a, - const struct field_t* sbz + const field_a_t sbz ); /** @@ -296,7 +296,7 @@ serialize_montgomery ( */ void serialize_extensible ( - struct field_t* b, + field_a_t b, const struct extensible_t* a ); @@ -305,7 +305,7 @@ serialize_extensible ( */ void untwist_and_double_and_serialize ( - struct field_t* b, + field_a_t b, const struct tw_extensible_t* a ); @@ -345,7 +345,7 @@ test_only_twist ( mask_t field_is_square ( - const struct field_t* x + const field_a_t x ); mask_t @@ -364,7 +364,7 @@ is_even_tw ( mask_t deserialize_affine ( struct affine_t* a, - const struct field_t* sz + const field_a_t sz ); /** @@ -377,8 +377,7 @@ deserialize_affine ( mask_t deserialize_and_twist_approx ( struct tw_extensible_t* a, - const struct field_t* sdm1, - const struct field_t* sz + const field_a_t sz ); void @@ -417,7 +416,7 @@ eq_tw_extensible ( void elligator_2s_inject ( struct affine_t* a, - const struct field_t* r + const field_a_t r ); mask_t @@ -454,8 +453,8 @@ cond_negate_tw_niels ( struct tw_niels_t *n, mask_t doNegate ) { - constant_time_cond_swap(&n->a, &n->b, sizeof(n->a), doNegate); - field_cond_neg(&n->c, doNegate); + constant_time_cond_swap(n->a, n->b, sizeof(n->a), doNegate); + field_cond_neg(n->c, doNegate); } /** @@ -475,8 +474,8 @@ copy_affine ( struct affine_t* a, const struct affine_t* ds ) { - field_copy ( &a->x, &ds->x ); - field_copy ( &a->y, &ds->y ); + field_copy ( a->x, ds->x ); + field_copy ( a->y, ds->y ); } void @@ -484,8 +483,8 @@ copy_tw_affine ( struct tw_affine_t* a, const struct tw_affine_t* ds ) { - field_copy ( &a->x, &ds->x ); - field_copy ( &a->y, &ds->y ); + field_copy ( a->x, ds->x ); + field_copy ( a->y, ds->y ); } void @@ -493,11 +492,11 @@ copy_montgomery ( struct montgomery_t* a, const struct montgomery_t* ds ) { - field_copy ( &a->z0, &ds->z0 ); - field_copy ( &a->xd, &ds->xd ); - field_copy ( &a->zd, &ds->zd ); - field_copy ( &a->xa, &ds->xa ); - field_copy ( &a->za, &ds->za ); + field_copy ( a->z0, ds->z0 ); + field_copy ( a->xd, ds->xd ); + field_copy ( a->zd, ds->zd ); + field_copy ( a->xa, ds->xa ); + field_copy ( a->za, ds->za ); } void @@ -505,11 +504,11 @@ copy_extensible ( struct extensible_t* a, const struct extensible_t* ds ) { - field_copy ( &a->x, &ds->x ); - field_copy ( &a->y, &ds->y ); - field_copy ( &a->z, &ds->z ); - field_copy ( &a->t, &ds->t ); - field_copy ( &a->u, &ds->u ); + field_copy ( a->x, ds->x ); + field_copy ( a->y, ds->y ); + field_copy ( a->z, ds->z ); + field_copy ( a->t, ds->t ); + field_copy ( a->u, ds->u ); 
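/*
 * A minimal sketch of the GMP-style element[1] idiom this patch is moving
 * to, and the reason the &s disappear from calls like the field_copy ones
 * above. The typedef is the one this patch adds to include/field.h; the
 * demo() function and its names are illustrative assumptions, not code
 * from the tree:
 *
 *     typedef struct field_t field_a_t[1];
 *
 *     static void demo ( field_a_t out, const field_a_t in ) {
 *         field_copy ( out, in );   // arrays decay to pointers: no & needed
 *     }
 *
 * A local "field_a_t t;" still occupies exactly one struct field_t on the
 * stack, but writing t in an argument list passes a pointer to that
 * struct, just as GMP's mpz_t does.
 */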
} void @@ -517,11 +516,11 @@ copy_tw_extensible ( struct tw_extensible_t* a, const struct tw_extensible_t* ds ) { - field_copy ( &a->x, &ds->x ); - field_copy ( &a->y, &ds->y ); - field_copy ( &a->z, &ds->z ); - field_copy ( &a->t, &ds->t ); - field_copy ( &a->u, &ds->u ); + field_copy ( a->x, ds->x ); + field_copy ( a->y, ds->y ); + field_copy ( a->z, ds->z ); + field_copy ( a->t, ds->t ); + field_copy ( a->u, ds->u ); } void @@ -529,9 +528,9 @@ copy_tw_niels ( struct tw_niels_t* a, const struct tw_niels_t* ds ) { - field_copy ( &a->a, &ds->a ); - field_copy ( &a->b, &ds->b ); - field_copy ( &a->c, &ds->c ); + field_copy ( a->a, ds->a ); + field_copy ( a->b, ds->b ); + field_copy ( a->c, ds->c ); } void @@ -540,7 +539,7 @@ copy_tw_pniels ( const struct tw_pniels_t* ds ) { copy_tw_niels( &a->n, &ds->n ); - field_copy ( &a->z, &ds->z ); + field_copy ( a->z, ds->z ); } #ifdef __cplusplus diff --git a/src/include/field.h b/src/include/field.h index d375c09..80e9b6f 100644 --- a/src/include/field.h +++ b/src/include/field.h @@ -14,6 +14,9 @@ #include "f_field.h" #include +typedef struct field_t field_a_t[1]; +#define field_a_restrict_t struct field_t *__restrict__ + #define is32 (GOLDI_BITS == 32 || FIELD_BITS != 448) #if (is32) #define IF32(s) (s) @@ -54,8 +57,8 @@ extern const uint8_t FIELD_MODULUS[FIELD_BYTES]; static inline void __attribute__((unused,always_inline)) field_copy ( - struct field_t *__restrict__ a, - const struct field_t *__restrict__ b + field_a_restrict_t a, + const field_a_restrict_t b ) { memcpy(a,b,sizeof(*a)); } @@ -70,8 +73,8 @@ field_copy ( */ void field_isr ( - struct field_t* a, - const struct field_t* x + field_a_t a, + const field_a_t x ); /** @@ -81,8 +84,8 @@ field_isr ( */ void field_simultaneous_invert ( - struct field_t *__restrict__ out, - const struct field_t *in, + field_a_t *__restrict__ out, + const field_a_t *in, unsigned int n ); @@ -93,8 +96,8 @@ field_simultaneous_invert ( */ void field_inverse ( - struct field_t* a, - const struct field_t* x + field_a_t a, + const field_a_t x ); /** @@ -102,8 +105,8 @@ field_inverse ( */ mask_t field_eq ( - const struct field_t *a, - const struct field_t *b + const field_a_t a, + const field_a_t b ); /** @@ -112,31 +115,31 @@ field_eq ( static __inline__ void __attribute__((unused,always_inline)) field_sqrn ( - field_t *__restrict__ y, - const field_t *x, + field_a_restrict_t y, + const field_a_t x, int n ) { - field_t tmp; + field_a_t tmp; assert(n>0); if (n&1) { field_sqr(y,x); n--; } else { - field_sqr(&tmp,x); - field_sqr(y,&tmp); + field_sqr(tmp,x); + field_sqr(y,tmp); n-=2; } for (; n; n-=2) { - field_sqr(&tmp,y); - field_sqr(y,&tmp); + field_sqr(tmp,y); + field_sqr(y,tmp); } } /* Multiply by signed curve constant */ static __inline__ void field_mulw_scc ( - struct field_t* __restrict__ out, - const struct field_t *a, + field_a_restrict_t out, + const field_a_t a, int64_t scc ) { if (scc >= 0) { @@ -151,8 +154,8 @@ field_mulw_scc ( /* Multiply by signed curve constant and weak reduce if biased */ static __inline__ void field_mulw_scc_wr ( - struct field_t* __restrict__ out, - const struct field_t *a, + field_a_restrict_t out, + const field_a_t a, int64_t scc ) { field_mulw_scc(out, a, scc); @@ -162,9 +165,9 @@ field_mulw_scc_wr ( static __inline__ void field_subx_RAW ( - struct field_t *d, - const struct field_t *a, - const struct field_t *b + field_a_t d, + const field_a_t a, + const field_a_t b ) { field_sub_RAW ( d, a, b ); field_bias( d, 2 ); @@ -173,9 +176,9 @@ field_subx_RAW ( static __inline__ void 
field_sub ( - struct field_t *d, - const struct field_t *a, - const struct field_t *b + field_a_t d, + const field_a_t a, + const field_a_t b ) { field_sub_RAW ( d, a, b ); field_bias( d, 2 ); @@ -184,9 +187,9 @@ field_sub ( static __inline__ void field_add ( - struct field_t *d, - const struct field_t *a, - const struct field_t *b + field_a_t d, + const field_a_t a, + const field_a_t b ) { field_add_RAW ( d, a, b ); field_weak_reduce ( d ); @@ -194,7 +197,7 @@ field_add ( static __inline__ void field_subw ( - struct field_t *d, + field_a_t d, word_t c ) { field_subw_RAW ( d, c ); @@ -203,9 +206,9 @@ field_subw ( } static __inline__ void -field_negx ( - struct field_t *d, - const struct field_t *a +field_neg ( + field_a_t d, + const field_a_t a ) { field_neg_RAW ( d, a ); field_bias( d, 2 ); @@ -218,12 +221,12 @@ field_negx ( static inline void __attribute__((unused,always_inline)) field_cond_neg ( - field_t *a, + field_a_t a, mask_t doNegate ) { - struct field_t negated; - field_negx(&negated, a); - constant_time_select(a, &negated, a, sizeof(negated), doNegate); + field_a_t negated; + field_neg(negated, a); + constant_time_select(a, negated, a, sizeof(negated), doNegate); } /** Require the warning annotation on raw routines */ diff --git a/src/include/magic.h b/src/include/magic.h index 4b8394d..1627a6b 100644 --- a/src/include/magic.h +++ b/src/include/magic.h @@ -45,7 +45,7 @@ /** * @brief sqrt(d-1), used for point formats and twisting. */ -extern const struct field_t sqrt_d_minus_1; +extern const field_a_t sqrt_d_minus_1; /** * @brief The base point for Goldilocks. diff --git a/src/include/scalarmul.h b/src/include/scalarmul.h index bd97cc9..ecb1782 100644 --- a/src/include/scalarmul.h +++ b/src/include/scalarmul.h @@ -90,8 +90,8 @@ struct fixed_base_table_t { */ mask_t montgomery_ladder ( - struct field_t *out, - const struct field_t *in, + field_a_t out, + const field_a_t in, const word_t *scalar, unsigned int nbits, unsigned int n_extra_doubles diff --git a/src/p448/f_arithmetic.c b/src/p448/f_arithmetic.c index 82f35b8..c9b87e5 100644 --- a/src/p448/f_arithmetic.c +++ b/src/p448/f_arithmetic.c @@ -12,32 +12,32 @@ void field_isr ( - struct field_t* a, - const struct field_t* x + field_a_t a, + const field_a_t x ) { - struct field_t L0, L1, L2; - field_sqr ( &L1, x ); - field_mul ( &L2, x, &L1 ); - field_sqr ( &L1, &L2 ); - field_mul ( &L2, x, &L1 ); - field_sqrn ( &L1, &L2, 3 ); - field_mul ( &L0, &L2, &L1 ); - field_sqrn ( &L1, &L0, 3 ); - field_mul ( &L0, &L2, &L1 ); - field_sqrn ( &L2, &L0, 9 ); - field_mul ( &L1, &L0, &L2 ); - field_sqr ( &L0, &L1 ); - field_mul ( &L2, x, &L0 ); - field_sqrn ( &L0, &L2, 18 ); - field_mul ( &L2, &L1, &L0 ); - field_sqrn ( &L0, &L2, 37 ); - field_mul ( &L1, &L2, &L0 ); - field_sqrn ( &L0, &L1, 37 ); - field_mul ( &L1, &L2, &L0 ); - field_sqrn ( &L0, &L1, 111 ); - field_mul ( &L2, &L1, &L0 ); - field_sqr ( &L0, &L2 ); - field_mul ( &L1, x, &L0 ); - field_sqrn ( &L0, &L1, 223 ); - field_mul ( a, &L2, &L0 ); + field_a_t L0, L1, L2; + field_sqr ( L1, x ); + field_mul ( L2, x, L1 ); + field_sqr ( L1, L2 ); + field_mul ( L2, x, L1 ); + field_sqrn ( L1, L2, 3 ); + field_mul ( L0, L2, L1 ); + field_sqrn ( L1, L0, 3 ); + field_mul ( L0, L2, L1 ); + field_sqrn ( L2, L0, 9 ); + field_mul ( L1, L0, L2 ); + field_sqr ( L0, L1 ); + field_mul ( L2, x, L0 ); + field_sqrn ( L0, L2, 18 ); + field_mul ( L2, L1, L0 ); + field_sqrn ( L0, L2, 37 ); + field_mul ( L1, L2, L0 ); + field_sqrn ( L0, L1, 37 ); + field_mul ( L1, L2, L0 ); + field_sqrn ( L0, L1, 111 ); + 
field_mul ( L2, L1, L0 ); + field_sqr ( L0, L2 ); + field_mul ( L1, x, L0 ); + field_sqrn ( L0, L1, 223 ); + field_mul ( a, L2, L0 ); } diff --git a/src/p448/magic.c b/src/p448/magic.c index b1e7ca5..20c5fa5 100644 --- a/src/p448/magic.c +++ b/src/p448/magic.c @@ -35,17 +35,17 @@ const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = { const struct affine_t goldilocks_base_point = { #ifdef USE_NEON_PERM - {{ 0xaed939f,0xc59d070,0xf0de840,0x5f065c3, 0xf4ba0c7,0xdf73324,0xc170033,0x3a6a26a, + {{{ 0xaed939f,0xc59d070,0xf0de840,0x5f065c3, 0xf4ba0c7,0xdf73324,0xc170033,0x3a6a26a, 0x4c63d96,0x4609845,0xf3932d9,0x1b4faff, 0x6147eaa,0xa2692ff,0x9cecfa9,0x297ea0e - }}, + }}}, #else - {{ U56LE(0xf0de840aed939f), U56LE(0xc170033f4ba0c7), + {{{ U56LE(0xf0de840aed939f), U56LE(0xc170033f4ba0c7), U56LE(0xf3932d94c63d96), U56LE(0x9cecfa96147eaa), U56LE(0x5f065c3c59d070), U56LE(0x3a6a26adf73324), U56LE(0x1b4faff4609845), U56LE(0x297ea0ea2692ff) - }}, + }}}, #endif - {{ 19 }} + {{{ 19 }}} }; static const word_t curve_prime_order_lo[(224+WORD_BITS-1)/WORD_BITS] = { @@ -61,8 +61,8 @@ const struct barrett_prime_t curve_prime_order = { curve_prime_order_lo }; -const struct field_t -sqrt_d_minus_1 = {{ +const field_a_t +sqrt_d_minus_1 = {{{ #ifdef USE_NEON_PERM 0x6749f46,0x24d9770,0xd2e2183,0xa49f7b4, 0xb4f0179,0x8c5f656,0x888db42,0xdcac462, @@ -78,4 +78,4 @@ sqrt_d_minus_1 = {{ U56LE(0x49443b8748734a), U56LE(0x12fec0c0b25b7a) #endif -}}; +}}}; diff --git a/src/p480/f_arithmetic.c b/src/p480/f_arithmetic.c index d616e42..bc8e657 100644 --- a/src/p480/f_arithmetic.c +++ b/src/p480/f_arithmetic.c @@ -12,32 +12,32 @@ void field_isr ( - struct field_t* a, - const struct field_t* x + field_a_t a, + const field_a_t x ) { - struct field_t L0, L1, L2, L3; - field_sqr ( &L2, x ); - field_mul ( &L1, x, &L2 ); - field_sqrn ( &L0, &L1, 2 ); - field_mul ( &L2, &L1, &L0 ); - field_sqrn ( &L0, &L2, 4 ); - field_mul ( &L1, &L2, &L0 ); - field_sqr ( &L0, &L1 ); - field_mul ( &L2, x, &L0 ); - field_sqrn ( &L0, &L2, 8 ); - field_mul ( &L2, &L1, &L0 ); - field_sqrn ( &L0, &L2, 17 ); - field_mul ( &L1, &L2, &L0 ); - field_sqrn ( &L0, &L1, 17 ); - field_mul ( &L1, &L2, &L0 ); - field_sqrn ( &L3, &L1, 17 ); - field_mul ( &L0, &L2, &L3 ); - field_sqrn ( &L2, &L0, 51 ); - field_mul ( &L0, &L1, &L2 ); - field_sqrn ( &L1, &L0, 119 ); - field_mul ( &L2, &L0, &L1 ); - field_sqr ( &L0, &L2 ); - field_mul ( &L1, x, &L0 ); - field_sqrn ( &L0, &L1, 239 ); - field_mul ( a, &L2, &L0 ); + field_a_t L0, L1, L2, L3; + field_sqr ( L2, x ); + field_mul ( L1, x, L2 ); + field_sqrn ( L0, L1, 2 ); + field_mul ( L2, L1, L0 ); + field_sqrn ( L0, L2, 4 ); + field_mul ( L1, L2, L0 ); + field_sqr ( L0, L1 ); + field_mul ( L2, x, L0 ); + field_sqrn ( L0, L2, 8 ); + field_mul ( L2, L1, L0 ); + field_sqrn ( L0, L2, 17 ); + field_mul ( L1, L2, L0 ); + field_sqrn ( L0, L1, 17 ); + field_mul ( L1, L2, L0 ); + field_sqrn ( L3, L1, 17 ); + field_mul ( L0, L2, L3 ); + field_sqrn ( L2, L0, 51 ); + field_mul ( L0, L1, L2 ); + field_sqrn ( L1, L0, 119 ); + field_mul ( L2, L0, L1 ); + field_sqr ( L0, L2 ); + field_mul ( L1, x, L0 ); + field_sqrn ( L0, L1, 239 ); + field_mul ( a, L2, L0 ); } diff --git a/src/p480/magic.c b/src/p480/magic.c index ee90a0a..8615071 100644 --- a/src/p480/magic.c +++ b/src/p480/magic.c @@ -36,7 +36,7 @@ const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = { }; const struct affine_t goldilocks_base_point = { - {{ + {{{ U60LE(0x849ff7f845c30d3), U60LE(0x7dda488553a4c5b), U60LE(0x1d3a2d9844831ea), @@ -45,8 +45,8 @@ const 
struct affine_t goldilocks_base_point = { U60LE(0xfc955e59aeefa65), U60LE(0x3ab247cd530013c), U60LE(0x7ca42af3d564280) - }}, - {{ 5 }} + }}}, + {{{ 5 }}} }; static const word_t curve_prime_order_lo[(240+WORD_BITS-1)/WORD_BITS] = { @@ -62,7 +62,7 @@ const struct barrett_prime_t curve_prime_order = { curve_prime_order_lo }; -const struct field_t -sqrt_d_minus_1 = {{ +const field_a_t +sqrt_d_minus_1 = {{{ 232 /* Whoa, it comes out even. */ -}}; +}}}; diff --git a/src/p521/f_arithmetic.c b/src/p521/f_arithmetic.c index 7fbdfb8..37c0b50 100644 --- a/src/p521/f_arithmetic.c +++ b/src/p521/f_arithmetic.c @@ -12,32 +12,32 @@ void field_isr ( - struct field_t* a, - const struct field_t* x + field_a_t a, + const field_a_t x ) { - struct field_t L0, L1, L2; - field_sqr ( &L1, x ); - field_mul ( &L0, x, &L1 ); - field_sqrn ( &L2, &L0, 2 ); - field_mul ( &L1, &L0, &L2 ); - field_sqrn ( &L2, &L1, 4 ); - field_mul ( &L0, &L1, &L2 ); - field_sqrn ( &L2, &L0, 8 ); - field_mul ( &L1, &L0, &L2 ); - field_sqrn ( &L2, &L1, 16 ); - field_mul ( &L0, &L1, &L2 ); - field_sqrn ( &L2, &L0, 32 ); - field_mul ( &L1, &L0, &L2 ); - field_sqr ( &L2, &L1 ); - field_mul ( &L0, x, &L2 ); - field_sqrn ( &L2, &L0, 64 ); - field_mul ( &L0, &L1, &L2 ); - field_sqrn ( &L2, &L0, 129 ); - field_mul ( &L1, &L0, &L2 ); - field_sqr ( &L2, &L1 ); - field_mul ( &L0, x, &L2 ); - field_sqrn ( &L2, &L0, 259 ); - field_mul ( &L1, &L0, &L2 ); - field_sqr ( &L0, &L1 ); - field_mul ( a, x, &L0 ); + field_a_t L0, L1, L2; + field_sqr ( L1, x ); + field_mul ( L0, x, L1 ); + field_sqrn ( L2, L0, 2 ); + field_mul ( L1, L0, L2 ); + field_sqrn ( L2, L1, 4 ); + field_mul ( L0, L1, L2 ); + field_sqrn ( L2, L0, 8 ); + field_mul ( L1, L0, L2 ); + field_sqrn ( L2, L1, 16 ); + field_mul ( L0, L1, L2 ); + field_sqrn ( L2, L0, 32 ); + field_mul ( L1, L0, L2 ); + field_sqr ( L2, L1 ); + field_mul ( L0, x, L2 ); + field_sqrn ( L2, L0, 64 ); + field_mul ( L0, L1, L2 ); + field_sqrn ( L2, L0, 129 ); + field_mul ( L1, L0, L2 ); + field_sqr ( L2, L1 ); + field_mul ( L0, x, L2 ); + field_sqrn ( L2, L0, 259 ); + field_mul ( L1, L0, L2 ); + field_sqr ( L0, L1 ); + field_mul ( a, x, L0 ); } diff --git a/src/p521/magic.c b/src/p521/magic.c index 93ccc33..f8ab264 100644 --- a/src/p521/magic.c +++ b/src/p521/magic.c @@ -39,7 +39,7 @@ const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = { }; const struct affine_t goldilocks_base_point = { - {{ + {{{ #ifdef USE_P521_3x3_TRANSPOSE U58LE(0x02a940a2f19ba6c), U58LE(0x3331c90d2c6ba52), @@ -64,8 +64,8 @@ const struct affine_t goldilocks_base_point = { U58LE(0x06277e432c8a5ac), U58LE(0x0752cb45c48648b) #endif - }}, - {{ 12 }} + }}}, + {{{ 12 }}} }; static const word_t curve_prime_order_lo[(261+WORD_BITS-1)/WORD_BITS] = { @@ -82,8 +82,8 @@ const struct barrett_prime_t curve_prime_order = { curve_prime_order_lo }; -const struct field_t -sqrt_d_minus_1 = {{ +const field_a_t +sqrt_d_minus_1 = {{{ #ifdef USE_P521_3x3_TRANSPOSE U58LE(0x1e2be72c1c81990), U58LE(0x207dfc238a33e46), @@ -108,4 +108,4 @@ sqrt_d_minus_1 = {{ U58LE(0x0524b9e715937f5), U58LE(0x0a9ea3ac10d6aed) #endif -}}; +}}}; diff --git a/src/scalarmul.c b/src/scalarmul.c index b85a42c..cf95984 100644 --- a/src/scalarmul.c +++ b/src/scalarmul.c @@ -15,8 +15,8 @@ mask_t montgomery_ladder ( - struct field_t *out, - const struct field_t *in, + field_a_t out, + const field_a_t in, const word_t *scalar, unsigned int nbits, unsigned int n_extra_doubles @@ -30,15 +30,15 @@ montgomery_ladder ( word_t w = scalar[j]; for (i=n; i>=0; i--) { mask_t flip = -((w>>i)&1); - 
constant_time_cond_swap(&mont.xa,&mont.xd,sizeof(mont.xd),flip^pflip); - constant_time_cond_swap(&mont.za,&mont.zd,sizeof(mont.xd),flip^pflip); + constant_time_cond_swap(mont.xa,mont.xd,sizeof(mont.xd),flip^pflip); + constant_time_cond_swap(mont.za,mont.zd,sizeof(mont.xd),flip^pflip); montgomery_step(&mont); pflip = flip; } n = WORD_BITS-1; } - constant_time_cond_swap(&mont.xa,&mont.xd,sizeof(mont.xd),pflip); - constant_time_cond_swap(&mont.za,&mont.zd,sizeof(mont.xd),pflip); + constant_time_cond_swap(mont.xa,mont.xd,sizeof(mont.xd),pflip); + constant_time_cond_swap(mont.za,mont.zd,sizeof(mont.xd),pflip); assert(n_extra_doubles < INT_MAX); for (j=0; j<(int)n_extra_doubles; j++) { @@ -475,8 +475,8 @@ precompute_fixed_base ( struct tw_pniels_t pn_tmp; struct tw_pniels_t *doubles = (struct tw_pniels_t *) malloc_vector(sizeof(*doubles) * (t-1)); - struct field_t *zs = (struct field_t *) malloc_vector(sizeof(*zs) * (n<<(t-1))); - struct field_t *zis = (struct field_t *) malloc_vector(sizeof(*zis) * (n<<(t-1))); + field_a_t *zs = (field_a_t *) malloc_vector(sizeof(*zs) * (n<<(t-1))); + field_a_t *zis = (field_a_t *) malloc_vector(sizeof(*zis) * (n<<(t-1))); struct tw_niels_t *table = prealloc; if (prealloc) { @@ -562,7 +562,7 @@ precompute_fixed_base ( convert_tw_extensible_to_tw_pniels(&pn_tmp, &start); copy_tw_niels(&table[idx], &pn_tmp.n); - field_copy(&zs[idx], &pn_tmp.z); + field_copy(zs[idx], pn_tmp.z); if (j >= (1u<<(t-1)) - 1) break; int delta = (j+1) ^ ((j+1)>>1) ^ gray; @@ -584,22 +584,22 @@ precompute_fixed_base ( field_simultaneous_invert(zis, zs, n<<(t-1)); - field_t product; + field_a_t product; for (i=0; i 0) { @@ -659,32 +659,32 @@ precompute_fixed_base_wnaf ( add_tw_pniels_to_tw_extensible(&base, &tmp); convert_tw_extensible_to_tw_pniels(&tmp, &base); - field_copy(&zs[1], &tmp.z); + field_copy(zs[1], tmp.z); copy_tw_niels(&out[1], &tmp.n); for (i=2; i < 1<x); - field_print(" y", &a->y); - field_print(" z", &a->z); - field_inverse(&zi, &a->z); - field_mul(&scaled, &zi, &a->x); - field_print(" X", &scaled); - field_mul(&scaled, &zi, &a->y); - field_print(" Y", &scaled); + field_a_t zi, scaled; + field_print(" x", a->x); + field_print(" y", a->y); + field_print(" z", a->z); + field_inverse(zi, a->z); + field_mul(scaled, zi, a->x); + field_print(" X", scaled); + field_mul(scaled, zi, a->y); + field_print(" Y", scaled); printf("\n"); } @@ -165,10 +165,10 @@ add_double_test ( if (~succ) { printf(" Bases were:\n"); - field_print(" x1", &base1->x); - field_print(" y1", &base1->y); - field_print(" x2", &base2->x); - field_print(" y2", &base2->y); + field_print(" x1", base1->x); + field_print(" y1", base1->y); + field_print(" x2", base2->x); + field_print(" y2", base2->y); } return succ ? 0 : -1; @@ -211,18 +211,18 @@ single_twisting_test ( succ = 0; } /* FUTURE: quadness */ - field_t sera,serb; - untwist_and_double_and_serialize(&sera,&text); + field_a_t sera,serb; + untwist_and_double_and_serialize(sera,&text); copy_extensible(&tmpext,&exb); double_extensible(&tmpext); - serialize_extensible(&serb,&tmpext); + serialize_extensible(serb,&tmpext); /* check that their (doubled; FUTURE?) 
serializations are equal */ - if (~field_eq(&sera,&serb)) { + if (~field_eq(sera,serb)) { youfail(); printf(" Different serialization from twist + double ()\n"); - field_print(" t", &sera); - field_print(" b", &serb); + field_print(" t", sera); + field_print(" b", serb); succ = 0; } @@ -242,8 +242,8 @@ single_twisting_test ( if (~succ) { printf(" Base was:\n"); - field_print(" x", &base->x); - field_print(" y", &base->y); + field_print(" x", base->x); + field_print(" y", base->y); } @@ -252,7 +252,7 @@ single_twisting_test ( int test_pointops (void) { struct affine_t base, pbase; - struct field_t serf; + field_a_t serf; struct crandom_state_t crand; crandom_init_from_buffer(&crand, "test_pointops random initializer"); @@ -277,7 +277,7 @@ int test_pointops (void) { #endif /* TODO: we need a field generate, which can return random or pathological. */ - mask_t succ = field_deserialize(&serf, ser); + mask_t succ = field_deserialize(serf, ser); if (!succ) { youfail(); printf(" Unlikely: fail at field_deserialize\n"); @@ -287,7 +287,7 @@ int test_pointops (void) { if (i) { copy_affine(&pbase, &base); } - elligator_2s_inject(&base, &serf); + elligator_2s_inject(&base, serf); if (i) { ret = add_double_test(&base, &pbase); diff --git a/test/test_scalarmul.c b/test/test_scalarmul.c index 89db764..d21be13 100644 --- a/test/test_scalarmul.c +++ b/test/test_scalarmul.c @@ -12,19 +12,19 @@ /* 0 = succeed, 1 = inval, -1 = fail */ static int single_scalarmul_compatibility_test ( - const struct field_t *base, + const field_a_t base, const word_t *scalar, int nbits ) { struct tw_extensible_t text, work; - struct field_t mont, ct, vl, vt; + field_a_t mont, ct, vl, vt; int ret = 0, i; mask_t succ, succm; - succ = deserialize_and_twist_approx(&text, &sqrt_d_minus_1, base); + succ = deserialize_and_twist_approx(&text, base); - succm = montgomery_ladder(&mont,base,scalar,nbits,1); + succm = montgomery_ladder(mont,base,scalar,nbits,1); if (succ != succm) { youfail(); @@ -52,7 +52,7 @@ single_scalarmul_compatibility_test ( const int nparams = sizeof(params)/sizeof(params[0]); struct fixed_base_table_t fbt; const int nsizes = 6; - struct field_t fbout[nparams], wout[nsizes]; + field_a_t fbout[nparams], wout[nsizes]; memset(&fbt, 0, sizeof(fbt)); memset(&fbout, 0, sizeof(fbout)); memset(&wout, 0, sizeof(wout)); @@ -75,7 +75,7 @@ single_scalarmul_compatibility_test ( continue; } - untwist_and_double_and_serialize(&fbout[i], &work); + untwist_and_double_and_serialize(fbout[i], &work); } /* compute using precomp wNAF */ @@ -91,7 +91,7 @@ single_scalarmul_compatibility_test ( scalarmul_fixed_base_wnaf_vt(&work, scalar, nbits, pre, i); - untwist_and_double_and_serialize(&wout[i], &work); + untwist_and_double_and_serialize(wout[i], &work); } mask_t consistent = MASK_SUCCESS; @@ -100,31 +100,31 @@ single_scalarmul_compatibility_test ( /* window methods currently only work on FIELD_BITS bits. 
*/ copy_tw_extensible(&work, &text); scalarmul(&work, scalar); - untwist_and_double_and_serialize(&ct, &work); + untwist_and_double_and_serialize(ct, &work); copy_tw_extensible(&work, &text); scalarmul_vlook(&work, scalar); - untwist_and_double_and_serialize(&vl, &work); + untwist_and_double_and_serialize(vl, &work); copy_tw_extensible(&work, &text); scalarmul_vt(&work, scalar, nbits); - untwist_and_double_and_serialize(&vt, &work); + untwist_and_double_and_serialize(vt, &work); /* check consistency mont vs window */ - consistent &= field_eq(&mont, &ct); - consistent &= field_eq(&mont, &vl); - consistent &= field_eq(&mont, &vt); + consistent &= field_eq(mont, ct); + consistent &= field_eq(mont, vl); + consistent &= field_eq(mont, vt); } /* check consistency mont vs combs */ for (i=0; i Date: Thu, 22 Jan 2015 16:22:37 -0800 Subject: [PATCH 04/15] respace --- src/arithmetic.c | 20 +- src/ec_point.c | 768 +++++++++++++++++++++++------------------------ 2 files changed, 394 insertions(+), 394 deletions(-) diff --git a/src/arithmetic.c b/src/arithmetic.c index 89be5c4..dee62e7 100644 --- a/src/arithmetic.c +++ b/src/arithmetic.c @@ -9,7 +9,7 @@ */ #include "field.h" -#include "ec_point.h" // TODO +#include "ec_point.h" mask_t field_eq ( @@ -32,10 +32,10 @@ field_inverse ( const field_a_t x ) { field_a_t L0, L1; - field_isr ( L0, x ); - field_sqr ( L1, L0 ); - field_sqr ( L0, L1 ); - field_mul ( a, x, L0 ); + field_isr ( L0, x ); + field_sqr ( L1, L0 ); + field_sqr ( L0, L1 ); + field_mul ( a, x, L0 ); } mask_t @@ -43,11 +43,11 @@ field_is_square ( const field_a_t x ) { field_a_t L0, L1; - field_isr ( L0, x ); - field_sqr ( L1, L0 ); - field_mul ( L0, x, L1 ); - field_subw( L0, 1 ); - return field_is_zero( L0 ) | field_is_zero( x ); + field_isr ( L0, x ); + field_sqr ( L1, L0 ); + field_mul ( L0, x, L1 ); + field_subw( L0, 1 ); + return field_is_zero( L0 ) | field_is_zero( x ); } void diff --git a/src/ec_point.c b/src/ec_point.c index 905ba60..e78852b 100644 --- a/src/ec_point.c +++ b/src/ec_point.c @@ -19,19 +19,19 @@ add_tw_niels_to_tw_extensible ( ) { ANALYZE_THIS_ROUTINE_CAREFULLY; field_a_t L0, L1; - field_sub ( L1, d->y, d->x ); - field_mul ( L0, e->a, L1 ); - field_add_nr ( L1, d->x, d->y ); - field_mul ( d->y, e->b, L1 ); - field_mul ( L1, d->u, d->t ); - field_mul ( d->x, e->c, L1 ); - field_add_nr ( d->u, L0, d->y ); - field_subx_nr ( d->t, d->y, L0 ); + field_sub ( L1, d->y, d->x ); + field_mul ( L0, e->a, L1 ); + field_add_nr ( L1, d->x, d->y ); + field_mul ( d->y, e->b, L1 ); + field_mul ( L1, d->u, d->t ); + field_mul ( d->x, e->c, L1 ); + field_add_nr ( d->u, L0, d->y ); + field_subx_nr ( d->t, d->y, L0 ); field_subx_nr ( d->y, d->z, d->x ); - field_add_nr ( L0, d->x, d->z ); - field_mul ( d->z, L0, d->y ); - field_mul ( d->x, d->y, d->t ); - field_mul ( d->y, L0, d->u ); + field_add_nr ( L0, d->x, d->z ); + field_mul ( d->z, L0, d->y ); + field_mul ( d->x, d->y, d->t ); + field_mul ( d->y, L0, d->u ); } void @@ -41,19 +41,19 @@ sub_tw_niels_from_tw_extensible ( ) { ANALYZE_THIS_ROUTINE_CAREFULLY; field_a_t L0, L1; - field_subx_nr ( L1, d->y, d->x ); - field_mul ( L0, e->b, L1 ); - field_add_nr ( L1, d->x, d->y ); - field_mul ( d->y, e->a, L1 ); - field_mul ( L1, d->u, d->t ); - field_mul ( d->x, e->c, L1 ); - field_add_nr ( d->u, L0, d->y ); - field_subx_nr ( d->t, d->y, L0 ); - field_add_nr ( d->y, d->x, d->z ); - field_subx_nr ( L0, d->z, d->x ); - field_mul ( d->z, L0, d->y ); - field_mul ( d->x, d->y, d->t ); - field_mul ( d->y, L0, d->u ); + field_subx_nr ( L1, d->y, d->x ); 
+ field_mul ( L0, e->b, L1 ); + field_add_nr ( L1, d->x, d->y ); + field_mul ( d->y, e->a, L1 ); + field_mul ( L1, d->u, d->t ); + field_mul ( d->x, e->c, L1 ); + field_add_nr ( d->u, L0, d->y ); + field_subx_nr ( d->t, d->y, L0 ); + field_add_nr ( d->y, d->x, d->z ); + field_subx_nr ( L0, d->z, d->x ); + field_mul ( d->z, L0, d->y ); + field_mul ( d->x, d->y, d->t ); + field_mul ( d->y, L0, d->u ); } void @@ -62,9 +62,9 @@ add_tw_pniels_to_tw_extensible ( const struct tw_pniels_t* a ) { field_a_t L0; - field_mul ( L0, e->z, a->z ); - field_copy ( e->z, L0 ); - add_tw_niels_to_tw_extensible( e, &a->n ); + field_mul ( L0, e->z, a->z ); + field_copy ( e->z, L0 ); + add_tw_niels_to_tw_extensible( e, &a->n ); } void @@ -73,9 +73,9 @@ sub_tw_pniels_from_tw_extensible ( const struct tw_pniels_t* a ) { field_a_t L0; - field_mul ( L0, e->z, a->z ); - field_copy ( e->z, L0 ); - sub_tw_niels_from_tw_extensible( e, &a->n ); + field_mul ( L0, e->z, a->z ); + field_copy ( e->z, L0 ); + sub_tw_niels_from_tw_extensible( e, &a->n ); } void @@ -84,23 +84,23 @@ double_tw_extensible ( ) { ANALYZE_THIS_ROUTINE_CAREFULLY; field_a_t L0, L1, L2; - field_sqr ( L2, a->x ); - field_sqr ( L0, a->y ); - field_add_nr ( a->u, L2, L0 ); - field_add_nr ( a->t, a->y, a->x ); - field_sqr ( L1, a->t ); - field_sub_nr ( a->t, L1, a->u ); - field_bias ( a->t, 3 ); + field_sqr ( L2, a->x ); + field_sqr ( L0, a->y ); + field_add_nr ( a->u, L2, L0 ); + field_add_nr ( a->t, a->y, a->x ); + field_sqr ( L1, a->t ); + field_sub_nr ( a->t, L1, a->u ); + field_bias ( a->t, 3 ); IF32( field_weak_reduce( a->t ) ); - field_subx_nr ( L1, L0, L2 ); - field_sqr ( a->x, a->z ); - field_bias ( a->x, 2-is32 /*is32 ? 1 : 2*/ ); - field_add_nr ( a->z, a->x, a->x ); - field_sub_nr ( L0, a->z, L1 ); - IF32( field_weak_reduce( L0 ) ); - field_mul ( a->z, L1, L0 ); - field_mul ( a->x, L0, a->t ); - field_mul ( a->y, L1, a->u ); + field_subx_nr ( L1, L0, L2 ); + field_sqr ( a->x, a->z ); + field_bias ( a->x, 2-is32 /*is32 ? 
1 : 2*/ ); + field_add_nr ( a->z, a->x, a->x ); + field_sub_nr ( L0, a->z, L1 ); + IF32( field_weak_reduce( L0 ) ); + field_mul ( a->z, L1, L0 ); + field_mul ( a->x, L0, a->t ); + field_mul ( a->y, L1, a->u ); } void @@ -109,23 +109,23 @@ double_extensible ( ) { ANALYZE_THIS_ROUTINE_CAREFULLY; field_a_t L0, L1, L2; - field_sqr ( L2, a->x ); - field_sqr ( L0, a->y ); - field_add_nr ( L1, L2, L0 ); - field_add_nr ( a->t, a->y, a->x ); - field_sqr ( a->u, a->t ); - field_sub_nr ( a->t, a->u, L1 ); - field_bias ( a->t, 3 ); + field_sqr ( L2, a->x ); + field_sqr ( L0, a->y ); + field_add_nr ( L1, L2, L0 ); + field_add_nr ( a->t, a->y, a->x ); + field_sqr ( a->u, a->t ); + field_sub_nr ( a->t, a->u, L1 ); + field_bias ( a->t, 3 ); IF32( field_weak_reduce( a->t ) ); - field_subx_nr ( a->u, L0, L2 ); - field_sqr ( a->x, a->z ); - field_bias ( a->x, 2 ); - field_add_nr ( a->z, a->x, a->x ); - field_sub_nr ( L0, a->z, L1 ); - IF32( field_weak_reduce( L0 ) ); - field_mul ( a->z, L1, L0 ); - field_mul ( a->x, L0, a->t ); - field_mul ( a->y, L1, a->u ); + field_subx_nr ( a->u, L0, L2 ); + field_sqr ( a->x, a->z ); + field_bias ( a->x, 2 ); + field_add_nr ( a->z, a->x, a->x ); + field_sub_nr ( L0, a->z, L1 ); + IF32( field_weak_reduce( L0 ) ); + field_mul ( a->z, L1, L0 ); + field_mul ( a->x, L0, a->t ); + field_mul ( a->y, L1, a->u ); } void @@ -134,19 +134,19 @@ twist_and_double ( const struct extensible_t* a ) { field_a_t L0; - field_sqr ( b->x, a->x ); - field_sqr ( b->z, a->y ); + field_sqr ( b->x, a->x ); + field_sqr ( b->z, a->y ); field_add ( b->u, b->x, b->z ); field_add ( b->t, a->y, a->x ); - field_sqr ( L0, b->t ); - field_sub ( b->t, L0, b->u ); - field_sub ( L0, b->z, b->x ); - field_sqr ( b->x, a->z ); + field_sqr ( L0, b->t ); + field_sub ( b->t, L0, b->u ); + field_sub ( L0, b->z, b->x ); + field_sqr ( b->x, a->z ); field_add ( b->z, b->x, b->x ); field_sub ( b->y, b->z, b->u ); - field_mul ( b->z, L0, b->y ); - field_mul ( b->x, b->y, b->t ); - field_mul ( b->y, L0, b->u ); + field_mul ( b->z, L0, b->y ); + field_mul ( b->x, b->y, b->t ); + field_mul ( b->y, L0, b->u ); } void @@ -155,19 +155,19 @@ untwist_and_double ( const struct tw_extensible_t* a ) { field_a_t L0; - field_sqr ( b->x, a->x ); - field_sqr ( b->z, a->y ); - field_add ( L0, b->x, b->z ); + field_sqr ( b->x, a->x ); + field_sqr ( b->z, a->y ); + field_add ( L0, b->x, b->z ); field_add ( b->t, a->y, a->x ); - field_sqr ( b->u, b->t ); - field_sub ( b->t, b->u, L0 ); + field_sqr ( b->u, b->t ); + field_sub ( b->t, b->u, L0 ); field_sub ( b->u, b->z, b->x ); - field_sqr ( b->x, a->z ); + field_sqr ( b->x, a->z ); field_add ( b->z, b->x, b->x ); field_sub ( b->y, b->z, b->u ); - field_mul ( b->z, L0, b->y ); - field_mul ( b->x, b->y, b->t ); - field_mul ( b->y, L0, b->u ); + field_mul ( b->z, L0, b->y ); + field_mul ( b->x, b->y, b->t ); + field_mul ( b->y, L0, b->u ); } void @@ -177,9 +177,9 @@ convert_tw_affine_to_tw_pniels ( ) { field_sub ( b->n.a, a->y, a->x ); field_add ( b->n.b, a->x, a->y ); - field_mul ( b->z, a->y, a->x ); + field_mul ( b->z, a->y, a->x ); field_mulw_scc_wr ( b->n.c, b->z, 2*EDWARDS_D-2 ); - field_set_ui( b->z, 2 ); + field_set_ui( b->z, 2 ); } void @@ -189,7 +189,7 @@ convert_tw_affine_to_tw_extensible ( ) { field_copy ( b->x, a->x ); field_copy ( b->y, a->y ); - field_set_ui( b->z, 1 ); + field_set_ui( b->z, 1 ); field_copy ( b->t, a->x ); field_copy ( b->u, a->y ); } @@ -201,7 +201,7 @@ convert_affine_to_extensible ( ) { field_copy ( b->x, a->x ); field_copy ( b->y, a->y ); - field_set_ui( b->z, 
1 ); + field_set_ui( b->z, 1 ); field_copy ( b->t, a->x ); field_copy ( b->u, a->y ); } @@ -213,7 +213,7 @@ convert_tw_extensible_to_tw_pniels ( ) { field_sub ( b->n.a, a->y, a->x ); field_add ( b->n.b, a->x, a->y ); - field_mul ( b->z, a->u, a->t ); + field_mul ( b->z, a->u, a->t ); field_mulw_scc_wr ( b->n.c, b->z, 2*EDWARDS_D-2 ); field_add ( b->z, a->z, a->z ); } @@ -225,9 +225,9 @@ convert_tw_pniels_to_tw_extensible ( ) { field_add ( e->u, d->n.b, d->n.a ); field_sub ( e->t, d->n.b, d->n.a ); - field_mul ( e->x, d->z, e->t ); - field_mul ( e->y, d->z, e->u ); - field_sqr ( e->z, d->z ); + field_mul ( e->x, d->z, e->t ); + field_mul ( e->y, d->z, e->u ); + field_sqr ( e->z, d->z ); } void @@ -237,7 +237,7 @@ convert_tw_niels_to_tw_extensible ( ) { field_add ( e->y, d->b, d->a ); field_sub ( e->x, d->b, d->a ); - field_set_ui( e->z, 1 ); + field_set_ui( e->z, 1 ); field_copy ( e->t, e->x ); field_copy ( e->u, e->y ); } @@ -248,26 +248,26 @@ montgomery_step ( ) { ANALYZE_THIS_ROUTINE_CAREFULLY; field_a_t L0, L1; - field_add_nr ( L0, a->zd, a->xd ); - field_sub ( L1, a->xd, a->zd ); + field_add_nr ( L0, a->zd, a->xd ); + field_sub ( L1, a->xd, a->zd ); field_sub ( a->zd, a->xa, a->za ); - field_mul ( a->xd, L0, a->zd ); - field_add_nr ( a->zd, a->za, a->xa ); - field_mul ( a->za, L1, a->zd ); - field_add_nr ( a->xa, a->za, a->xd ); - field_sqr ( a->zd, a->xa ); - field_mul ( a->xa, a->z0, a->zd ); + field_mul ( a->xd, L0, a->zd ); + field_add_nr ( a->zd, a->za, a->xa ); + field_mul ( a->za, L1, a->zd ); + field_add_nr ( a->xa, a->za, a->xd ); + field_sqr ( a->zd, a->xa ); + field_mul ( a->xa, a->z0, a->zd ); field_sub ( a->zd, a->xd, a->za ); - field_sqr ( a->za, a->zd ); - field_sqr ( a->xd, L0 ); - field_sqr ( L0, L1 ); + field_sqr ( a->za, a->zd ); + field_sqr ( a->xd, L0 ); + field_sqr ( L0, L1 ); field_mulw_scc ( a->zd, a->xd, 1-EDWARDS_D ); /* FIXME PERF MULW */ - field_sub ( L1, a->xd, L0 ); - field_mul ( a->xd, L0, a->zd ); - field_sub_nr ( L0, a->zd, L1 ); - field_bias ( L0, 4 - 2*is32 /*is32 ? 2 : 4*/ ); - IF32( field_weak_reduce( L0 ) ); - field_mul ( a->zd, L0, L1 ); + field_sub ( L1, a->xd, L0 ); + field_mul ( a->xd, L0, a->zd ); + field_sub_nr ( L0, a->zd, L1 ); + field_bias ( L0, 4 - 2*is32 /*is32 ? 
2 : 4*/ ); + IF32( field_weak_reduce( L0 ) ); + field_mul ( a->zd, L0, L1 ); } void @@ -275,10 +275,10 @@ deserialize_montgomery ( struct montgomery_t* a, const field_a_t sbz ) { - field_sqr ( a->z0, sbz ); - field_set_ui( a->xd, 1 ); - field_set_ui( a->zd, 0 ); - field_set_ui( a->xa, 1 ); + field_sqr ( a->z0, sbz ); + field_set_ui( a->xd, 1 ); + field_set_ui( a->zd, 0 ); + field_set_ui( a->xa, 1 ); field_copy ( a->za, a->z0 ); } @@ -290,42 +290,42 @@ serialize_montgomery ( ) { mask_t L4, L5, L6; field_a_t L0, L1, L2, L3; - field_mul ( L3, a->z0, a->zd ); - field_sub ( L1, L3, a->xd ); - field_mul ( L3, a->za, L1 ); - field_mul ( L2, a->z0, a->xd ); - field_sub ( L1, L2, a->zd ); - field_mul ( L0, a->xa, L1 ); - field_add ( L2, L0, L3 ); - field_sub ( L1, L3, L0 ); - field_mul ( L3, L1, L2 ); - field_copy ( L2, a->z0 ); - field_addw ( L2, 1 ); - field_sqr ( L0, L2 ); - field_mulw_scc_wr ( L1, L0, EDWARDS_D-1 ); - field_add ( L2, a->z0, a->z0 ); - field_add ( L0, L2, L2 ); - field_add ( L2, L0, L1 ); - field_mul ( L0, a->xd, L2 ); - L5 = field_is_zero( a->zd ); - L6 = - L5; - constant_time_mask ( L1, L0, sizeof(L1), L5 ); - field_add ( L2, L1, a->zd ); - L4 = ~ L5; - field_mul ( L1, sbz, L3 ); - field_addw ( L1, L6 ); - field_mul ( L3, L2, L1 ); - field_mul ( L1, L3, L2 ); - field_mul ( L2, L3, a->xd ); - field_mul ( L3, L1, L2 ); - field_isr ( L0, L3 ); - field_mul ( L2, L1, L0 ); - field_sqr ( L1, L0 ); - field_mul ( L0, L3, L1 ); - constant_time_mask ( b, L2, sizeof(L1), L4 ); - field_subw( L0, 1 ); - L5 = field_is_zero( L0 ); - L4 = field_is_zero( sbz ); + field_mul ( L3, a->z0, a->zd ); + field_sub ( L1, L3, a->xd ); + field_mul ( L3, a->za, L1 ); + field_mul ( L2, a->z0, a->xd ); + field_sub ( L1, L2, a->zd ); + field_mul ( L0, a->xa, L1 ); + field_add ( L2, L0, L3 ); + field_sub ( L1, L3, L0 ); + field_mul ( L3, L1, L2 ); + field_copy ( L2, a->z0 ); + field_addw ( L2, 1 ); + field_sqr ( L0, L2 ); + field_mulw_scc_wr ( L1, L0, EDWARDS_D-1 ); + field_add ( L2, a->z0, a->z0 ); + field_add ( L0, L2, L2 ); + field_add ( L2, L0, L1 ); + field_mul ( L0, a->xd, L2 ); + L5 = field_is_zero( a->zd ); + L6 = - L5; + constant_time_mask ( L1, L0, sizeof(L1), L5 ); + field_add ( L2, L1, a->zd ); + L4 = ~ L5; + field_mul ( L1, sbz, L3 ); + field_addw ( L1, L6 ); + field_mul ( L3, L2, L1 ); + field_mul ( L1, L3, L2 ); + field_mul ( L2, L3, a->xd ); + field_mul ( L3, L1, L2 ); + field_isr ( L0, L3 ); + field_mul ( L2, L1, L0 ); + field_sqr ( L1, L0 ); + field_mul ( L0, L3, L1 ); + constant_time_mask ( b, L2, sizeof(L1), L4 ); + field_subw( L0, 1 ); + L5 = field_is_zero( L0 ); + L4 = field_is_zero( sbz ); return L5 | L4; } @@ -335,17 +335,17 @@ serialize_extensible ( const struct extensible_t* a ) { field_a_t L0, L1, L2; - field_sub ( L0, a->y, a->z ); - field_add ( b, a->z, a->y ); - field_mul ( L1, a->z, a->x ); - field_mul ( L2, L0, L1 ); - field_mul ( L1, L2, L0 ); - field_mul ( L0, L2, b ); - field_mul ( L2, L1, L0 ); - field_isr ( L0, L2 ); - field_mul ( b, L1, L0 ); - field_sqr ( L1, L0 ); - field_mul ( L0, L2, L1 ); + field_sub ( L0, a->y, a->z ); + field_add ( b, a->z, a->y ); + field_mul ( L1, a->z, a->x ); + field_mul ( L2, L0, L1 ); + field_mul ( L1, L2, L0 ); + field_mul ( L0, L2, b ); + field_mul ( L2, L1, L0 ); + field_isr ( L0, L2 ); + field_mul ( b, L1, L0 ); + field_sqr ( L1, L0 ); + field_mul ( L0, L2, L1 ); } void @@ -354,23 +354,23 @@ untwist_and_double_and_serialize ( const struct tw_extensible_t* a ) { field_a_t L0, L1, L2, L3; - field_mul ( L3, a->y, a->x ); - field_add ( b, 
a->y, a->x ); - field_sqr ( L1, b ); - field_add ( L2, L3, L3 ); - field_sub ( b, L1, L2 ); - field_sqr ( L2, a->z ); - field_sqr ( L1, L2 ); - field_add ( b, b, b ); - field_mulw_scc ( L2, b, EDWARDS_D-1 ); - field_mulw_scc ( b, L2, EDWARDS_D-1 ); - field_mul ( L0, L2, L1 ); - field_mul ( L2, b, L0 ); - field_isr ( L0, L2 ); - field_mul ( L1, b, L0 ); - field_sqr ( b, L0 ); - field_mul ( L0, L2, b ); - field_mul ( b, L1, L3 ); + field_mul ( L3, a->y, a->x ); + field_add ( b, a->y, a->x ); + field_sqr ( L1, b ); + field_add ( L2, L3, L3 ); + field_sub ( b, L1, L2 ); + field_sqr ( L2, a->z ); + field_sqr ( L1, L2 ); + field_add ( b, b, b ); + field_mulw_scc ( L2, b, EDWARDS_D-1 ); + field_mulw_scc ( b, L2, EDWARDS_D-1 ); + field_mul ( L0, L2, L1 ); + field_mul ( L2, b, L0 ); + field_isr ( L0, L2 ); + field_mul ( L1, b, L0 ); + field_sqr ( b, L0 ); + field_mul ( L0, L2, b ); + field_mul ( b, L1, L3 ); } void @@ -378,23 +378,23 @@ twist_even ( struct tw_extensible_t* b, const struct extensible_t* a ) { - field_sqr ( b->y, a->z ); - field_sqr ( b->z, a->x ); + field_sqr ( b->y, a->z ); + field_sqr ( b->z, a->x ); field_sub ( b->u, b->y, b->z ); field_sub ( b->z, a->z, a->x ); - field_mul ( b->y, b->z, a->y ); + field_mul ( b->y, b->z, a->y ); field_sub ( b->z, a->z, a->y ); - field_mul ( b->x, b->z, b->y ); - field_mul ( b->t, b->x, b->u ); - field_mul ( b->y, b->x, b->t ); - field_isr ( b->t, b->y ); - field_mul ( b->u, b->x, b->t ); - field_sqr ( b->x, b->t ); - field_mul ( b->t, b->y, b->x ); - field_mul ( b->x, a->x, b->u ); - field_mul ( b->y, a->y, b->u ); - field_addw ( b->y, -field_is_zero( b->z ) ); - field_set_ui( b->z, 1 ); + field_mul ( b->x, b->z, b->y ); + field_mul ( b->t, b->x, b->u ); + field_mul ( b->y, b->x, b->t ); + field_isr ( b->t, b->y ); + field_mul ( b->u, b->x, b->t ); + field_sqr ( b->x, b->t ); + field_mul ( b->t, b->y, b->x ); + field_mul ( b->x, a->x, b->u ); + field_mul ( b->y, a->y, b->u ); + field_addw ( b->y, -field_is_zero( b->z ) ); + field_set_ui( b->z, 1 ); field_copy ( b->t, b->x ); field_copy ( b->u, b->y ); } @@ -405,29 +405,29 @@ test_only_twist ( const struct extensible_t* a ) { field_a_t L0, L1; - field_sqr ( b->u, a->z ); - field_sqr ( b->y, a->x ); + field_sqr ( b->u, a->z ); + field_sqr ( b->y, a->x ); field_sub ( b->z, b->u, b->y ); field_add ( b->y, b->z, b->z ); field_add ( b->u, b->y, b->y ); field_sub ( b->y, a->z, a->x ); - field_mul ( b->x, b->y, a->y ); + field_mul ( b->x, b->y, a->y ); field_sub ( b->z, a->z, a->y ); - field_mul ( b->t, b->z, b->x ); - field_mul ( L1, b->t, b->u ); - field_mul ( b->x, b->t, L1 ); - field_isr ( L0, b->x ); - field_mul ( b->u, b->t, L0 ); - field_sqr ( L1, L0 ); - field_mul ( b->t, b->x, L1 ); - field_add ( L1, a->y, a->x ); - field_sub ( L0, a->x, a->y ); - field_mul ( b->x, b->t, L0 ); - field_add ( L0, b->x, L1 ); - field_sub ( b->t, L1, b->x ); - field_mul ( b->x, L0, b->u ); + field_mul ( b->t, b->z, b->x ); + field_mul ( L1, b->t, b->u ); + field_mul ( b->x, b->t, L1 ); + field_isr ( L0, b->x ); + field_mul ( b->u, b->t, L0 ); + field_sqr ( L1, L0 ); + field_mul ( b->t, b->x, L1 ); + field_add ( L1, a->y, a->x ); + field_sub ( L0, a->x, a->y ); + field_mul ( b->x, b->t, L0 ); + field_add ( L0, b->x, L1 ); + field_sub ( b->t, L1, b->x ); + field_mul ( b->x, L0, b->u ); field_addw ( b->x, -field_is_zero( b->y ) ); - field_mul ( b->y, b->t, b->u ); + field_mul ( b->y, b->t, b->u ); field_addw ( b->y, -field_is_zero( b->z ) ); field_set_ui( b->z, 1+field_is_zero( a->y ) ); field_copy ( b->t, b->x ); @@ 
-439,10 +439,10 @@ is_even_pt ( const struct extensible_t* a ) { field_a_t L0, L1, L2; - field_sqr ( L2, a->z ); - field_sqr ( L1, a->x ); - field_sub ( L0, L2, L1 ); - return field_is_square ( L0 ); + field_sqr ( L2, a->z ); + field_sqr ( L1, a->x ); + field_sub ( L0, L2, L1 ); + return field_is_square ( L0 ); } mask_t @@ -450,10 +450,10 @@ is_even_tw ( const struct tw_extensible_t* a ) { field_a_t L0, L1, L2; - field_sqr ( L2, a->z ); - field_sqr ( L1, a->x ); - field_add ( L0, L1, L2 ); - return field_is_square ( L0 ); + field_sqr ( L2, a->z ); + field_sqr ( L1, a->x ); + field_add ( L0, L1, L2 ); + return field_is_square ( L0 ); } mask_t @@ -462,32 +462,32 @@ deserialize_affine ( const field_a_t sz ) { field_a_t L0, L1, L2, L3; - field_sqr ( L1, sz ); - field_copy ( L3, L1 ); - field_addw ( L3, 1 ); - field_sqr ( L2, L3 ); - field_mulw_scc ( a->x, L2, EDWARDS_D-1 ); /* PERF MULW */ - field_add ( L3, L1, L1 ); /* FIXME: i adjusted the bias here, was it right? */ - field_add ( a->y, L3, L3 ); - field_add ( L3, a->y, a->x ); - field_copy ( a->y, L1 ); + field_sqr ( L1, sz ); + field_copy ( L3, L1 ); + field_addw ( L3, 1 ); + field_sqr ( L2, L3 ); + field_mulw_scc ( a->x, L2, EDWARDS_D-1 ); /* PERF MULW */ + field_add ( L3, L1, L1 ); /* FIXME: i adjusted the bias here, was it right? */ + field_add ( a->y, L3, L3 ); + field_add ( L3, a->y, a->x ); + field_copy ( a->y, L1 ); field_neg ( a->x, a->y ); - field_addw ( a->x, 1 ); - field_mul ( a->y, a->x, L3 ); - field_sqr ( L2, a->x ); - field_mul ( L0, L2, a->y ); - field_mul ( a->y, a->x, L0 ); - field_isr ( L3, a->y ); - field_mul ( a->y, L2, L3 ); - field_sqr ( L2, L3 ); - field_mul ( L3, L0, L2 ); - field_mul ( L0, a->x, L3 ); - field_add ( L2, a->y, a->y ); - field_mul ( a->x, sz, L2 ); - field_addw ( L1, 1 ); - field_mul ( a->y, L1, L3 ); - field_subw( L0, 1 ); - return field_is_zero( L0 ); + field_addw ( a->x, 1 ); + field_mul ( a->y, a->x, L3 ); + field_sqr ( L2, a->x ); + field_mul ( L0, L2, a->y ); + field_mul ( a->y, a->x, L0 ); + field_isr ( L3, a->y ); + field_mul ( a->y, L2, L3 ); + field_sqr ( L2, L3 ); + field_mul ( L3, L0, L2 ); + field_mul ( L0, a->x, L3 ); + field_add ( L2, a->y, a->y ); + field_mul ( a->x, sz, L2 ); + field_addw ( L1, 1 ); + field_mul ( a->y, L1, L3 ); + field_subw( L0, 1 ); + return field_is_zero( L0 ); } mask_t @@ -496,40 +496,40 @@ deserialize_and_twist_approx ( const field_a_t sz ) { field_a_t L0, L1; - field_sqr ( a->z, sz ); + field_sqr ( a->z, sz ); field_copy ( a->y, a->z ); - field_addw ( a->y, 1 ); - field_sqr ( L0, a->y ); + field_addw ( a->y, 1 ); + field_sqr ( L0, a->y ); field_mulw_scc ( a->x, L0, EDWARDS_D-1 ); field_add ( a->y, a->z, a->z ); field_add ( a->u, a->y, a->y ); field_add ( a->y, a->u, a->x ); - field_sqr ( a->x, a->z ); + field_sqr ( a->x, a->z ); field_neg ( a->u, a->x ); - field_addw ( a->u, 1 ); - field_mul ( a->x, sqrt_d_minus_1, a->u ); - field_mul ( L0, a->x, a->y ); - field_mul ( a->t, L0, a->y ); - field_mul ( a->u, a->x, a->t ); - field_mul ( a->t, a->u, L0 ); - field_mul ( a->y, a->x, a->t ); - field_isr ( L0, a->y ); - field_mul ( a->y, a->u, L0 ); - field_sqr ( L1, L0 ); - field_mul ( a->u, a->t, L1 ); - field_mul ( a->t, a->x, a->u ); - field_add ( a->x, sz, sz ); - field_mul ( L0, a->u, a->x ); + field_addw ( a->u, 1 ); + field_mul ( a->x, sqrt_d_minus_1, a->u ); + field_mul ( L0, a->x, a->y ); + field_mul ( a->t, L0, a->y ); + field_mul ( a->u, a->x, a->t ); + field_mul ( a->t, a->u, L0 ); + field_mul ( a->y, a->x, a->t ); + field_isr ( L0, a->y ); + field_mul ( 
a->y, a->u, L0 ); + field_sqr ( L1, L0 ); + field_mul ( a->u, a->t, L1 ); + field_mul ( a->t, a->x, a->u ); + field_add ( a->x, sz, sz ); + field_mul ( L0, a->u, a->x ); field_copy ( a->x, a->z ); - field_neg ( L1, a->x ); - field_addw ( L1, 1 ); - field_mul ( a->x, L1, L0 ); - field_mul ( L0, a->u, a->y ); - field_addw ( a->z, 1 ); - field_mul ( a->y, a->z, L0 ); - field_subw( a->t, 1 ); + field_neg ( L1, a->x ); + field_addw ( L1, 1 ); + field_mul ( a->x, L1, L0 ); + field_mul ( L0, a->u, a->y ); + field_addw ( a->z, 1 ); + field_mul ( a->y, a->z, L0 ); + field_subw( a->t, 1 ); mask_t ret = field_is_zero( a->t ); - field_set_ui( a->z, 1 ); + field_set_ui( a->z, 1 ); field_copy ( a->t, a->x ); field_copy ( a->u, a->y ); return ret; @@ -539,30 +539,30 @@ void set_identity_extensible ( struct extensible_t* a ) { - field_set_ui( a->x, 0 ); - field_set_ui( a->y, 1 ); - field_set_ui( a->z, 1 ); - field_set_ui( a->t, 0 ); - field_set_ui( a->u, 0 ); + field_set_ui( a->x, 0 ); + field_set_ui( a->y, 1 ); + field_set_ui( a->z, 1 ); + field_set_ui( a->t, 0 ); + field_set_ui( a->u, 0 ); } void set_identity_tw_extensible ( struct tw_extensible_t* a ) { - field_set_ui( a->x, 0 ); - field_set_ui( a->y, 1 ); - field_set_ui( a->z, 1 ); - field_set_ui( a->t, 0 ); - field_set_ui( a->u, 0 ); + field_set_ui( a->x, 0 ); + field_set_ui( a->y, 1 ); + field_set_ui( a->z, 1 ); + field_set_ui( a->t, 0 ); + field_set_ui( a->u, 0 ); } void set_identity_affine ( struct affine_t* a ) { - field_set_ui( a->x, 0 ); - field_set_ui( a->y, 1 ); + field_set_ui( a->x, 0 ); + field_set_ui( a->y, 1 ); } mask_t @@ -572,10 +572,10 @@ eq_affine ( ) { mask_t L1, L2; field_a_t L0; - field_sub ( L0, a->x, b->x ); - L2 = field_is_zero( L0 ); - field_sub ( L0, a->y, b->y ); - L1 = field_is_zero( L0 ); + field_sub ( L0, a->x, b->x ); + L2 = field_is_zero( L0 ); + field_sub ( L0, a->y, b->y ); + L1 = field_is_zero( L0 ); return L2 & L1; } @@ -586,14 +586,14 @@ eq_extensible ( ) { mask_t L3, L4; field_a_t L0, L1, L2; - field_mul ( L2, b->z, a->x ); - field_mul ( L1, a->z, b->x ); - field_sub ( L0, L2, L1 ); - L4 = field_is_zero( L0 ); - field_mul ( L2, b->z, a->y ); - field_mul ( L1, a->z, b->y ); - field_sub ( L0, L2, L1 ); - L3 = field_is_zero( L0 ); + field_mul ( L2, b->z, a->x ); + field_mul ( L1, a->z, b->x ); + field_sub ( L0, L2, L1 ); + L4 = field_is_zero( L0 ); + field_mul ( L2, b->z, a->y ); + field_mul ( L1, a->z, b->y ); + field_sub ( L0, L2, L1 ); + L3 = field_is_zero( L0 ); return L4 & L3; } @@ -604,14 +604,14 @@ eq_tw_extensible ( ) { mask_t L3, L4; field_a_t L0, L1, L2; - field_mul ( L2, b->z, a->x ); - field_mul ( L1, a->z, b->x ); - field_sub ( L0, L2, L1 ); - L4 = field_is_zero( L0 ); - field_mul ( L2, b->z, a->y ); - field_mul ( L1, a->z, b->y ); - field_sub ( L0, L2, L1 ); - L3 = field_is_zero( L0 ); + field_mul ( L2, b->z, a->x ); + field_mul ( L1, a->z, b->x ); + field_sub ( L0, L2, L1 ); + L4 = field_is_zero( L0 ); + field_mul ( L2, b->z, a->y ); + field_mul ( L1, a->z, b->y ); + field_sub ( L0, L2, L1 ); + L3 = field_is_zero( L0 ); return L4 & L3; } @@ -621,21 +621,21 @@ elligator_2s_inject ( const field_a_t r ) { field_a_t L2, L3, L4, L5, L6, L7, L8; - field_sqr ( a->x, r ); - field_sqr ( L3, a->x ); - field_copy ( a->y, L3 ); - field_neg ( L4, a->y ); - field_addw ( L4, 1 ); - field_sqr ( L2, L4 ); - field_mulw ( L7, L2, (EDWARDS_D-1)*(EDWARDS_D-1) ); - field_mulw ( L8, L3, 4*(EDWARDS_D+1)*(EDWARDS_D+1) ); - field_add ( a->y, L8, L7 ); - field_mulw ( L8, L2, 4*(EDWARDS_D)*(EDWARDS_D-1) ); - field_sub ( L7, a->y, 
L8 );
- field_mulw_scc ( L6, a->y, -2-2*EDWARDS_D );
- field_mul ( L5, L7, L6 );
+ field_sqr ( a->x, r );
+ field_sqr ( L3, a->x );
+ field_copy ( a->y, L3 );
+ field_neg ( L4, a->y );
+ field_addw ( L4, 1 );
+ field_sqr ( L2, L4 );
+ field_mulw ( L7, L2, (EDWARDS_D-1)*(EDWARDS_D-1) );
+ field_mulw ( L8, L3, 4*(EDWARDS_D+1)*(EDWARDS_D+1) );
+ field_add ( a->y, L8, L7 );
+ field_mulw ( L8, L2, 4*(EDWARDS_D)*(EDWARDS_D-1) );
+ field_sub ( L7, a->y, L8 );
+ field_mulw_scc ( L6, a->y, -2-2*EDWARDS_D );
+ field_mul ( L5, L7, L6 );
/* FIXME Stability problem (API stability, not crash) / possible bug.
- * change to: p448_mul ( L5, L7, L4 ); ?
+ * change to: p448_mul ( L5, L7, L4 ); ?
* This isn't a deep change: it's for sign adjustment.
* Need to check which one leads to the correct sign, probably by writing
* the invert routine.
@@ -646,32 +646,32 @@ elligator_2s_inject (
* Could compute be, (be)^2, (be)^3, a b^3 e^3, a b^3 e^4. = 4M+S
* instead of 6M.
*/
- field_mul ( L8, L5, L4 );
- field_mul ( L4, L5, L6 );
- field_mul ( L5, L7, L8 );
- field_mul ( L8, L5, L4 );
- field_mul ( L4, L7, L8 );
- field_isr ( L6, L4 );
- field_mul ( L4, L5, L6 );
- field_sqr ( L5, L6 );
- field_mul ( L6, L8, L5 );
- field_mul ( L8, L7, L6 );
- field_mul ( L7, L8, L6 );
- field_copy ( L6, a->x );
- field_addw ( a->x, 1 );
- field_mul ( L5, a->x, L8 );
- field_addw ( L5, 1 );
- field_sub ( a->x, L6, L5 );
- field_mul ( L5, L4, a->x );
- field_mulw_scc_wr ( a->x, L5, -2-2*EDWARDS_D );
- field_add ( L4, L3, L3 );
- field_add ( L3, L4, L2 );
- field_subw( L3, 2 );
- field_mul ( L2, L3, L8 );
- field_mulw ( L3, L2, 2*(EDWARDS_D+1)*(EDWARDS_D-1) );
- field_add ( L2, L3, a->y );
- field_mul ( a->y, L7, L2 );
- field_addw ( a->y, -field_is_zero( L8 ) );
+ field_mul ( L8, L5, L4 );
+ field_mul ( L4, L5, L6 );
+ field_mul ( L5, L7, L8 );
+ field_mul ( L8, L5, L4 );
+ field_mul ( L4, L7, L8 );
+ field_isr ( L6, L4 );
+ field_mul ( L4, L5, L6 );
+ field_sqr ( L5, L6 );
+ field_mul ( L6, L8, L5 );
+ field_mul ( L8, L7, L6 );
+ field_mul ( L7, L8, L6 );
+ field_copy ( L6, a->x );
+ field_addw ( a->x, 1 );
+ field_mul ( L5, a->x, L8 );
+ field_addw ( L5, 1 );
+ field_sub ( a->x, L6, L5 );
+ field_mul ( L5, L4, a->x );
+ field_mulw_scc_wr ( a->x, L5, -2-2*EDWARDS_D );
+ field_add ( L4, L3, L3 );
+ field_add ( L3, L4, L2 );
+ field_subw( L3, 2 );
+ field_mul ( L2, L3, L8 );
+ field_mulw ( L3, L2, 2*(EDWARDS_D+1)*(EDWARDS_D-1) );
+ field_add ( L2, L3, a->y );
+ field_mul ( a->y, L7, L2 );
+ field_addw ( a->y, -field_is_zero( L8 ) );
}
mask_t
@@ -679,14 +679,14 @@ validate_affine (
const struct affine_t* a
) {
field_a_t L0, L1, L2, L3;
- field_sqr ( L0, a->y );
- field_sqr ( L1, a->x );
- field_add ( L3, L1, L0 );
- field_mulw_scc ( L2, L1, EDWARDS_D );
- field_mul ( L1, L0, L2 );
- field_addw ( L1, 1 );
- field_sub ( L0, L3, L1 );
- return field_is_zero( L0 );
+ field_sqr ( L0, a->y );
+ field_sqr ( L1, a->x );
+ field_add ( L3, L1, L0 );
+ field_mulw_scc ( L2, L1, EDWARDS_D );
+ field_mul ( L1, L0, L2 );
+ field_addw ( L1, 1 );
+ field_sub ( L0, L3, L1 );
+ return field_is_zero( L0 );
}
mask_t
@@ -699,30 +699,30 @@ validate_tw_extensible (
const struct tw_extensible_t* ext
) {
mask_t L4, L5;
field_a_t L0, L1, L2, L3;
/*
* Check invariant:
* 0 = -x*y + z*t*u
*/
- field_mul ( L1, ext->t, ext->u );
- field_mul ( L2, ext->z, L1 );
- field_mul ( L0, ext->x, ext->y );
- field_neg ( L1, L0 );
- field_add ( L0, L1, L2 );
- L5 = field_is_zero( L0 );
+ field_mul ( L1, ext->t, ext->u );
+ field_mul ( L2, ext->z, L1 );
+ field_mul ( L0, ext->x, ext->y );
+ field_neg ( L1, L0 );
+ field_add ( L0, L1, L2 );
+ L5 =
field_is_zero( L0 ); /* * Check invariant: * 0 = d*t^2*u^2 + x^2 - y^2 + z^2 - t^2*u^2 */ - field_sqr ( L2, ext->y ); - field_neg ( L1, L2 ); - field_sqr ( L0, ext->x ); - field_add ( L2, L0, L1 ); - field_sqr ( L3, ext->u ); - field_sqr ( L0, ext->t ); - field_mul ( L1, L0, L3 ); - field_mulw_scc ( L3, L1, EDWARDS_D ); - field_add ( L0, L3, L2 ); - field_neg ( L3, L1 ); - field_add ( L2, L3, L0 ); - field_sqr ( L1, ext->z ); - field_add ( L0, L1, L2 ); - L4 = field_is_zero( L0 ); + field_sqr ( L2, ext->y ); + field_neg ( L1, L2 ); + field_sqr ( L0, ext->x ); + field_add ( L2, L0, L1 ); + field_sqr ( L3, ext->u ); + field_sqr ( L0, ext->t ); + field_mul ( L1, L0, L3 ); + field_mulw_scc ( L3, L1, EDWARDS_D ); + field_add ( L0, L3, L2 ); + field_neg ( L3, L1 ); + field_add ( L2, L3, L0 ); + field_sqr ( L1, ext->z ); + field_add ( L0, L1, L2 ); + L4 = field_is_zero( L0 ); return L5 & L4 &~ field_is_zero(ext->z); } @@ -736,28 +736,28 @@ validate_extensible ( * Check invariant: * 0 = d*t^2*u^2 - x^2 - y^2 + z^2 */ - field_sqr ( L2, ext->y ); - field_neg ( L1, L2 ); - field_sqr ( L0, ext->z ); - field_add ( L2, L0, L1 ); - field_sqr ( L3, ext->u ); - field_sqr ( L0, ext->t ); - field_mul ( L1, L0, L3 ); - field_mulw_scc ( L0, L1, EDWARDS_D ); - field_add ( L1, L0, L2 ); - field_sqr ( L0, ext->x ); - field_neg ( L2, L0 ); - field_add ( L0, L2, L1 ); - L5 = field_is_zero( L0 ); + field_sqr ( L2, ext->y ); + field_neg ( L1, L2 ); + field_sqr ( L0, ext->z ); + field_add ( L2, L0, L1 ); + field_sqr ( L3, ext->u ); + field_sqr ( L0, ext->t ); + field_mul ( L1, L0, L3 ); + field_mulw_scc ( L0, L1, EDWARDS_D ); + field_add ( L1, L0, L2 ); + field_sqr ( L0, ext->x ); + field_neg ( L2, L0 ); + field_add ( L0, L2, L1 ); + L5 = field_is_zero( L0 ); /* * Check invariant: * 0 = -x*y + z*t*u */ - field_mul ( L1, ext->t, ext->u ); - field_mul ( L2, ext->z, L1 ); - field_mul ( L0, ext->x, ext->y ); - field_neg ( L1, L0 ); - field_add ( L0, L1, L2 ); - L4 = field_is_zero( L0 ); + field_mul ( L1, ext->t, ext->u ); + field_mul ( L2, ext->z, L1 ); + field_mul ( L0, ext->x, ext->y ); + field_neg ( L1, L0 ); + field_add ( L0, L1, L2 ); + L4 = field_is_zero( L0 ); return L5 & L4 &~ field_is_zero(ext->z); } From 1f1836de1278c922f536ae3c59ba1883e5068f4c Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Thu, 22 Jan 2015 17:21:20 -0800 Subject: [PATCH 05/15] gmp-style foo_t[1] for points too --- src/crandom.c | 8 +- src/ec_point.c | 124 ++++++++--------- src/goldilocks.c | 128 ++++++++--------- src/include/crandom.h | 9 +- src/include/ec_point.h | 198 +++++++++++++------------- src/include/magic.h | 2 +- src/include/scalarmul.h | 28 ++-- src/include/sha512.h | 10 +- src/p448/magic.c | 4 +- src/p480/magic.c | 4 +- src/p521/magic.c | 4 +- src/scalarmul.c | 298 ++++++++++++++++++++-------------------- src/sha512.c | 8 +- test/bench.c | 10 +- test/test_pointops.c | 4 +- test/test_scalarmul.c | 4 +- test/test_sha512.c | 12 +- 17 files changed, 428 insertions(+), 427 deletions(-) diff --git a/src/crandom.c b/src/crandom.c index da0c3c9..4f36644 100644 --- a/src/crandom.c +++ b/src/crandom.c @@ -301,7 +301,7 @@ crandom_chacha_expand(u_int64_t iv, int crandom_init_from_file( - struct crandom_state_t *state, + crandom_state_a_t state, const char *filename, int reseed_interval, int reseeds_mandatory @@ -338,7 +338,7 @@ crandom_init_from_file( void crandom_init_from_buffer( - struct crandom_state_t *state, + crandom_state_a_t state, const char initial_seed[32] ) { memcpy(state->seed, initial_seed, 32); @@ -350,7 +350,7 @@ 
crandom_init_from_buffer( int crandom_generate( - struct crandom_state_t *state, + crandom_state_a_t state, unsigned char *output, unsigned long long length ) { @@ -475,7 +475,7 @@ crandom_generate( void crandom_destroy( - struct crandom_state_t *state + crandom_state_a_t state ) { if (state->magic == CRANDOM_MAGIC && state->randomfd) { (void) close(state->randomfd); diff --git a/src/ec_point.c b/src/ec_point.c index e78852b..2582372 100644 --- a/src/ec_point.c +++ b/src/ec_point.c @@ -14,8 +14,8 @@ void add_tw_niels_to_tw_extensible ( - struct tw_extensible_t* d, - const struct tw_niels_t* e + tw_extensible_a_t d, + const tw_niels_a_t e ) { ANALYZE_THIS_ROUTINE_CAREFULLY; field_a_t L0, L1; @@ -36,8 +36,8 @@ add_tw_niels_to_tw_extensible ( void sub_tw_niels_from_tw_extensible ( - struct tw_extensible_t* d, - const struct tw_niels_t* e + tw_extensible_a_t d, + const tw_niels_a_t e ) { ANALYZE_THIS_ROUTINE_CAREFULLY; field_a_t L0, L1; @@ -58,29 +58,29 @@ sub_tw_niels_from_tw_extensible ( void add_tw_pniels_to_tw_extensible ( - struct tw_extensible_t* e, - const struct tw_pniels_t* a + tw_extensible_a_t e, + const tw_pniels_a_t a ) { field_a_t L0; field_mul ( L0, e->z, a->z ); field_copy ( e->z, L0 ); - add_tw_niels_to_tw_extensible( e, &a->n ); + add_tw_niels_to_tw_extensible( e, a->n ); } void sub_tw_pniels_from_tw_extensible ( - struct tw_extensible_t* e, - const struct tw_pniels_t* a + tw_extensible_a_t e, + const tw_pniels_a_t a ) { field_a_t L0; field_mul ( L0, e->z, a->z ); field_copy ( e->z, L0 ); - sub_tw_niels_from_tw_extensible( e, &a->n ); + sub_tw_niels_from_tw_extensible( e, a->n ); } void double_tw_extensible ( - struct tw_extensible_t* a + tw_extensible_a_t a ) { ANALYZE_THIS_ROUTINE_CAREFULLY; field_a_t L0, L1, L2; @@ -105,7 +105,7 @@ double_tw_extensible ( void double_extensible ( - struct extensible_t* a + extensible_a_t a ) { ANALYZE_THIS_ROUTINE_CAREFULLY; field_a_t L0, L1, L2; @@ -130,8 +130,8 @@ double_extensible ( void twist_and_double ( - struct tw_extensible_t* b, - const struct extensible_t* a + tw_extensible_a_t b, + const extensible_a_t a ) { field_a_t L0; field_sqr ( b->x, a->x ); @@ -151,8 +151,8 @@ twist_and_double ( void untwist_and_double ( - struct extensible_t* b, - const struct tw_extensible_t* a + extensible_a_t b, + const tw_extensible_a_t a ) { field_a_t L0; field_sqr ( b->x, a->x ); @@ -172,20 +172,20 @@ untwist_and_double ( void convert_tw_affine_to_tw_pniels ( - struct tw_pniels_t* b, - const struct tw_affine_t* a + tw_pniels_a_t b, + const tw_affine_a_t a ) { - field_sub ( b->n.a, a->y, a->x ); - field_add ( b->n.b, a->x, a->y ); + field_sub ( b->n->a, a->y, a->x ); + field_add ( b->n->b, a->x, a->y ); field_mul ( b->z, a->y, a->x ); - field_mulw_scc_wr ( b->n.c, b->z, 2*EDWARDS_D-2 ); + field_mulw_scc_wr ( b->n->c, b->z, 2*EDWARDS_D-2 ); field_set_ui( b->z, 2 ); } void convert_tw_affine_to_tw_extensible ( - struct tw_extensible_t* b, - const struct tw_affine_t* a + tw_extensible_a_t b, + const tw_affine_a_t a ) { field_copy ( b->x, a->x ); field_copy ( b->y, a->y ); @@ -196,8 +196,8 @@ convert_tw_affine_to_tw_extensible ( void convert_affine_to_extensible ( - struct extensible_t* b, - const struct affine_t* a + extensible_a_t b, + const affine_a_t a ) { field_copy ( b->x, a->x ); field_copy ( b->y, a->y ); @@ -208,23 +208,23 @@ convert_affine_to_extensible ( void convert_tw_extensible_to_tw_pniels ( - struct tw_pniels_t* b, - const struct tw_extensible_t* a + tw_pniels_a_t b, + const tw_extensible_a_t a ) { - field_sub ( b->n.a, a->y, a->x ); - 
field_add ( b->n.b, a->x, a->y ); + field_sub ( b->n->a, a->y, a->x ); + field_add ( b->n->b, a->x, a->y ); field_mul ( b->z, a->u, a->t ); - field_mulw_scc_wr ( b->n.c, b->z, 2*EDWARDS_D-2 ); + field_mulw_scc_wr ( b->n->c, b->z, 2*EDWARDS_D-2 ); field_add ( b->z, a->z, a->z ); } void convert_tw_pniels_to_tw_extensible ( - struct tw_extensible_t* e, - const struct tw_pniels_t* d + tw_extensible_a_t e, + const tw_pniels_a_t d ) { - field_add ( e->u, d->n.b, d->n.a ); - field_sub ( e->t, d->n.b, d->n.a ); + field_add ( e->u, d->n->b, d->n->a ); + field_sub ( e->t, d->n->b, d->n->a ); field_mul ( e->x, d->z, e->t ); field_mul ( e->y, d->z, e->u ); field_sqr ( e->z, d->z ); @@ -232,8 +232,8 @@ convert_tw_pniels_to_tw_extensible ( void convert_tw_niels_to_tw_extensible ( - struct tw_extensible_t* e, - const struct tw_niels_t* d + tw_extensible_a_t e, + const tw_niels_a_t d ) { field_add ( e->y, d->b, d->a ); field_sub ( e->x, d->b, d->a ); @@ -244,7 +244,7 @@ convert_tw_niels_to_tw_extensible ( void montgomery_step ( - struct montgomery_t* a + montgomery_a_t a ) { ANALYZE_THIS_ROUTINE_CAREFULLY; field_a_t L0, L1; @@ -272,7 +272,7 @@ montgomery_step ( void deserialize_montgomery ( - struct montgomery_t* a, + montgomery_a_t a, const field_a_t sbz ) { field_sqr ( a->z0, sbz ); @@ -285,7 +285,7 @@ deserialize_montgomery ( mask_t serialize_montgomery ( field_a_t b, - const struct montgomery_t* a, + const montgomery_a_t a, const field_a_t sbz ) { mask_t L4, L5, L6; @@ -332,7 +332,7 @@ serialize_montgomery ( void serialize_extensible ( field_a_t b, - const struct extensible_t* a + const extensible_a_t a ) { field_a_t L0, L1, L2; field_sub ( L0, a->y, a->z ); @@ -351,7 +351,7 @@ serialize_extensible ( void untwist_and_double_and_serialize ( field_a_t b, - const struct tw_extensible_t* a + const tw_extensible_a_t a ) { field_a_t L0, L1, L2, L3; field_mul ( L3, a->y, a->x ); @@ -375,8 +375,8 @@ untwist_and_double_and_serialize ( void twist_even ( - struct tw_extensible_t* b, - const struct extensible_t* a + tw_extensible_a_t b, + const extensible_a_t a ) { field_sqr ( b->y, a->z ); field_sqr ( b->z, a->x ); @@ -401,8 +401,8 @@ twist_even ( void test_only_twist ( - struct tw_extensible_t* b, - const struct extensible_t* a + tw_extensible_a_t b, + const extensible_a_t a ) { field_a_t L0, L1; field_sqr ( b->u, a->z ); @@ -436,7 +436,7 @@ test_only_twist ( mask_t is_even_pt ( - const struct extensible_t* a + const extensible_a_t a ) { field_a_t L0, L1, L2; field_sqr ( L2, a->z ); @@ -447,7 +447,7 @@ is_even_pt ( mask_t is_even_tw ( - const struct tw_extensible_t* a + const tw_extensible_a_t a ) { field_a_t L0, L1, L2; field_sqr ( L2, a->z ); @@ -458,7 +458,7 @@ is_even_tw ( mask_t deserialize_affine ( - struct affine_t* a, + affine_a_t a, const field_a_t sz ) { field_a_t L0, L1, L2, L3; @@ -492,7 +492,7 @@ deserialize_affine ( mask_t deserialize_and_twist_approx ( - struct tw_extensible_t* a, + tw_extensible_a_t a, const field_a_t sz ) { field_a_t L0, L1; @@ -537,7 +537,7 @@ deserialize_and_twist_approx ( void set_identity_extensible ( - struct extensible_t* a + extensible_a_t a ) { field_set_ui( a->x, 0 ); field_set_ui( a->y, 1 ); @@ -548,7 +548,7 @@ set_identity_extensible ( void set_identity_tw_extensible ( - struct tw_extensible_t* a + tw_extensible_a_t a ) { field_set_ui( a->x, 0 ); field_set_ui( a->y, 1 ); @@ -559,7 +559,7 @@ set_identity_tw_extensible ( void set_identity_affine ( - struct affine_t* a + affine_a_t a ) { field_set_ui( a->x, 0 ); field_set_ui( a->y, 1 ); @@ -567,8 +567,8 @@ 
set_identity_affine ( mask_t eq_affine ( - const struct affine_t* a, - const struct affine_t* b + const affine_a_t a, + const affine_a_t b ) { mask_t L1, L2; field_a_t L0; @@ -581,8 +581,8 @@ eq_affine ( mask_t eq_extensible ( - const struct extensible_t* a, - const struct extensible_t* b + const extensible_a_t a, + const extensible_a_t b ) { mask_t L3, L4; field_a_t L0, L1, L2; @@ -599,8 +599,8 @@ eq_extensible ( mask_t eq_tw_extensible ( - const struct tw_extensible_t* a, - const struct tw_extensible_t* b + const tw_extensible_a_t a, + const tw_extensible_a_t b ) { mask_t L3, L4; field_a_t L0, L1, L2; @@ -617,7 +617,7 @@ eq_tw_extensible ( void elligator_2s_inject ( - struct affine_t* a, + affine_a_t a, const field_a_t r ) { field_a_t L2, L3, L4, L5, L6, L7, L8; @@ -676,7 +676,7 @@ elligator_2s_inject ( mask_t validate_affine ( - const struct affine_t* a + const affine_a_t a ) { field_a_t L0, L1, L2, L3; field_sqr ( L0, a->y ); @@ -691,7 +691,7 @@ validate_affine ( mask_t validate_tw_extensible ( - const struct tw_extensible_t* ext + const tw_extensible_a_t ext ) { mask_t L4, L5; field_a_t L0, L1, L2, L3; @@ -728,7 +728,7 @@ validate_tw_extensible ( mask_t validate_extensible ( - const struct extensible_t* ext + const extensible_a_t ext ) { mask_t L4, L5; field_a_t L0, L1, L2, L3; diff --git a/src/goldilocks.c b/src/goldilocks.c index 7cba9c4..1c647f4 100644 --- a/src/goldilocks.c +++ b/src/goldilocks.c @@ -41,7 +41,7 @@ #else #define FIELD_HASH_BYTES (SHA512_OUTPUT_BYTES * ((FIELD_BYTES-1)/SHA512_OUTPUT_BYTES + 1)) static inline void field_hash_final ( - struct sha512_ctx_t *ctx, + sha512_ctx_a_t *ctx, unsigned char out[FIELD_HASH_BYTES] ) { /* SHA PRNG I guess? I really should have used SHAKE */ @@ -67,19 +67,19 @@ struct goldilocks_precomputed_public_key_t { /* FUTURE: auto. 
*/ static struct { - const char * volatile state; + const char * volatile status; #if GOLDILOCKS_USE_PTHREAD pthread_mutex_t mutex; #endif - struct tw_niels_t combs[COMB_N << (COMB_T-1)]; + tw_niels_a_t combs[COMB_N << (COMB_T-1)]; struct fixed_base_table_t fixed_base; - struct tw_niels_t wnafs[1<= sizeof(sk)); - struct sha512_ctx_t ctx; - struct tw_extensible_t exta; + sha512_ctx_a_t ctx; + tw_extensible_a_t exta; field_a_t pk; - sha512_init(&ctx); - sha512_update(&ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES); - sha512_update(&ctx, proto, GOLDI_SYMKEY_BYTES); - field_hash_final(&ctx, (unsigned char *)skb); + sha512_init(ctx); + sha512_update(ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES); + sha512_update(ctx, proto, GOLDI_SYMKEY_BYTES); + field_hash_final(ctx, (unsigned char *)skb); barrett_deserialize_and_reduce(sk, skb, sizeof(skb), &curve_prime_order); barrett_serialize(privkey->opaque, sk, GOLDI_FIELD_BYTES); - scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); - untwist_and_double_and_serialize(pk, &exta); + scalarmul_fixed_base(exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); + untwist_and_double_and_serialize(pk, exta); field_serialize(&privkey->opaque[GOLDI_FIELD_BYTES], pk); @@ -204,7 +204,7 @@ goldilocks_keygen ( if (ml_ret) return ml_ret; #endif - int ret = crandom_generate(&goldilocks_global.rand, proto, sizeof(proto)); + int ret = crandom_generate(goldilocks_global.rand, proto, sizeof(proto)); #if GOLDILOCKS_USE_PTHREAD ml_ret = pthread_mutex_unlock(&goldilocks_global.mutex); @@ -267,9 +267,9 @@ goldilocks_shared_secret_core ( #if GOLDI_IMPLEMENT_PRECOMPUTED_KEYS if (pre) { - struct tw_extensible_t tw; - succ &= scalarmul_fixed_base(&tw, sk, GOLDI_SCALAR_BITS, &pre->table); - untwist_and_double_and_serialize(pk, &tw); + tw_extensible_a_t tw; + succ &= scalarmul_fixed_base(tw, sk, GOLDI_SCALAR_BITS, &pre->table); + untwist_and_double_and_serialize(pk, tw); } else { succ &= montgomery_ladder(pk,pk,sk,GOLDI_SCALAR_BITS,1); } @@ -282,8 +282,8 @@ goldilocks_shared_secret_core ( field_serialize(gxy,pk); /* obliterate records of our failure by adjusting with obliteration key */ - struct sha512_ctx_t ctx; - sha512_init(&ctx); + sha512_ctx_a_t ctx; + sha512_init(ctx); #ifdef EXPERIMENT_ECDH_OBLITERATE_CT uint8_t oblit[GOLDI_DIVERSIFY_BYTES + GOLDI_SYMKEY_BYTES]; @@ -294,21 +294,21 @@ goldilocks_shared_secret_core ( for (i=0; iopaque[2*GOLDI_FIELD_BYTES+i] & ~(succ&msucc); } - sha512_update(&ctx, oblit, sizeof(oblit)); + sha512_update(ctx, oblit, sizeof(oblit)); #endif #ifdef EXPERIMENT_ECDH_STIR_IN_PUBKEYS /* stir in the sum and product of the pubkeys. */ uint8_t a_pk[GOLDI_FIELD_BYTES]; field_serialize(a_pk, sum); - sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES); + sha512_update(ctx, a_pk, GOLDI_FIELD_BYTES); field_serialize(a_pk, prod); - sha512_update(&ctx, a_pk, GOLDI_FIELD_BYTES); + sha512_update(ctx, a_pk, GOLDI_FIELD_BYTES); #endif /* stir in the shared key and finish */ - sha512_update(&ctx, gxy, GOLDI_FIELD_BYTES); - sha512_final(&ctx, shared); + sha512_update(ctx, gxy, GOLDI_FIELD_BYTES); + sha512_final(ctx, shared); return (GOLDI_ECORRUPT & ~msucc) | (GOLDI_EINVAL & msucc &~ succ) @@ -340,12 +340,12 @@ goldilocks_derive_challenge( ) { /* challenge = H(pk, [nonceG], message). 
*/ unsigned char sha_out[FIELD_HASH_BYTES]; - struct sha512_ctx_t ctx; - sha512_init(&ctx); - sha512_update(&ctx, pubkey, GOLDI_FIELD_BYTES); - sha512_update(&ctx, gnonce, GOLDI_FIELD_BYTES); - sha512_update(&ctx, message, message_len); - field_hash_final(&ctx, sha_out); + sha512_ctx_a_t ctx; + sha512_init(ctx); + sha512_update(ctx, pubkey, GOLDI_FIELD_BYTES); + sha512_update(ctx, gnonce, GOLDI_FIELD_BYTES); + sha512_update(ctx, message, message_len); + field_hash_final(ctx, sha_out); barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &curve_prime_order); } @@ -371,22 +371,22 @@ goldilocks_sign ( /* Derive a nonce. TODO: use HMAC. FUTURE: factor. */ unsigned char sha_out[FIELD_HASH_BYTES]; word_t tk[GOLDI_FIELD_WORDS]; - struct sha512_ctx_t ctx; - sha512_init(&ctx); - sha512_update(&ctx, (const unsigned char *)"signonce", 8); - sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); - sha512_update(&ctx, message, message_len); - sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); - field_hash_final(&ctx, sha_out); + sha512_ctx_a_t ctx; + sha512_init(ctx); + sha512_update(ctx, (const unsigned char *)"signonce", 8); + sha512_update(ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); + sha512_update(ctx, message, message_len); + sha512_update(ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES); + field_hash_final(ctx, sha_out); barrett_deserialize_and_reduce(tk, sha_out, sizeof(sha_out), &curve_prime_order); /* 4[nonce]G */ uint8_t signature_tmp[GOLDI_FIELD_BYTES]; - struct tw_extensible_t exta; + tw_extensible_a_t exta; field_a_t gsk; - scalarmul_fixed_base(&exta, tk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); - double_tw_extensible(&exta); - untwist_and_double_and_serialize(gsk, &exta); + scalarmul_fixed_base(exta, tk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base); + double_tw_extensible(exta); + untwist_and_double_and_serialize(gsk, exta); field_serialize(signature_tmp, gsk); word_t challenge[GOLDI_FIELD_WORDS]; @@ -450,21 +450,21 @@ goldilocks_verify ( goldilocks_derive_challenge(challenge, pubkey->opaque, signature, message, message_len); field_a_t eph; - struct tw_extensible_t pk_text; + tw_extensible_a_t pk_text; /* deserialize [nonce]G */ succ = field_deserialize(eph, signature); if (!succ) return GOLDI_EINVAL; - succ = deserialize_and_twist_approx(&pk_text, pk); + succ = deserialize_and_twist_approx(pk_text, pk); if (!succ) return GOLDI_EINVAL; - linear_combo_var_fixed_vt( &pk_text, + linear_combo_var_fixed_vt( pk_text, challenge, GOLDI_SCALAR_BITS, s, GOLDI_SCALAR_BITS, goldilocks_global.wnafs, WNAF_PRECMP_BITS ); - untwist_and_double_and_serialize( pk, &pk_text ); + untwist_and_double_and_serialize( pk, pk_text ); succ = field_eq(eph, pk); return succ ? 
0 : GOLDI_EINVAL; @@ -483,7 +483,7 @@ goldilocks_precompute_public_key ( if (!precom) return NULL; - struct tw_extensible_t pk_text; + tw_extensible_a_t pk_text; field_a_t pk; mask_t succ = field_deserialize(pk, pub->opaque); @@ -492,13 +492,13 @@ goldilocks_precompute_public_key ( return NULL; } - succ = deserialize_and_twist_approx(&pk_text, pk); + succ = deserialize_and_twist_approx(pk_text, pk); if (!succ) { free(precom); return NULL; } - succ = precompute_fixed_base(&precom->table, &pk_text, + succ = precompute_fixed_base(&precom->table, pk_text, COMB_N, COMB_T, COMB_S, NULL); if (!succ) { free(precom); @@ -539,20 +539,20 @@ goldilocks_verify_precomputed ( goldilocks_derive_challenge(challenge, pubkey->pub.opaque, signature, message, message_len); field_a_t eph, pk; - struct tw_extensible_t pk_text; + tw_extensible_a_t pk_text; /* deserialize [nonce]G */ succ = field_deserialize(eph, signature); if (!succ) return GOLDI_EINVAL; succ = linear_combo_combs_vt ( - &pk_text, + pk_text, challenge, GOLDI_SCALAR_BITS, &pubkey->table, s, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base ); if (!succ) return GOLDI_EINVAL; - untwist_and_double_and_serialize( pk, &pk_text ); + untwist_and_double_and_serialize( pk, pk_text ); succ = field_eq(eph, pk); return succ ? 0 : GOLDI_EINVAL; diff --git a/src/include/crandom.h b/src/include/crandom.h index 90cc374..c9f4c26 100644 --- a/src/include/crandom.h +++ b/src/include/crandom.h @@ -39,6 +39,7 @@ struct crandom_state_t { int reseeds_mandatory; int randomfd; } __attribute__((aligned(16))) ; +typedef struct crandom_state_t crandom_state_a_t[1]; #ifdef __cplusplus extern "C" { @@ -64,7 +65,7 @@ extern "C" { */ int crandom_init_from_file ( - struct crandom_state_t *state, + crandom_state_a_t state, const char *filename, int reseed_interval, int reseeds_mandatory @@ -87,7 +88,7 @@ crandom_init_from_file ( */ void crandom_init_from_buffer ( - struct crandom_state_t *state, + crandom_state_a_t state, const char initial_seed[32] ); @@ -118,7 +119,7 @@ crandom_init_from_buffer ( */ int crandom_generate ( - struct crandom_state_t *state, + crandom_state_a_t state, unsigned char *output, unsigned long long length ); @@ -131,7 +132,7 @@ crandom_generate ( */ void crandom_destroy ( - struct crandom_state_t *state + crandom_state_a_t state ); #ifdef __cplusplus diff --git a/src/include/ec_point.h b/src/include/ec_point.h index 9d0f4f3..54ab9cb 100644 --- a/src/include/ec_point.h +++ b/src/include/ec_point.h @@ -20,23 +20,23 @@ extern "C" { /** * Affine point on an Edwards curve. */ -struct affine_t { +typedef struct affine_t { field_a_t x, y; -}; +} affine_a_t[1]; /** * Affine point on a twisted Edwards curve. */ -struct tw_affine_t { +typedef struct tw_affine_t { field_a_t x, y; -}; +} tw_affine_a_t[1]; /** * Montgomery buffer. */ -struct montgomery_t { +typedef struct montgomery_t { field_a_t z0, xd, zd, xa, za; -}; +} montgomery_a_t[1]; /** * Extensible coordinates for Edwards curves, suitable for @@ -56,36 +56,36 @@ struct montgomery_t { * lookahead trick. It might be worth considering that trick * instead. */ -struct extensible_t { +typedef struct extensible_t { field_a_t x, y, z, t, u; -}; +} extensible_a_t[1]; /** * Extensible coordinates for twisted Edwards curves, * suitable for accumulators. */ -struct tw_extensible_t { +typedef struct tw_extensible_t { field_a_t x, y, z, t, u; -}; +} tw_extensible_a_t[1]; /** * Niels coordinates for twisted Edwards curves. * * Good for mixed readdition; suitable for fixed tables. 
*/ -struct tw_niels_t { +typedef struct tw_niels_t { field_a_t a, b, c; -}; +} tw_niels_a_t[1]; /** * Projective niels coordinates for twisted Edwards curves. * * Good for readdition; suitable for temporary tables. */ -struct tw_pniels_t { - struct tw_niels_t n; +typedef struct tw_pniels_t { + tw_niels_a_t n; field_a_t z; -}; +} tw_pniels_a_t[1]; /** @@ -93,8 +93,8 @@ struct tw_pniels_t { */ static __inline__ void copy_affine ( - struct affine_t* a, - const struct affine_t* ds + affine_a_t a, + const affine_a_t ds ) __attribute__((unused,always_inline)); /** @@ -102,8 +102,8 @@ copy_affine ( */ static __inline__ void copy_tw_affine ( - struct tw_affine_t* a, - const struct tw_affine_t* ds + tw_affine_a_t a, + const tw_affine_a_t ds ) __attribute__((unused,always_inline)); /** @@ -111,8 +111,8 @@ copy_tw_affine ( */ static __inline__ void copy_montgomery ( - struct montgomery_t* a, - const struct montgomery_t* ds + montgomery_a_t a, + const montgomery_a_t ds ) __attribute__((unused,always_inline)); /** @@ -120,8 +120,8 @@ copy_montgomery ( */ static __inline__ void copy_extensible ( - struct extensible_t* a, - const struct extensible_t* ds + extensible_a_t a, + const extensible_a_t ds ) __attribute__((unused,always_inline)); /** @@ -129,8 +129,8 @@ copy_extensible ( */ static __inline__ void copy_tw_extensible ( - struct tw_extensible_t* a, - const struct tw_extensible_t* ds + tw_extensible_a_t a, + const tw_extensible_a_t ds ) __attribute__((unused,always_inline)); /** @@ -138,8 +138,8 @@ copy_tw_extensible ( */ static __inline__ void copy_tw_niels ( - struct tw_niels_t* a, - const struct tw_niels_t* ds + tw_niels_a_t a, + const tw_niels_a_t ds ) __attribute__((unused,always_inline)); /** @@ -147,8 +147,8 @@ copy_tw_niels ( */ static __inline__ void copy_tw_pniels ( - struct tw_pniels_t* a, - const struct tw_pniels_t* ds + tw_pniels_a_t a, + const tw_pniels_a_t ds ) __attribute__((unused,always_inline)); /** @@ -157,8 +157,8 @@ copy_tw_pniels ( */ void add_tw_niels_to_tw_extensible ( - struct tw_extensible_t* d, - const struct tw_niels_t* e + tw_extensible_a_t d, + const tw_niels_a_t e ); /** @@ -167,8 +167,8 @@ add_tw_niels_to_tw_extensible ( */ void sub_tw_niels_from_tw_extensible ( - struct tw_extensible_t* d, - const struct tw_niels_t* e + tw_extensible_a_t d, + const tw_niels_a_t e ); /** @@ -177,8 +177,8 @@ sub_tw_niels_from_tw_extensible ( */ void add_tw_pniels_to_tw_extensible ( - struct tw_extensible_t* e, - const struct tw_pniels_t* a + tw_extensible_a_t e, + const tw_pniels_a_t a ); /** @@ -187,8 +187,8 @@ add_tw_pniels_to_tw_extensible ( */ void sub_tw_pniels_from_tw_extensible ( - struct tw_extensible_t* e, - const struct tw_pniels_t* a + tw_extensible_a_t e, + const tw_pniels_a_t a ); /** @@ -196,7 +196,7 @@ sub_tw_pniels_from_tw_extensible ( */ void double_tw_extensible ( - struct tw_extensible_t* a + tw_extensible_a_t a ); /** @@ -204,7 +204,7 @@ double_tw_extensible ( */ void double_extensible ( - struct extensible_t* a + extensible_a_t a ); /** @@ -214,8 +214,8 @@ double_extensible ( */ void twist_and_double ( - struct tw_extensible_t* b, - const struct extensible_t* a + tw_extensible_a_t b, + const extensible_a_t a ); /** @@ -225,61 +225,61 @@ twist_and_double ( */ void untwist_and_double ( - struct extensible_t* b, - const struct tw_extensible_t* a + extensible_a_t b, + const tw_extensible_a_t a ); void convert_tw_affine_to_tw_pniels ( - struct tw_pniels_t* b, - const struct tw_affine_t* a + tw_pniels_a_t b, + const tw_affine_a_t a ); void 
convert_tw_affine_to_tw_extensible ( - struct tw_extensible_t* b, - const struct tw_affine_t* a + tw_extensible_a_t b, + const tw_affine_a_t a ); void convert_affine_to_extensible ( - struct extensible_t* b, - const struct affine_t* a + extensible_a_t b, + const affine_a_t a ); void convert_tw_extensible_to_tw_pniels ( - struct tw_pniels_t* b, - const struct tw_extensible_t* a + tw_pniels_a_t b, + const tw_extensible_a_t a ); void convert_tw_pniels_to_tw_extensible ( - struct tw_extensible_t* e, - const struct tw_pniels_t* d + tw_extensible_a_t e, + const tw_pniels_a_t d ); void convert_tw_niels_to_tw_extensible ( - struct tw_extensible_t* e, - const struct tw_niels_t* d + tw_extensible_a_t e, + const tw_niels_a_t d ); void montgomery_step ( - struct montgomery_t* a + montgomery_a_t a ); void deserialize_montgomery ( - struct montgomery_t* a, + montgomery_a_t a, const field_a_t sbz ); mask_t serialize_montgomery ( field_a_t b, - const struct montgomery_t* a, + const montgomery_a_t a, const field_a_t sbz ); @@ -297,7 +297,7 @@ serialize_montgomery ( void serialize_extensible ( field_a_t b, - const struct extensible_t* a + const extensible_a_t a ); /** @@ -306,7 +306,7 @@ serialize_extensible ( void untwist_and_double_and_serialize ( field_a_t b, - const struct tw_extensible_t* a + const tw_extensible_a_t a ); /** @@ -320,8 +320,8 @@ untwist_and_double_and_serialize ( */ void twist_even ( - struct tw_extensible_t* b, - const struct extensible_t* a + tw_extensible_a_t b, + const extensible_a_t a ); /** @@ -339,8 +339,8 @@ twist_even ( */ void test_only_twist ( - struct tw_extensible_t* b, - const struct extensible_t* a + tw_extensible_a_t b, + const extensible_a_t a ); mask_t @@ -350,12 +350,12 @@ field_is_square ( mask_t is_even_pt ( - const struct extensible_t* a + const extensible_a_t a ); mask_t is_even_tw ( - const struct tw_extensible_t* a + const tw_extensible_a_t a ); /** @@ -363,7 +363,7 @@ is_even_tw ( */ mask_t deserialize_affine ( - struct affine_t* a, + affine_a_t a, const field_a_t sz ); @@ -376,52 +376,52 @@ deserialize_affine ( */ mask_t deserialize_and_twist_approx ( - struct tw_extensible_t* a, + tw_extensible_a_t a, const field_a_t sz ); void set_identity_extensible ( - struct extensible_t* a + extensible_a_t a ); void set_identity_tw_extensible ( - struct tw_extensible_t* a + tw_extensible_a_t a ); void set_identity_affine ( - struct affine_t* a + affine_a_t a ); mask_t eq_affine ( - const struct affine_t* a, - const struct affine_t* b + const affine_a_t a, + const affine_a_t b ); mask_t eq_extensible ( - const struct extensible_t* a, - const struct extensible_t* b + const extensible_a_t a, + const extensible_a_t b ); mask_t eq_tw_extensible ( - const struct tw_extensible_t* a, - const struct tw_extensible_t* b + const tw_extensible_a_t a, + const tw_extensible_a_t b ); void elligator_2s_inject ( - struct affine_t* a, + affine_a_t a, const field_a_t r ); mask_t validate_affine ( - const struct affine_t* a + const affine_a_t a ); /** @@ -431,7 +431,7 @@ validate_affine ( */ mask_t validate_tw_extensible ( - const struct tw_extensible_t* ext + const tw_extensible_a_t ext ); /** @@ -441,7 +441,7 @@ validate_tw_extensible ( */ mask_t validate_extensible ( - const struct extensible_t* ext + const extensible_a_t ext ); /** @@ -450,7 +450,7 @@ validate_extensible ( static __inline__ void __attribute__((unused)) cond_negate_tw_niels ( - struct tw_niels_t *n, + tw_niels_a_t n, mask_t doNegate ) { constant_time_cond_swap(n->a, n->b, sizeof(n->a), doNegate); @@ -463,16 +463,16 @@ 
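/* Why cond_negate_tw_niels is mostly a swap: a Niels point stores
 * a = y - x and b = y + x, so negating x exchanges a and b (the c
 * component's sign is handled separately). The swap itself is
 * branchless; a sketch of the masked technique, assuming mask_t is
 * 0 or all-ones as elsewhere in this library, with toy word arrays
 * standing in for field elements:
 */
#include <stdint.h>
#include <stddef.h>

typedef uint64_t word_t;
typedef word_t mask_t;  /* 0 when false, ~(word_t)0 when true */

static void toy_cond_swap(word_t *a, word_t *b, size_t nwords, mask_t doswap) {
    size_t i;
    for (i = 0; i < nwords; i++) {
        word_t delta = (a[i] ^ b[i]) & doswap;  /* zero when doswap == 0 */
        a[i] ^= delta;
        b[i] ^= delta;
    }
}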
cond_negate_tw_niels ( static __inline__ void __attribute__((unused)) cond_negate_tw_pniels ( - struct tw_pniels_t *n, + tw_pniels_a_t n, mask_t doNegate ) { - cond_negate_tw_niels(&n->n, doNegate); + cond_negate_tw_niels(n->n, doNegate); } void copy_affine ( - struct affine_t* a, - const struct affine_t* ds + affine_a_t a, + const affine_a_t ds ) { field_copy ( a->x, ds->x ); field_copy ( a->y, ds->y ); @@ -480,8 +480,8 @@ copy_affine ( void copy_tw_affine ( - struct tw_affine_t* a, - const struct tw_affine_t* ds + tw_affine_a_t a, + const tw_affine_a_t ds ) { field_copy ( a->x, ds->x ); field_copy ( a->y, ds->y ); @@ -489,8 +489,8 @@ copy_tw_affine ( void copy_montgomery ( - struct montgomery_t* a, - const struct montgomery_t* ds + montgomery_a_t a, + const montgomery_a_t ds ) { field_copy ( a->z0, ds->z0 ); field_copy ( a->xd, ds->xd ); @@ -501,8 +501,8 @@ copy_montgomery ( void copy_extensible ( - struct extensible_t* a, - const struct extensible_t* ds + extensible_a_t a, + const extensible_a_t ds ) { field_copy ( a->x, ds->x ); field_copy ( a->y, ds->y ); @@ -513,8 +513,8 @@ copy_extensible ( void copy_tw_extensible ( - struct tw_extensible_t* a, - const struct tw_extensible_t* ds + tw_extensible_a_t a, + const tw_extensible_a_t ds ) { field_copy ( a->x, ds->x ); field_copy ( a->y, ds->y ); @@ -525,8 +525,8 @@ copy_tw_extensible ( void copy_tw_niels ( - struct tw_niels_t* a, - const struct tw_niels_t* ds + tw_niels_a_t a, + const tw_niels_a_t ds ) { field_copy ( a->a, ds->a ); field_copy ( a->b, ds->b ); @@ -535,10 +535,10 @@ copy_tw_niels ( void copy_tw_pniels ( - struct tw_pniels_t* a, - const struct tw_pniels_t* ds + tw_pniels_a_t a, + const tw_pniels_a_t ds ) { - copy_tw_niels( &a->n, &ds->n ); + copy_tw_niels( a->n, ds->n ); field_copy ( a->z, ds->z ); } diff --git a/src/include/magic.h b/src/include/magic.h index 1627a6b..1d186f2 100644 --- a/src/include/magic.h +++ b/src/include/magic.h @@ -50,7 +50,7 @@ extern const field_a_t sqrt_d_minus_1; /** * @brief The base point for Goldilocks. */ -extern const struct affine_t goldilocks_base_point; +extern const affine_a_t goldilocks_base_point; /** * @brief The Goldilocks prime subgroup order. diff --git a/src/include/scalarmul.h b/src/include/scalarmul.h index ecb1782..dab8a99 100644 --- a/src/include/scalarmul.h +++ b/src/include/scalarmul.h @@ -30,7 +30,7 @@ typedef word_t scalar_t[SCALAR_WORDS]; */ struct fixed_base_table_t { /** Comb tables containing multiples of the base point. */ - struct tw_niels_t *table; + tw_niels_a_t *table; /** Adjustments to the scalar in even and odd cases, respectively. */ word_t scalar_adjustments[2*SCALAR_WORDS]; @@ -109,7 +109,7 @@ montgomery_ladder ( */ void scalarmul ( - struct tw_extensible_t *working, + tw_extensible_a_t working, const word_t scalar[SCALAR_WORDS] /* TODO? 
int nbits */ ); @@ -130,7 +130,7 @@ scalarmul ( */ void scalarmul_vlook ( - struct tw_extensible_t *working, + tw_extensible_a_t working, const word_t scalar[SCALAR_WORDS] ); @@ -161,11 +161,11 @@ scalarmul_vlook ( mask_t precompute_fixed_base ( struct fixed_base_table_t *out, - const struct tw_extensible_t *base, + const tw_extensible_a_t base, unsigned int n, unsigned int t, unsigned int s, - struct tw_niels_t *prealloc + tw_niels_a_t *prealloc ) __attribute__((warn_unused_result)); /** @@ -197,7 +197,7 @@ destroy_fixed_base ( */ mask_t scalarmul_fixed_base ( - struct tw_extensible_t *out, + tw_extensible_a_t out, const word_t *scalar, unsigned int nbits, const struct fixed_base_table_t *table @@ -215,7 +215,7 @@ scalarmul_fixed_base ( */ void scalarmul_vt ( - struct tw_extensible_t *working, + tw_extensible_a_t working, const word_t *scalar, unsigned int nbits ); @@ -236,8 +236,8 @@ scalarmul_vt ( */ mask_t precompute_fixed_base_wnaf ( - struct tw_niels_t *out, - const struct tw_extensible_t *base, + tw_niels_a_t *out, + const tw_extensible_a_t base, unsigned int tbits ) __attribute__((warn_unused_result)); @@ -256,10 +256,10 @@ precompute_fixed_base_wnaf ( */ void scalarmul_fixed_base_wnaf_vt ( - struct tw_extensible_t *out, + tw_extensible_a_t out, const word_t *scalar, unsigned int nbits, - const struct tw_niels_t *precmp, + const tw_niels_a_t *precmp, unsigned int table_bits ); @@ -281,12 +281,12 @@ scalarmul_fixed_base_wnaf_vt ( */ void linear_combo_var_fixed_vt ( - struct tw_extensible_t *working, + tw_extensible_a_t working, const word_t scalar_var[SCALAR_WORDS], unsigned int nbits_var, const word_t scalar_pre[SCALAR_WORDS], unsigned int nbits_pre, - const struct tw_niels_t *precmp, + const tw_niels_a_t *precmp, unsigned int table_bits_pre ); @@ -309,7 +309,7 @@ linear_combo_var_fixed_vt ( */ mask_t linear_combo_combs_vt ( - struct tw_extensible_t *out, + tw_extensible_a_t out, const word_t scalar1[SCALAR_WORDS], unsigned int nbits1, const struct fixed_base_table_t *table1, diff --git a/src/include/sha512.h b/src/include/sha512.h index 760e31e..807f73d 100644 --- a/src/include/sha512.h +++ b/src/include/sha512.h @@ -17,28 +17,28 @@ extern "C" { * * This structure is opaque. 
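/* With the one-element-array context type introduced just below, call
 * sites declare the hash state like a value but pass it with no '&'.
 * A usage sketch assuming the sha512.h declarations in this patch
 * (the message and its length are arbitrary):
 */
void example_digest(uint8_t out[SHA512_OUTPUT_BYTES]) {
    sha512_ctx_a_t ctx;                                   /* stack storage */
    sha512_init(ctx);                                     /* was: &ctx     */
    sha512_update(ctx, (const unsigned char *)"abc", 3);
    sha512_final(ctx, out);
}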
*/ -struct sha512_ctx_t { +typedef struct { /** @privatesection */ uint64_t chain[8]; uint8_t block[128]; uint64_t nbytes; -}; +} sha512_ctx_a_t[1]; void sha512_init ( - struct sha512_ctx_t *ctx + sha512_ctx_a_t ctx ); void sha512_update ( - struct sha512_ctx_t *ctx, + sha512_ctx_a_t ctx, const unsigned char *data, uint64_t bytes ); void sha512_final ( - struct sha512_ctx_t *ctx, + sha512_ctx_a_t ctx, uint8_t result[SHA512_OUTPUT_BYTES] ); diff --git a/src/p448/magic.c b/src/p448/magic.c index 20c5fa5..74b433c 100644 --- a/src/p448/magic.c +++ b/src/p448/magic.c @@ -33,7 +33,7 @@ const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = { U64LE(0x0000000000000000) }; -const struct affine_t goldilocks_base_point = { +const affine_a_t goldilocks_base_point = {{ #ifdef USE_NEON_PERM {{{ 0xaed939f,0xc59d070,0xf0de840,0x5f065c3, 0xf4ba0c7,0xdf73324,0xc170033,0x3a6a26a, 0x4c63d96,0x4609845,0xf3932d9,0x1b4faff, 0x6147eaa,0xa2692ff,0x9cecfa9,0x297ea0e @@ -46,7 +46,7 @@ const struct affine_t goldilocks_base_point = { }}}, #endif {{{ 19 }}} -}; +}}; static const word_t curve_prime_order_lo[(224+WORD_BITS-1)/WORD_BITS] = { U64LE(0xdc873d6d54a7bb0d), diff --git a/src/p480/magic.c b/src/p480/magic.c index 8615071..7ae8304 100644 --- a/src/p480/magic.c +++ b/src/p480/magic.c @@ -35,7 +35,7 @@ const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = { 0x00000000 }; -const struct affine_t goldilocks_base_point = { +const affine_a_t goldilocks_base_point = {{ {{{ U60LE(0x849ff7f845c30d3), U60LE(0x7dda488553a4c5b), @@ -47,7 +47,7 @@ const struct affine_t goldilocks_base_point = { U60LE(0x7ca42af3d564280) }}}, {{{ 5 }}} -}; +}}; static const word_t curve_prime_order_lo[(240+WORD_BITS-1)/WORD_BITS] = { U64LE(0x72e70941cf8da597), diff --git a/src/p521/magic.c b/src/p521/magic.c index f8ab264..4613958 100644 --- a/src/p521/magic.c +++ b/src/p521/magic.c @@ -38,7 +38,7 @@ const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = { 0x0 }; -const struct affine_t goldilocks_base_point = { +const affine_a_t goldilocks_base_point = {{ {{{ #ifdef USE_P521_3x3_TRANSPOSE U58LE(0x02a940a2f19ba6c), @@ -66,7 +66,7 @@ const struct affine_t goldilocks_base_point = { #endif }}}, {{{ 12 }}} -}; +}}; static const word_t curve_prime_order_lo[(261+WORD_BITS-1)/WORD_BITS] = { U64LE(0xbf15dbca0ae7f295), diff --git a/src/scalarmul.c b/src/scalarmul.c index cf95984..12925b2 100644 --- a/src/scalarmul.c +++ b/src/scalarmul.c @@ -21,8 +21,8 @@ montgomery_ladder ( unsigned int nbits, unsigned int n_extra_doubles ) { - struct montgomery_t mont; - deserialize_montgomery(&mont, in); + montgomery_a_t mont; + deserialize_montgomery(mont, in); int i,j,n=(nbits-1)%WORD_BITS; mask_t pflip = 0; @@ -30,29 +30,29 @@ montgomery_ladder ( word_t w = scalar[j]; for (i=n; i>=0; i--) { mask_t flip = -((w>>i)&1); - constant_time_cond_swap(mont.xa,mont.xd,sizeof(mont.xd),flip^pflip); - constant_time_cond_swap(mont.za,mont.zd,sizeof(mont.xd),flip^pflip); - montgomery_step(&mont); + constant_time_cond_swap(mont->xa,mont->xd,sizeof(mont->xd),flip^pflip); + constant_time_cond_swap(mont->za,mont->zd,sizeof(mont->xd),flip^pflip); + montgomery_step(mont); pflip = flip; } n = WORD_BITS-1; } - constant_time_cond_swap(mont.xa,mont.xd,sizeof(mont.xd),pflip); - constant_time_cond_swap(mont.za,mont.zd,sizeof(mont.xd),pflip); + constant_time_cond_swap(mont->xa,mont->xd,sizeof(mont->xd),pflip); + constant_time_cond_swap(mont->za,mont->zd,sizeof(mont->xd),pflip); assert(n_extra_doubles < INT_MAX); for (j=0; j<(int)n_extra_doubles; j++) { - 
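/* The ladder above defers its conditional swaps: rather than swapping
 * before and after every step, it XORs the current bit's mask with the
 * previous one (flip ^ pflip), so the points are exchanged only when
 * consecutive scalar bits differ, and one final pflip swap settles the
 * state. A toy model of just the swap schedule, with word_t standing
 * in for the field elements:
 */
#include <stdint.h>

typedef uint64_t word_t;
typedef word_t mask_t;

static void toy_ladder_swaps(word_t w, int nbits, word_t *xa, word_t *xd) {
    mask_t pflip = 0;
    word_t delta;
    int i;
    for (i = nbits - 1; i >= 0; i--) {
        mask_t flip = -((w >> i) & 1);         /* 0 or all-ones */
        delta = (*xa ^ *xd) & (flip ^ pflip);  /* swap iff bits differ */
        *xa ^= delta;
        *xd ^= delta;
        /* montgomery_step would operate on (xa, xd) here */
        pflip = flip;
    }
    delta = (*xa ^ *xd) & pflip;               /* settle the last swap */
    *xa ^= delta;
    *xd ^= delta;
}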
montgomery_step(&mont); + montgomery_step(mont); } - return serialize_montgomery(out, &mont, in); + return serialize_montgomery(out, mont, in); } static __inline__ void __attribute__((unused,always_inline)) constant_time_lookup_tw_pniels ( - struct tw_pniels_t *out, - const struct tw_pniels_t *in, + tw_pniels_a_t out, + const tw_pniels_a_t *in, int nin, int idx ) { @@ -62,8 +62,8 @@ constant_time_lookup_tw_pniels ( static __inline__ void __attribute__((unused,always_inline)) constant_time_lookup_tw_niels ( - struct tw_niels_t *out, - const struct tw_niels_t *in, + tw_niels_a_t out, + const tw_niels_a_t *in, int nin, int idx ) { @@ -73,8 +73,8 @@ constant_time_lookup_tw_niels ( /* static __inline__ void constant_time_lookup_tw_pniels ( - struct tw_pniels_t *out, - const struct tw_pniels_t *in, + tw_pniels_a_t out, + const tw_pniels_a_t in, int nin, int idx ) { @@ -95,8 +95,8 @@ constant_time_lookup_tw_pniels ( static __inline__ void constant_time_lookup_tw_niels ( - struct tw_niels_t *out, - const struct tw_niels_t *in, + tw_niels_a_t out, + const tw_niels_a_t in, int nin, int idx ) { @@ -145,7 +145,7 @@ convert_to_signed_window_form ( void scalarmul ( - struct tw_extensible_t *working, + tw_extensible_a_t working, const word_t scalar[SCALAR_WORDS] ) { const int WINDOW = SCALARMUL_FIXED_WINDOW_SIZE, @@ -159,20 +159,20 @@ scalarmul ( SCALARMUL_FIXED_WINDOW_ADJUSTMENT, SCALAR_WORDS ); - struct tw_extensible_t tabulator; - copy_tw_extensible(&tabulator, working); - double_tw_extensible(&tabulator); + tw_extensible_a_t tabulator; + copy_tw_extensible(tabulator, working); + double_tw_extensible(tabulator); - struct tw_pniels_t - pn VECTOR_ALIGNED, - multiples[NTABLE] VECTOR_ALIGNED; - convert_tw_extensible_to_tw_pniels(&pn, &tabulator); - convert_tw_extensible_to_tw_pniels(&multiples[0], working); + tw_pniels_a_t + pn VECTOR_ALIGNED, + multiples[NTABLE] VECTOR_ALIGNED; + convert_tw_extensible_to_tw_pniels(pn, tabulator); + convert_tw_extensible_to_tw_pniels(multiples[0], working); int i,j; for (i=1; i>(WINDOW-1))-1; bits ^= inv; - constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK); - cond_negate_tw_pniels(&pn, inv); - convert_tw_pniels_to_tw_extensible(working, &pn); + constant_time_lookup_tw_pniels(pn, multiples, NTABLE, bits & WINDOW_T_MASK); + cond_negate_tw_pniels(pn, inv); + convert_tw_pniels_to_tw_extensible(working, pn); for (i-=WINDOW; i>=0; i-=WINDOW) { @@ -200,15 +200,15 @@ scalarmul ( inv = (bits>>(WINDOW-1))-1; bits ^= inv; - constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK); - cond_negate_tw_pniels(&pn, inv); - add_tw_pniels_to_tw_extensible(working, &pn); + constant_time_lookup_tw_pniels(pn, multiples, NTABLE, bits & WINDOW_T_MASK); + cond_negate_tw_pniels(pn, inv); + add_tw_pniels_to_tw_extensible(working, pn); } } void scalarmul_vlook ( - struct tw_extensible_t *working, + tw_extensible_a_t working, const word_t scalar[SCALAR_WORDS] ) { const int WINDOW = SCALARMUL_FIXED_WINDOW_SIZE, @@ -223,20 +223,20 @@ scalarmul_vlook ( ); - struct tw_extensible_t tabulator; - copy_tw_extensible(&tabulator, working); - double_tw_extensible(&tabulator); + tw_extensible_a_t tabulator; + copy_tw_extensible(tabulator, working); + double_tw_extensible(tabulator); - struct tw_pniels_t - pn VECTOR_ALIGNED, - multiples[NTABLE] VECTOR_ALIGNED; - convert_tw_extensible_to_tw_pniels(&pn, &tabulator); - convert_tw_extensible_to_tw_pniels(&multiples[0], working); + tw_pniels_a_t + pn VECTOR_ALIGNED, + multiples[NTABLE] VECTOR_ALIGNED; + 
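/* The pair "inv = (bits>>(WINDOW-1))-1; bits ^= inv;" implements the
 * signed-window decode branchlessly: inv is all-ones exactly when the
 * window's top bit is clear, so one mask both complements the table
 * index and drives the constant-time negation. Each window value b
 * then selects the odd digit 2*b - (2^WINDOW - 1). A toy decoder (the
 * real code looks up (2*idx+1)*P in `multiples` instead of returning
 * an integer):
 */
#include <stdio.h>
#include <stdint.h>

#define TOY_WINDOW 5
#define TOY_WINDOW_T_MASK ((1u << (TOY_WINDOW - 1)) - 1)

static int toy_decode_window(uint32_t bits) {
    uint32_t inv = (bits >> (TOY_WINDOW - 1)) - 1;  /* 0 or 0xffffffff */
    bits ^= inv;
    int digit = 2 * (int)(bits & TOY_WINDOW_T_MASK) + 1;
    return inv ? -digit : digit;  /* the library negates the point by mask */
}

int main(void) {
    uint32_t b;
    for (b = 0; b < (1u << TOY_WINDOW); b++)
        printf("window %2u -> digit %+3d\n", b, toy_decode_window(b));
    return 0;
}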
convert_tw_extensible_to_tw_pniels(pn, tabulator); + convert_tw_extensible_to_tw_pniels(multiples[0], working); int i,j; for (i=1; i>(WINDOW-1))-1; bits ^= inv; - copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]); - cond_negate_tw_pniels(&pn, inv); - convert_tw_pniels_to_tw_extensible(working, &pn); + copy_tw_pniels(pn, multiples[bits & WINDOW_T_MASK]); + cond_negate_tw_pniels(pn, inv); + convert_tw_pniels_to_tw_extensible(working, pn); for (i-=WINDOW; i>=0; i-=WINDOW) { @@ -264,9 +264,9 @@ scalarmul_vlook ( inv = (bits>>(WINDOW-1))-1; bits ^= inv; - copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]); - cond_negate_tw_pniels(&pn, inv); - add_tw_pniels_to_tw_extensible(working, &pn); + copy_tw_pniels(pn, multiples[bits & WINDOW_T_MASK]); + cond_negate_tw_pniels(pn, inv); + add_tw_pniels_to_tw_extensible(working, pn); } } @@ -275,7 +275,7 @@ schedule_scalar_for_combs ( word_t *scalar2, const word_t *scalar, unsigned int nbits, - const struct fixed_base_table_t *table + const struct fixed_base_table_t* table ) { unsigned int i; unsigned int n = table->n, t = table->t, s = table->s; @@ -312,10 +312,10 @@ schedule_scalar_for_combs ( mask_t scalarmul_fixed_base ( - struct tw_extensible_t *out, + tw_extensible_a_t out, const word_t scalar[SCALAR_WORDS], unsigned int nbits, - const struct fixed_base_table_t *table + const struct fixed_base_table_t* table ) { unsigned int i,j,k; unsigned int n = table->n, t = table->t, s = table->s; @@ -332,7 +332,7 @@ scalarmul_fixed_base ( assert(t >= 1); #endif - struct tw_niels_t ni; + tw_niels_a_t ni; for (i=0; itable + (j<<(t-1)), 1<<(t-1), tab); - cond_negate_tw_niels(&ni, invert); + constant_time_lookup_tw_niels(ni, table->table + (j<<(t-1)), 1<<(t-1), tab); + cond_negate_tw_niels(ni, invert); if (i||j) { - add_tw_niels_to_tw_extensible(out, &ni); + add_tw_niels_to_tw_extensible(out, ni); } else { - convert_tw_niels_to_tw_extensible(out, &ni); + convert_tw_niels_to_tw_extensible(out, ni); } } } @@ -370,13 +370,13 @@ scalarmul_fixed_base ( mask_t linear_combo_combs_vt ( - struct tw_extensible_t *out, + tw_extensible_a_t out, const word_t scalar1[SCALAR_WORDS], unsigned int nbits1, - const struct fixed_base_table_t *table1, + const struct fixed_base_table_t* table1, const word_t scalar2[SCALAR_WORDS], unsigned int nbits2, - const struct fixed_base_table_t *table2 + const struct fixed_base_table_t* table2 ) { unsigned int i,j,k,sc; unsigned int s1 = table1->s, s2 = table2->s, smax = (s1 > s2) ? s1 : s2; @@ -402,7 +402,7 @@ linear_combo_combs_vt ( assert(table2->t >= 1); #endif - struct tw_niels_t ni; + tw_niels_a_t ni; unsigned int swords[2] = {scalar1b_words, scalar2b_words}, started = 0; word_t *scalars[2] = {scalar1b,scalar2b}; @@ -411,7 +411,7 @@ linear_combo_combs_vt ( if (i) double_tw_extensible(out); for (sc=0; sc<2; sc++) { - const struct fixed_base_table_t *table = sc ? table2 : table1; + const struct fixed_base_table_t* table = sc ? 
table2 : table1; int ii = i-smax+table->s; if (ii < 0) continue; @@ -432,13 +432,13 @@ linear_combo_combs_vt ( tab ^= invert; tab &= (1<<(table->t-1)) - 1; - copy_tw_niels(&ni, &table->table[tab + (j<<(table->t-1))]); - cond_negate_tw_niels(&ni,invert); + copy_tw_niels(ni, table->table[tab + (j<<(table->t-1))]); + cond_negate_tw_niels(ni,invert); if (started) { - add_tw_niels_to_tw_extensible(out, &ni); + add_tw_niels_to_tw_extensible(out, ni); } else { - convert_tw_niels_to_tw_extensible(out, &ni); + convert_tw_niels_to_tw_extensible(out, ni); started = 1; } @@ -454,12 +454,12 @@ linear_combo_combs_vt ( mask_t precompute_fixed_base ( - struct fixed_base_table_t *out, - const struct tw_extensible_t *base, + struct fixed_base_table_t* out, + const tw_extensible_a_t base, unsigned int n, unsigned int t, unsigned int s, - struct tw_niels_t *prealloc + tw_niels_a_t *prealloc ) { if (s < 1 || t < 1 || n < 1 || n*t*s < SCALAR_BITS) { really_memset(out, 0, sizeof(*out)); @@ -470,19 +470,19 @@ precompute_fixed_base ( out->t = t; out->s = s; - struct tw_extensible_t working, start; - copy_tw_extensible(&working, base); - struct tw_pniels_t pn_tmp; + tw_extensible_a_t working, start; + copy_tw_extensible(working, base); + tw_pniels_a_t pn_tmp; - struct tw_pniels_t *doubles = (struct tw_pniels_t *) malloc_vector(sizeof(*doubles) * (t-1)); + tw_pniels_a_t *doubles = (tw_pniels_a_t *) malloc_vector(sizeof(*doubles) * (t-1)); field_a_t *zs = (field_a_t *) malloc_vector(sizeof(*zs) * (n<<(t-1))); field_a_t *zis = (field_a_t *) malloc_vector(sizeof(*zis) * (n<<(t-1))); - struct tw_niels_t *table = prealloc; + tw_niels_a_t *table = prealloc; if (prealloc) { out->own_table = 0; } else { - table = (struct tw_niels_t *) malloc_vector(sizeof(*table) * (n<<(t-1))); + table = (tw_niels_a_t *) malloc_vector(sizeof(*table) * (n<<(t-1))); out->own_table = 1; } out->table = table; @@ -535,23 +535,23 @@ precompute_fixed_base ( /* doubling phase */ for (j=0; j>1); int idx = (((i+1)<<(t-1))-1) ^ gray; - convert_tw_extensible_to_tw_pniels(&pn_tmp, &start); - copy_tw_niels(&table[idx], &pn_tmp.n); - field_copy(zs[idx], pn_tmp.z); + convert_tw_extensible_to_tw_pniels(pn_tmp, start); + copy_tw_niels(table[idx], pn_tmp->n); + field_copy(zs[idx], pn_tmp->z); if (j >= (1u<<(t-1)) - 1) break; int delta = (j+1) ^ ((j+1)>>1) ^ gray; @@ -572,10 +572,10 @@ precompute_fixed_base ( if (gray & (1<a, zis[i]); field_strong_reduce(product); - field_copy(table[i].a, product); + field_copy(table[i]->a, product); - field_mul(product, table[i].b, zis[i]); + field_mul(product, table[i]->b, zis[i]); field_strong_reduce(product); - field_copy(table[i].b, product); + field_copy(table[i]->b, product); - field_mul(product, table[i].c, zis[i]); + field_mul(product, table[i]->c, zis[i]); field_strong_reduce(product); - field_copy(table[i].c, product); + field_copy(table[i]->c, product); } mask_t ret = ~field_is_zero(zis[0]); @@ -617,7 +617,7 @@ precompute_fixed_base ( void destroy_fixed_base ( - struct fixed_base_table_t *table + struct fixed_base_table_t* table ) { if (table->table) { really_memset(table->table,0,sizeof(*table->table)*(table->n<<(table->t-1))); @@ -630,8 +630,8 @@ destroy_fixed_base ( mask_t precompute_fixed_base_wnaf ( - struct tw_niels_t *out, - const struct tw_extensible_t *const_base, + tw_niels_a_t *out, + const tw_extensible_a_t const_base, unsigned int tbits ) { int i; @@ -644,29 +644,29 @@ precompute_fixed_base_wnaf ( return 0; } - struct tw_extensible_t base; - copy_tw_extensible(&base,const_base); + tw_extensible_a_t 
base; + copy_tw_extensible(base,const_base); - struct tw_pniels_t twop, tmp; + tw_pniels_a_t twop, tmp; - convert_tw_extensible_to_tw_pniels(&tmp, &base); - field_copy(zs[0], tmp.z); - copy_tw_niels(&out[0], &tmp.n); + convert_tw_extensible_to_tw_pniels(tmp, base); + field_copy(zs[0], tmp->z); + copy_tw_niels(out[0], tmp->n); if (tbits > 0) { - double_tw_extensible(&base); - convert_tw_extensible_to_tw_pniels(&twop, &base); - add_tw_pniels_to_tw_extensible(&base, &tmp); + double_tw_extensible(base); + convert_tw_extensible_to_tw_pniels(twop, base); + add_tw_pniels_to_tw_extensible(base, tmp); - convert_tw_extensible_to_tw_pniels(&tmp, &base); - field_copy(zs[1], tmp.z); - copy_tw_niels(&out[1], &tmp.n); + convert_tw_extensible_to_tw_pniels(tmp, base); + field_copy(zs[1], tmp->z); + copy_tw_niels(out[1], tmp->n); for (i=2; i < 1<z); + copy_tw_niels(out[i], tmp->n); } } @@ -674,17 +674,17 @@ precompute_fixed_base_wnaf ( field_a_t product; for (i=0; i<1<a, zis[i]); field_strong_reduce(product); - field_copy(out[i].a, product); + field_copy(out[i]->a, product); - field_mul(product, out[i].b, zis[i]); + field_mul(product, out[i]->b, zis[i]); field_strong_reduce(product); - field_copy(out[i].b, product); + field_copy(out[i]->b, product); - field_mul(product, out[i].c, zis[i]); + field_mul(product, out[i]->c, zis[i]); field_strong_reduce(product); - field_copy(out[i].c, product); + field_copy(out[i]->c, product); } free(zs); @@ -760,31 +760,31 @@ recode_wnaf( static void prepare_wnaf_table( - struct tw_pniels_t *output, - struct tw_extensible_t *working, + tw_pniels_a_t *output, + tw_extensible_a_t working, unsigned int tbits ) { int i; - convert_tw_extensible_to_tw_pniels(&output[0], working); + convert_tw_extensible_to_tw_pniels(output[0], working); if (tbits == 0) return; double_tw_extensible(working); - struct tw_pniels_t twop; - convert_tw_extensible_to_tw_pniels(&twop, working); + tw_pniels_a_t twop; + convert_tw_extensible_to_tw_pniels(twop, working); - add_tw_pniels_to_tw_extensible(working, &output[0]); - convert_tw_extensible_to_tw_pniels(&output[1], working); + add_tw_pniels_to_tw_extensible(working, output[0]); + convert_tw_extensible_to_tw_pniels(output[1], working); for (i=2; i < 1< 0) { assert(control[0].addend > 0); assert(control[0].power >= 0); - convert_tw_pniels_to_tw_extensible(working, &precmp[control[0].addend >> 1]); + convert_tw_pniels_to_tw_extensible(working, precmp[control[0].addend >> 1]); } else { set_identity_tw_extensible(working); return; @@ -813,9 +813,9 @@ scalarmul_vt ( assert(control[conti].addend); if (control[conti].addend > 0) { - add_tw_pniels_to_tw_extensible(working, &precmp[control[conti].addend >> 1]); + add_tw_pniels_to_tw_extensible(working, precmp[control[conti].addend >> 1]); } else { - sub_tw_pniels_from_tw_extensible(working, &precmp[(-control[conti].addend) >> 1]); + sub_tw_pniels_from_tw_extensible(working, precmp[(-control[conti].addend) >> 1]); } conti++; assert(conti <= control_bits); @@ -825,10 +825,10 @@ scalarmul_vt ( void scalarmul_fixed_base_wnaf_vt ( - struct tw_extensible_t *working, + tw_extensible_a_t working, const word_t scalar[SCALAR_WORDS], unsigned int nbits, - const struct tw_niels_t *precmp, + const tw_niels_a_t *precmp, unsigned int table_bits ) { struct smvt_control control[nbits/(table_bits+1)+3]; @@ -838,7 +838,7 @@ scalarmul_fixed_base_wnaf_vt ( if (control_bits > 0) { assert(control[0].addend > 0); assert(control[0].power >= 0); - convert_tw_niels_to_tw_extensible(working, &precmp[control[0].addend >> 1]); + 
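/* The wNAF routines above communicate through (power, addend) control
 * entries: each one means "double until bit `power`, then add the odd
 * multiple indexed by addend>>1, subtracting instead when addend is
 * negative". A minimal signed-wNAF recoder in that spirit; assumptions:
 * a small single-word unsigned scalar and toy types, whereas the
 * library's recode_wnaf handles multi-word scalars:
 */
#include <stdio.h>

struct toy_control { int power, addend; };

static int toy_recode_wnaf(struct toy_control *out, unsigned x, int w) {
    int n = 0, pos = 0;
    while (x) {
        if (x & 1) {
            int d = (int)(x & ((1u << (w + 1)) - 1)); /* low w+1 bits   */
            if (d >= 1 << w) d -= 1 << (w + 1);       /* signed residue */
            out[n].power = pos;
            out[n].addend = d;                        /* always odd     */
            n++;
            x -= (unsigned)d;                         /* remove the digit */
        }
        x >>= 1;
        pos++;
    }
    return n;  /* entries ordered from low bit to high */
}

int main(void) {
    struct toy_control c[40];
    int i, n = toy_recode_wnaf(c, 2015u, 2);
    for (i = n - 1; i >= 0; i--)  /* print high power first */
        printf("power %d, addend %d\n", c[i].power, c[i].addend);
    return 0;
}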
convert_tw_niels_to_tw_extensible(working, precmp[control[0].addend >> 1]); } else { set_identity_tw_extensible(working); return; @@ -853,9 +853,9 @@ scalarmul_fixed_base_wnaf_vt ( assert(control[conti].addend); if (control[conti].addend > 0) { - add_tw_niels_to_tw_extensible(working, &precmp[control[conti].addend >> 1]); + add_tw_niels_to_tw_extensible(working, precmp[control[conti].addend >> 1]); } else { - sub_tw_niels_from_tw_extensible(working, &precmp[(-control[conti].addend) >> 1]); + sub_tw_niels_from_tw_extensible(working, precmp[(-control[conti].addend) >> 1]); } } @@ -866,12 +866,12 @@ scalarmul_fixed_base_wnaf_vt ( void linear_combo_var_fixed_vt( - struct tw_extensible_t *working, + tw_extensible_a_t working, const word_t scalar_var[SCALAR_WORDS], unsigned int nbits_var, const word_t scalar_pre[SCALAR_WORDS], unsigned int nbits_pre, - const struct tw_niels_t *precmp, + const tw_niels_a_t *precmp, unsigned int table_bits_pre ) { const int table_bits_var = SCALARMUL_WNAF_COMBO_TABLE_BITS; @@ -883,22 +883,22 @@ linear_combo_var_fixed_vt( (void)ncb_var; (void)ncb_pre; - struct tw_pniels_t precmp_var[1< control_pre[0].power) { - convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]); + convert_tw_pniels_to_tw_extensible(working, precmp_var[control_var[0].addend >> 1]); contv++; } else if (i == control_pre[0].power && i >=0 ) { - convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]); - add_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]); + convert_tw_pniels_to_tw_extensible(working, precmp_var[control_var[0].addend >> 1]); + add_tw_niels_to_tw_extensible(working, precmp[control_pre[0].addend >> 1]); contv++; contp++; } else { i = control_pre[0].power; - convert_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]); + convert_tw_niels_to_tw_extensible(working, precmp[control_pre[0].addend >> 1]); contp++; } @@ -914,9 +914,9 @@ linear_combo_var_fixed_vt( assert(control_var[contv].addend); if (control_var[contv].addend > 0) { - add_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[contv].addend >> 1]); + add_tw_pniels_to_tw_extensible(working, precmp_var[control_var[contv].addend >> 1]); } else { - sub_tw_pniels_from_tw_extensible(working, &precmp_var[(-control_var[contv].addend) >> 1]); + sub_tw_pniels_from_tw_extensible(working, precmp_var[(-control_var[contv].addend) >> 1]); } contv++; } @@ -925,9 +925,9 @@ linear_combo_var_fixed_vt( assert(control_pre[contp].addend); if (control_pre[contp].addend > 0) { - add_tw_niels_to_tw_extensible(working, &precmp[control_pre[contp].addend >> 1]); + add_tw_niels_to_tw_extensible(working, precmp[control_pre[contp].addend >> 1]); } else { - sub_tw_niels_from_tw_extensible(working, &precmp[(-control_pre[contp].addend) >> 1]); + sub_tw_niels_from_tw_extensible(working, precmp[(-control_pre[contp].addend) >> 1]); } contp++; } diff --git a/src/sha512.c b/src/sha512.c index 82f81ad..9a11bd0 100644 --- a/src/sha512.c +++ b/src/sha512.c @@ -72,7 +72,7 @@ static inline uint64_t maj(uint64_t h1, uint64_t h2, uint64_t h3) { static void sha512_process_block ( - struct sha512_ctx_t *ctx + sha512_ctx_a_t ctx ) { uint64_t i, tmp, a, b, *w = (uint64_t *) ctx->block, @@ -119,7 +119,7 @@ sha512_process_block ( void sha512_init ( - struct sha512_ctx_t *ctx + sha512_ctx_a_t ctx ) { ctx->nbytes = 0; memcpy(ctx->chain, sha512_init_state, sizeof(sha512_init_state)); @@ -128,7 +128,7 @@ sha512_init ( void sha512_update ( - struct sha512_ctx_t *ctx, + 
sha512_ctx_a_t ctx, const unsigned char *data, uint64_t bytes ) { @@ -153,7 +153,7 @@ sha512_update ( void sha512_final ( - struct sha512_ctx_t *ctx, + sha512_ctx_a_t ctx, uint8_t result[64] ) { uint64_t fill = ctx->nbytes % 128, i; diff --git a/test/bench.c b/test/bench.c index 350415b..2c16c65 100644 --- a/test/bench.c +++ b/test/bench.c @@ -146,19 +146,19 @@ int main(int argc, char **argv) { when = now() - when; printf("rand448: %5.1fns\n", when * 1e9 / i); - struct sha512_ctx_t sha; + sha512_ctx_a_t sha; uint8_t hashout[128]; when = now(); for (i=0; i Date: Thu, 22 Jan 2015 17:45:36 -0800 Subject: [PATCH 06/15] fix performance regression for montgomery ladder --- src/ec_point.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ec_point.c b/src/ec_point.c index 2582372..613a12e 100644 --- a/src/ec_point.c +++ b/src/ec_point.c @@ -249,20 +249,20 @@ montgomery_step ( ANALYZE_THIS_ROUTINE_CAREFULLY; field_a_t L0, L1; field_add_nr ( L0, a->zd, a->xd ); - field_sub ( L1, a->xd, a->zd ); - field_sub ( a->zd, a->xa, a->za ); + field_subx_nr ( L1, a->xd, a->zd ); + field_subx_nr ( a->zd, a->xa, a->za ); field_mul ( a->xd, L0, a->zd ); field_add_nr ( a->zd, a->za, a->xa ); field_mul ( a->za, L1, a->zd ); field_add_nr ( a->xa, a->za, a->xd ); field_sqr ( a->zd, a->xa ); field_mul ( a->xa, a->z0, a->zd ); - field_sub ( a->zd, a->xd, a->za ); + field_subx_nr ( a->zd, a->xd, a->za ); field_sqr ( a->za, a->zd ); field_sqr ( a->xd, L0 ); field_sqr ( L0, L1 ); field_mulw_scc ( a->zd, a->xd, 1-EDWARDS_D ); /* FIXME PERF MULW */ - field_sub ( L1, a->xd, L0 ); + field_subx_nr ( L1, a->xd, L0 ); field_mul ( a->xd, L0, a->zd ); field_sub_nr ( L0, a->zd, L1 ); field_bias ( L0, 4 - 2*is32 /*is32 ? 2 : 4*/ ); From d2e5e0fe8ccb343f5e5675f388b49a3ebe454640 Mon Sep 17 00:00:00 2001 From: Mike Hamburg Date: Thu, 22 Jan 2015 18:06:35 -0800 Subject: [PATCH 07/15] const** related fixes for gcc --- src/goldilocks.c | 2 +- src/scalarmul.c | 56 ++++--------------------------------------- test/bench.c | 10 ++++---- test/test_scalarmul.c | 4 ++-- 4 files changed, 13 insertions(+), 59 deletions(-) diff --git a/src/goldilocks.c b/src/goldilocks.c index 1c647f4..32c9a12 100644 --- a/src/goldilocks.c +++ b/src/goldilocks.c @@ -462,7 +462,7 @@ goldilocks_verify ( linear_combo_var_fixed_vt( pk_text, challenge, GOLDI_SCALAR_BITS, s, GOLDI_SCALAR_BITS, - goldilocks_global.wnafs, WNAF_PRECMP_BITS ); + (const tw_niels_a_t*)goldilocks_global.wnafs, WNAF_PRECMP_BITS ); untwist_and_double_and_serialize( pk, pk_text ); diff --git a/src/scalarmul.c b/src/scalarmul.c index 12925b2..af7f72a 100644 --- a/src/scalarmul.c +++ b/src/scalarmul.c @@ -70,52 +70,6 @@ constant_time_lookup_tw_niels ( constant_time_lookup(out,in,sizeof(*out),nin,idx); } -/* -static __inline__ void -constant_time_lookup_tw_pniels ( - tw_pniels_a_t out, - const tw_pniels_a_t in, - int nin, - int idx -) { - big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx); - big_register_t *o = (big_register_t *)out; - const big_register_t *i = (const big_register_t *)in; - int j; - unsigned int k; - - really_memset(out, 0, sizeof(*out)); - for (j=0; j>(WINDOW-1))-1; bits ^= inv; - constant_time_lookup_tw_pniels(pn, multiples, NTABLE, bits & WINDOW_T_MASK); + constant_time_lookup_tw_pniels(pn, (const tw_pniels_a_t*)multiples, NTABLE, bits & WINDOW_T_MASK); cond_negate_tw_pniels(pn, inv); convert_tw_pniels_to_tw_extensible(working, pn); @@ -200,7 +154,7 @@ scalarmul ( inv = (bits>>(WINDOW-1))-1; bits ^= inv; - 
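/* The casts added in this patch work around a C constraint: with the
 * T_a_t[1] typedefs, `const tw_pniels_a_t *` is a pointer to
 * const-qualified arrays, and C has no implicit conversion from
 * T (*)[1] to const T (*)[1], so gcc warns without the explicit cast.
 * A minimal reproduction with a hypothetical elem_t:
 */
typedef struct { int v; } elem_t;
typedef elem_t elem_a_t[1];

static int toy_sum(const elem_a_t *in, int n) {
    int i, s = 0;
    for (i = 0; i < n; i++)
        s += in[i]->v;
    return s;
}

int main(void) {
    elem_a_t table[4] = { {{1}}, {{2}}, {{3}}, {{4}} };
    /* toy_sum(table, 4) draws -Wincompatible-pointer-types; the cast
     * mirrors what the patch does: */
    return toy_sum((const elem_a_t *)table, 4) == 10 ? 0 : 1;
}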
constant_time_lookup_tw_pniels(pn, multiples, NTABLE, bits & WINDOW_T_MASK); + constant_time_lookup_tw_pniels(pn, (const tw_pniels_a_t*)multiples, NTABLE, bits & WINDOW_T_MASK); cond_negate_tw_pniels(pn, inv); add_tw_pniels_to_tw_extensible(working, pn); } @@ -355,7 +309,7 @@ scalarmul_fixed_base ( tab ^= invert; tab &= (1<<(t-1)) - 1; - constant_time_lookup_tw_niels(ni, table->table + (j<<(t-1)), 1<<(t-1), tab); + constant_time_lookup_tw_niels(ni, (const tw_niels_a_t*)table->table + (j<<(t-1)), 1<<(t-1), tab); cond_negate_tw_niels(ni, invert); if (i||j) { add_tw_niels_to_tw_extensible(out, ni); @@ -582,7 +536,7 @@ precompute_fixed_base ( } } - field_simultaneous_invert(zis, zs, n<<(t-1)); + field_simultaneous_invert(zis, (const field_a_t*)zs, n<<(t-1)); field_a_t product; for (i=0; i Date: Thu, 22 Jan 2015 18:25:17 -0800 Subject: [PATCH 08/15] fix test issue found by scan-build --- test/test_pointops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_pointops.c b/test/test_pointops.c index 5f0ec09..bf53afd 100644 --- a/test/test_pointops.c +++ b/test/test_pointops.c @@ -295,7 +295,7 @@ int test_pointops (void) { } ret = single_twisting_test(&base); - //if (ret) return ret; + if (ret) return ret; } return 0; From 57e721ec6b8c157e32ba6bdb8411f750d735124b Mon Sep 17 00:00:00 2001 From: Mike Hamburg Date: Thu, 22 Jan 2015 18:52:04 -0800 Subject: [PATCH 09/15] fix(?) perf regr in verify pre --- src/scalarmul.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/scalarmul.c b/src/scalarmul.c index af7f72a..93d9443 100644 --- a/src/scalarmul.c +++ b/src/scalarmul.c @@ -356,11 +356,13 @@ linear_combo_combs_vt ( assert(table2->t >= 1); #endif - tw_niels_a_t ni; + const struct tw_niels_t *ni; - unsigned int swords[2] = {scalar1b_words, scalar2b_words}, started = 0; + unsigned int swords[2] = {scalar1b_words, scalar2b_words}; word_t *scalars[2] = {scalar1b,scalar2b}; + set_identity_tw_extensible(out); + for (i=0; it-1)) - 1; - copy_tw_niels(ni, table->table[tab + (j<<(table->t-1))]); - cond_negate_tw_niels(ni,invert); + ni = table->table[tab + (j<<(table->t-1))]; - if (started) { - add_tw_niels_to_tw_extensible(out, ni); - } else { - convert_tw_niels_to_tw_extensible(out, ni); - started = 1; - } - + if (invert) sub_tw_niels_from_tw_extensible(out, ni); + else add_tw_niels_to_tw_extensible(out, ni); } } - - assert(started); } return MASK_SUCCESS; From f4424c3d17fdddedf9b745a6bdc31029749b0633 Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Fri, 23 Jan 2015 15:38:43 -0800 Subject: [PATCH 10/15] fix sha512 goof on p521 --- src/goldilocks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/goldilocks.c b/src/goldilocks.c index 32c9a12..866ed10 100644 --- a/src/goldilocks.c +++ b/src/goldilocks.c @@ -41,7 +41,7 @@ #else #define FIELD_HASH_BYTES (SHA512_OUTPUT_BYTES * ((FIELD_BYTES-1)/SHA512_OUTPUT_BYTES + 1)) static inline void field_hash_final ( - sha512_ctx_a_t *ctx, + sha512_ctx_a_t ctx, unsigned char out[FIELD_HASH_BYTES] ) { /* SHA PRNG I guess? I really should have used SHAKE */ From de6d61e55437969f68fd0529f90456d7e1c79406 Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Sun, 1 Feb 2015 10:16:49 -0800 Subject: [PATCH 11/15] fuse crandom seed+buffer because they are accessed as a single object. 
probably needs a bit more testing though --- include/goldilocks.h | 2 +- src/crandom.c | 20 ++++++++++---------- src/include/crandom.h | 7 ++++--- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/goldilocks.h b/include/goldilocks.h index 1631c2f..e4d4496 100644 --- a/include/goldilocks.h +++ b/include/goldilocks.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2014 Cryptography Research, Inc. +/* Copyright (c) 2014-2015 Cryptography Research, Inc. * Released under the MIT License. See LICENSE.txt for license information. */ diff --git a/src/crandom.c b/src/crandom.c index 4f36644..83999c9 100644 --- a/src/crandom.c +++ b/src/crandom.c @@ -319,7 +319,7 @@ crandom_init_from_file( ssize_t offset = 0, red; do { - red = read(state->randomfd, state->seed + offset, 32 - offset); + red = read(state->randomfd, state->seedBuffer + offset, 32 - offset); if (red > 0) offset += red; } while (red > 0 && offset < 32); @@ -328,7 +328,7 @@ crandom_init_from_file( return err ? err : -1; } - memset(state->buffer, 0, 96); + memset(state->seedBuffer+32, 0, 96); state->magic = CRANDOM_MAGIC; state->reseeds_mandatory = reseeds_mandatory; @@ -341,8 +341,8 @@ crandom_init_from_buffer( crandom_state_a_t state, const char initial_seed[32] ) { - memcpy(state->seed, initial_seed, 32); - memset(state->buffer, 0, 96); + memcpy(state->seedBuffer, initial_seed, 32); + memset(state->seedBuffer+32, 0, 96); state->reseed_countdown = state->reseed_interval = state->fill = state->ctr = state->reseeds_mandatory = 0; state->randomfd = -1; state->magic = CRANDOM_MAGIC; @@ -425,7 +425,7 @@ crandom_generate( state->reseed_countdown = state->reseed_interval; ssize_t offset = 0, red; do { - red = read(state->randomfd, state->buffer + offset, 32 - offset); + red = read(state->randomfd, state->seedBuffer + 32 + offset, 32 - offset); if (red > 0) offset += red; } while (red > 0 && offset < 32); @@ -454,19 +454,19 @@ crandom_generate( int i; for (i=0; i<32; i++) { /* Stir in the buffer. If somehow the read failed, it'll be zeros. */ - state->seed[i] ^= state->buffer[i]; + state->seedBuffer[i] ^= state->seedBuffer[i+32]; } } } - crandom_chacha_expand(iv,state->ctr,20,128,state->seed,state->seed); + crandom_chacha_expand(iv,state->ctr,20,128,state->seedBuffer,state->seedBuffer); state->ctr++; - state->fill = sizeof(state->buffer); + state->fill = sizeof(state->seedBuffer)-32; } unsigned long long copy = (length > state->fill) ? state->fill : length; state->fill -= copy; - memcpy(output, state->buffer + state->fill, copy); - really_memset(state->buffer + state->fill, 0, copy); + memcpy(output, state->seedBuffer + 32 + state->fill, copy); + really_memset(state->seedBuffer + 32 + state->fill, 0, copy); output += copy; length -= copy; } diff --git a/src/include/crandom.h b/src/include/crandom.h index c9f4c26..06dc583 100644 --- a/src/include/crandom.h +++ b/src/include/crandom.h @@ -1,5 +1,5 @@ /* Copyright (c) 2011 Stanford University. - * Copyright (c) 2014 Cryptography Research, Inc. + * Copyright (c) 2014-2015 Cryptography Research, Inc. * Released under the MIT License. See LICENSE.txt for license information. 
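/* Layout note for the fusion in this patch: the first 32 bytes of
 * seedBuffer are the ChaCha key (the old `seed`) and the next 96 bytes
 * are the output buffer (the old `buffer`), so the single 128-byte
 * crandom_chacha_expand call refills both at once. A sketch of the
 * offsets, mirroring the memcpy/memset in crandom_init_from_buffer
 * (sizes are the library's constants):
 */
#include <string.h>

#define TOY_SEED_BYTES   32
#define TOY_BUFFER_BYTES 96

struct toy_state {
    unsigned char seedBuffer[TOY_SEED_BYTES + TOY_BUFFER_BYTES];
};

static void toy_set_seed(struct toy_state *st,
                         const unsigned char seed[TOY_SEED_BYTES]) {
    memcpy(st->seedBuffer, seed, TOY_SEED_BYTES);                 /* key half    */
    memset(st->seedBuffer + TOY_SEED_BYTES, 0, TOY_BUFFER_BYTES); /* output half */
}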
*/ @@ -29,8 +29,9 @@ */ struct crandom_state_t { /** @privatesection */ - unsigned char seed[32]; - unsigned char buffer[96]; + /* unsigned char seed[32]; */ + /* unsigned char buffer[96]; */ + unsigned char seedBuffer[32+96]; uint64_t ctr; uint64_t magic; unsigned int fill; From b981251732a95080777a54c27767b1fe4a2599c6 Mon Sep 17 00:00:00 2001 From: Mike Hamburg Date: Sun, 1 Mar 2015 13:17:51 -0800 Subject: [PATCH 12/15] fix rax input for rdrand detection --- src/crandom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crandom.c b/src/crandom.c index 83999c9..4808d3e 100644 --- a/src/crandom.c +++ b/src/crandom.c @@ -27,10 +27,10 @@ unsigned int crandom_detect_features(void) { if (c & 1<<25) out |= AESNI; if (c & 1<<28) out |= AVX; if (b & 1<<5) out |= AVX2; + if (c & 1<<30) out |= RDRAND; a=0x80000001; __asm__("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d)); if (c & 1<<11) out |= XOP; - if (c & 1<<30) out |= RDRAND; # endif return out; From 5cf6038179e345d312f004cf332c8f27590cc221 Mon Sep 17 00:00:00 2001 From: Mike Hamburg Date: Sun, 1 Mar 2015 13:23:31 -0800 Subject: [PATCH 13/15] adjust history.txt. Also, that last fix on RDRAND is thanks to John Mark Gurney. --- HISTORY.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/HISTORY.txt b/HISTORY.txt index 983cbcb..4901d1a 100644 --- a/HISTORY.txt +++ b/HISTORY.txt @@ -1,3 +1,7 @@ +March 1, 2015: + Not much to report. Most of the fixes since Oct 27 last year have + been bug fixes or simplifications, or in the Decaf branch. + October 27, 2014: Added more support for >512-bit primes. Changed shared secret to not overflow the buffer in this case. Changed hashing to From 393785a384cf41ebf85cd078b4fe71eb2f46c528 Mon Sep 17 00:00:00 2001 From: Mike Hamburg Date: Sat, 7 Mar 2015 16:15:51 -0800 Subject: [PATCH 14/15] fix some mul/sqr-after-add bugs on arch_neon_experimental. Deprecate arch_neon because the experiment seems to be a success anyway --- src/p448/arch_neon/arch_config.h | 1 - src/p448/arch_neon/neon_emulation.h | 155 ------ src/p448/arch_neon/p448.c | 723 ------------------------- src/p448/arch_neon/p448.h | 241 --------- src/p448/arch_neon_experimental/p448.c | 110 ++-- test/test_arithmetic.c | 17 +- 6 files changed, 69 insertions(+), 1178 deletions(-) delete mode 100644 src/p448/arch_neon/arch_config.h delete mode 100644 src/p448/arch_neon/neon_emulation.h delete mode 100644 src/p448/arch_neon/p448.c delete mode 100644 src/p448/arch_neon/p448.h diff --git a/src/p448/arch_neon/arch_config.h b/src/p448/arch_neon/arch_config.h deleted file mode 100644 index 47bbe3e..0000000 --- a/src/p448/arch_neon/arch_config.h +++ /dev/null @@ -1 +0,0 @@ -#define WORD_BITS 32 diff --git a/src/p448/arch_neon/neon_emulation.h b/src/p448/arch_neon/neon_emulation.h deleted file mode 100644 index a97978c..0000000 --- a/src/p448/arch_neon/neon_emulation.h +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2014 Cryptography Research, Inc. - * Released under the MIT License. See LICENSE.txt for license information. - */ - -/** - * @file "neon_emulation.h" - * @brief NEON intrinsic emulation using clang's vector extensions. - * - * This lets you test and debug NEON code on x86. 
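/* The emulation technique in the header deleted below: define the NEON
 * vector types as extended vectors so lane access (.x/.y) and the
 * ordinary +, -, >> operators work on x86, then reimplement only the
 * intrinsics the field code uses. A self-contained sketch assuming
 * clang's ext_vector_type extension:
 */
#include <stdint.h>

typedef int32_t emu_int32x2_t __attribute__((ext_vector_type(2)));
typedef int64_t emu_int64x2_t __attribute__((ext_vector_type(2)));

/* widening add, lane by lane, as vaddw_s32 does */
static inline emu_int64x2_t emu_vaddw_s32(emu_int64x2_t a, emu_int32x2_t b) {
    a.x += b.x;
    a.y += b.y;
    return a;
}

/* shift-right-accumulate, as vsraq_n_s64 does */
static inline emu_int64x2_t emu_vsraq_n_s64(emu_int64x2_t a,
                                            emu_int64x2_t v, const int n) {
    return a + (v >> n);
}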
- */ - -#ifndef __NEON_EMULATION_H__ -#define __NEON_EMULATION_H__ 1 - -/** @cond internal */ - -#include "word.h" - -#include -#include - -static __inline__ int64x2_t vaddw_s32 (int64x2_t a, int32x2_t b) { - a.x += b.x; - a.y += b.y; - return a; -} - -static __inline__ int64x2_t __attribute__((gnu_inline,always_inline)) -xx_vaddup_s64(int64x2_t x) { - x.y += x.x; - return x; -} - -typedef struct { int32x2_t val[2]; } int32x2x2_t; -static inline int32x2x2_t vtrn_s32 (int32x2_t x, int32x2_t y) { - int32x2x2_t out = {{{ x.x, y.x }, {x.y, y.y}}}; - return out; -} - -static __inline__ void __attribute__((gnu_inline,always_inline)) -xx_vtrnq_s64 ( - int64x2_t *x, - int64x2_t *y -) { - int64_t tmp = (*x).y; - (*x).y = (*y).x; - (*y).x = tmp; -} - -int64x2_t vsraq_n_s64 ( - int64x2_t a, - int64x2_t v, - const int x -) { - return a + (v >> x); -} - -int64x2_t vshrq_n_s64 ( - int64x2_t v, - const int x -) { - return v >> x; -} - -static inline int64_t vgetq_lane_s64 ( - int64x2_t acc, - const int lane -) { - return lane ? acc.y : acc.x; -} - -static inline int32_t vget_lane_s32 ( - int32x2_t acc, - const int lane -) { - return lane ? acc.y : acc.x; -} - -static inline int64x2_t vmlal_lane_s32 ( - int64x2_t acc, - int32x2_t x, - int32x2_t y, - int lane -) { - int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; - return acc + xx*(lane?yy.yy:yy.xx); -} - -static inline int64x2_t vmlsl_lane_s32 ( - int64x2_t acc, - int32x2_t x, - int32x2_t y, - int lane -) { - int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; - return acc - xx*(lane?yy.yy:yy.xx); -} - -static inline int64x2_t vqdmlsl_lane_s32 ( - int64x2_t acc, - int32x2_t x, - int32x2_t y, - int lane -) { - int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; - int64x2_t tmp = xx*(lane?yy.yy:yy.xx); - assert(tmp.x >> 63 == tmp.x>>62); - assert(tmp.y >> 63 == tmp.y>>62); - return acc - 2*tmp; -} - -static inline int64x2_t vqdmlal_lane_s32 ( - int64x2_t acc, - int32x2_t x, - int32x2_t y, - int lane -) { - int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; - int64x2_t tmp = xx*(lane?yy.yy:yy.xx); - assert(tmp.x >> 63 == tmp.x>>62); - assert(tmp.y >> 63 == tmp.y>>62); - return acc + 2*tmp; -} - -static inline int64x2_t vqdmull_lane_s32 ( - int32x2_t x, - int32x2_t y, - int lane -) { - int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; - int64x2_t tmp = xx*(lane?yy.yy:yy.xx); - assert(tmp.x >> 63 == tmp.x>>62); - assert(tmp.y >> 63 == tmp.y>>62); - return 2*tmp; -} - -static inline int32x2_t vmovn_s64( - int64x2_t x -) { - int32x2_t y = {x.x,x.y}; - return y; -} - -static inline int64x2_t vmull_lane_s32 ( - int32x2_t x, - int32x2_t y, - int lane -) { - int64x2_t xx = { x.x, x.y }, yy = { y.x, y.y }; - return xx*(lane?yy.yy:yy.xx); -} - -/** @endcond */ - -#endif /* __NEON_EMULATION_H__ */ diff --git a/src/p448/arch_neon/p448.c b/src/p448/arch_neon/p448.c deleted file mode 100644 index 956f356..0000000 --- a/src/p448/arch_neon/p448.c +++ /dev/null @@ -1,723 +0,0 @@ -/* Copyright (c) 2014 Cryptography Research, Inc. - * Released under the MIT License. See LICENSE.txt for license information. 
- */ - -#include "word.h" -#include "p448.h" - -static inline mask_t __attribute__((always_inline)) -is_zero ( - word_t x -) { - dword_t xx = x; - xx--; - return xx >> WORD_BITS; -} - -static uint64_t widemul_32 ( - const uint32_t a, - const uint32_t b -) { - return ((uint64_t)a)* b; -} - -#ifdef __ARM_NEON__ -static __inline__ void __attribute__((gnu_inline,always_inline)) -xx_vtrnq_s64 ( - int64x2_t *x, - int64x2_t *y -) { - __asm__ __volatile__ ("vswp %f0, %e1" : "+w"(*x), "+w"(*y)); -} - -static __inline__ int64x2_t __attribute__((gnu_inline,always_inline)) -xx_vaddup_s64(int64x2_t x) { - __asm__ ("vadd.s64 %f0, %e0" : "+w"(x)); - return x; -} -#else -#include "neon_emulation.h" -#endif /* ARM_NEON */ - -static inline void __attribute__((gnu_inline,always_inline,unused)) -smlal ( - uint64_t *acc, - const uint32_t a, - const uint32_t b -) { - *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; -} - -static inline void __attribute__((gnu_inline,always_inline,unused)) -smlal2 ( - uint64_t *acc, - const uint32_t a, - const uint32_t b -) { - *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; -} - -static inline void __attribute__((gnu_inline,always_inline,unused)) -smull ( - uint64_t *acc, - const uint32_t a, - const uint32_t b -) { - *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; -} - -static inline void __attribute__((gnu_inline,always_inline,unused)) -smull2 ( - uint64_t *acc, - const uint32_t a, - const uint32_t b -) { - *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; -} - -void -p448_mul ( - p448_t *__restrict__ cs, - const p448_t *as, - const p448_t *bs -) { - const uint32_t *a = as->limb, *b = bs->limb; - uint32_t *c = cs->limb; - - - const int32x2_t - *val = (const int32x2_t *)a, - *vbl = (const int32x2_t *)b, - *vah = (const int32x2_t *)(&a[8]), - *vbh = (const int32x2_t *)(&b[8]); - - int32x2_t - *vcl = (int32x2_t *)c, - *vch = (int32x2_t *)(&c[8]), - vmask = {(1<<28) - 1, (1<<28)-1}; - - int64x2_t accumx0a, accumx0b; - int64x2_t accumx1a, accumx1b; - int64x2_t accumx2a, accumx2b; - int64x2_t accumx3a, accumx3b; - int64x2_t accumx4a, accumx4b; - int64x2_t accumx5a, accumx5b; - int64x2_t accumx6a, accumx6b; - int64x2_t accumx7a, accumx7b; - int64x2_t carry; - int32x2x2_t trn_res; - int32x2_t delta; - - accumx0a = vmull_lane_s32( delta = val[1] + vah[1], vbh[3], 0); - accumx1a = vmull_lane_s32( delta, vbh[3], 1); - accumx0a = vmlal_lane_s32(accumx0a, delta = val[2] + vah[2], vbh[2], 0); - accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1); - accumx0a = vmlal_lane_s32(accumx0a, delta = val[3] + vah[3], vbh[1], 0); - accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1); - accumx0b = vmull_lane_s32( delta = val[0] + vah[0], vbh[0], 0); - accumx1b = vmull_lane_s32( delta, vbh[0], 1); - accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0); - accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1); - accumx0b = vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0); - accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1); - accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0); - accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1); - accumx0b += accumx0a; - accumx1b += accumx1a; - accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0); - accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1); - accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0); - accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1); - accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0); - accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1); - accumx0a = 
vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0); - accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1); - accumx0a += accumx0b; - accumx1a += accumx1b; - accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0); - accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1); - xx_vtrnq_s64(&accumx0a, &accumx0b); - xx_vtrnq_s64(&accumx1a, &accumx1b); - accumx0b += accumx1a; - accumx0b = vsraq_n_s64(accumx0b,accumx0a,28); - accumx1b = vsraq_n_s64(accumx1b,accumx0b,28); - trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b)); - vcl[0] = trn_res.val[1] & vmask; - vch[0] = trn_res.val[0] & vmask; - - - - - accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0); - accumx3a = vmull_lane_s32( delta, vbh[3], 1); - accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0); - accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1); - accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0); - accumx3b = vmull_lane_s32( delta, vbh[1], 1); - accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0); - accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1); - accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0); - accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1); - accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0); - accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1); - accumx2b += accumx2a; - accumx3b += accumx3a; - accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0); - accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1); - accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0); - accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1); - accumx2a = vmlal_lane_s32(accumx2a, val[2], delta = vbl[3] - vbh[3], 0); - accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1); - accumx2a = vmlal_lane_s32(accumx2a, val[3], delta = vbl[2] - vbh[2], 0); - accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1); - accumx2a += accumx2b; - accumx3a += accumx3b; - accumx2b = vmlal_lane_s32(accumx2b, val[0], delta = vbl[1] - vbh[1], 0); - accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1); - accumx2b = vmlal_lane_s32(accumx2b, val[1], delta = vbl[0] - vbh[0], 0); - accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1); - xx_vtrnq_s64(&accumx2a, &accumx2b); - xx_vtrnq_s64(&accumx3a, &accumx3b); - accumx2a += accumx1b; - accumx2b += accumx3a; - accumx2b = vsraq_n_s64(accumx2b,accumx2a,28); - accumx3b = vsraq_n_s64(accumx3b,accumx2b,28); - trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b)); - vcl[1] = trn_res.val[1] & vmask; - vch[1] = trn_res.val[0] & vmask; - carry = accumx3b; - - - - - accumx4a = vmull_lane_s32( delta = val[3] + vah[3], vbh[3], 0); - accumx5a = vmull_lane_s32( delta, vbh[3], 1); - accumx4b = accumx4a; - accumx5b = accumx5a; - accumx4b = vmlal_lane_s32(accumx4b, delta = val[0] + vah[0], vbh[2], 0); - accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1); - accumx4b = vmlal_lane_s32(accumx4b, delta = val[1] + vah[1], vbh[1], 0); - accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1); - accumx4b = vmlal_lane_s32(accumx4b, delta = val[2] + vah[2], vbh[0], 0); - accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1); - accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0); - accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1); - accumx4a += accumx4b; - accumx5a += accumx5b; - accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0); - accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1); - accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0); - accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1); - 
accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0); - accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1); - accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0); - accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1); - /**/ - accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0); - accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1); - accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0); - accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1); - accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0); - accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1); - - xx_vtrnq_s64(&accumx4a, &accumx4b); - xx_vtrnq_s64(&accumx5a, &accumx5b); - accumx4a += carry; - accumx4b += accumx5a; - accumx4b = vsraq_n_s64(accumx4b,accumx4a,28); - accumx5b = vsraq_n_s64(accumx5b,accumx4b,28); - - trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b)); - vcl[2] = trn_res.val[1] & vmask; - vch[2] = trn_res.val[0] & vmask; - - - - - accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0); - accumx7b = vmull_lane_s32( delta, vbh[3], 1); - accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0); - accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1); - accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0); - accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1); - accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0); - accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1); - accumx6a = accumx6b; - accumx7a = accumx7b; - accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0); - accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1); - accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0); - accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1); - accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0); - accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1); - accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0); - accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1); - /**/ - accumx6b = vmlal_lane_s32(accumx6b, val[0], delta = vbl[3] - vbh[3], 0); - accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1); - accumx6b = vmlal_lane_s32(accumx6b, val[1], delta = vbl[2] - vbh[2], 0); - accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1); - accumx6b = vmlal_lane_s32(accumx6b, val[2], delta = vbl[1] - vbh[1], 0); - accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1); - accumx6b = vmlal_lane_s32(accumx6b, val[3], delta = vbl[0] - vbh[0], 0); - accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1); - - xx_vtrnq_s64(&accumx6a, &accumx6b); - xx_vtrnq_s64(&accumx7a, &accumx7b); - accumx6a += accumx5b; - accumx6b += accumx7a; - - accumx6b = vsraq_n_s64(accumx6b,accumx6a,28); - accumx7b = vsraq_n_s64(accumx7b,accumx6b,28); - trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b)); - vcl[3] = trn_res.val[1] & vmask; - vch[3] = trn_res.val[0] & vmask; - - - accumx7b = xx_vaddup_s64(accumx7b); - - int32x2_t t0 = vcl[0], t1 = vch[0]; - trn_res = vtrn_s32(t0,t1); - t0 = trn_res.val[0]; t1 = trn_res.val[1]; - - accumx7b = vaddw_s32(accumx7b, t0); - t0 = vmovn_s64(accumx7b) & vmask; - - accumx7b = vshrq_n_s64(accumx7b,28); - accumx7b = vaddw_s32(accumx7b, t1); - t1 = vmovn_s64(accumx7b) & vmask; - trn_res = vtrn_s32(t0,t1); - vcl[0] = trn_res.val[0]; - vch[0] = trn_res.val[1]; - accumx7b = vshrq_n_s64(accumx7b,28); - - t0 = vmovn_s64(accumx7b); - - uint32_t - c0 = vget_lane_s32(t0,0), - c1 = vget_lane_s32(t0,1); - c[2] += c0; - c[10] += c1; -} - -void -p448_sqr ( - p448_t 
*__restrict__ cs, - const p448_t *as -) { - /* FUTURE possible improvements: - * don't use nega-phi algorithm, so as to avoid extra phi-twiddle at end - * or use phi/nega-phi for everything, montgomery style - * or find some sort of phi algorithm which doesn't have this problem - * break up lanemuls so that only diags get 1mul'd instead of diag 2x2 blocks - * - * These improvements are all pretty minor, but I guess together they might matter? - */ - - const uint32_t *b = as->limb; - uint32_t *c = cs->limb; - - int32x2_t vbm[4]; - - const int32x2_t - *vbl = (const int32x2_t *)b, - *vbh = (const int32x2_t *)(&b[8]); - - int i; - for (i=0; i<4; i++) { - vbm[i] = vbl[i] - vbh[i]; - } - - int32x2_t - *vcl = (int32x2_t *)c, - *vch = (int32x2_t *)(&c[8]), - vmask = {(1<<28) - 1, (1<<28)-1}; - - int64x2_t accumx0a, accumx0b; - int64x2_t accumx1a, accumx1b; - int64x2_t accumx2a, accumx2b; - int64x2_t accumx3a, accumx3b; - int64x2_t accumx4a, accumx4b; - int64x2_t accumx5a, accumx5b; - int64x2_t accumx6a, accumx6b; - int64x2_t accumx7a, accumx7b; - int64x2_t carry; - int32x2x2_t trn_res; - - accumx0a = vqdmull_lane_s32( vbh[1], vbh[3], 0); - accumx1a = vqdmull_lane_s32( vbh[1], vbh[3], 1); - accumx2a = vqdmull_lane_s32( vbh[2], vbh[3], 0); - accumx3a = vqdmull_lane_s32( vbh[2], vbh[3], 1); - accumx0a = vmlal_lane_s32(accumx0a, vbh[2], vbh[2], 0); - accumx1a = vmlal_lane_s32(accumx1a, vbh[2], vbh[2], 1); - accumx2b = accumx2a; - accumx3b = accumx3a; - accumx2b = vqdmlal_lane_s32(accumx2b, vbh[0], vbh[1], 0); - accumx3b = vqdmlal_lane_s32(accumx3b, vbh[0], vbh[1], 1); - accumx0b = accumx0a; - accumx1b = accumx1a; - accumx0b = vmlal_lane_s32(accumx0b, vbh[0], vbh[0], 0); - accumx1b = vmlal_lane_s32(accumx1b, vbh[0], vbh[0], 1); - accumx0b = vqdmlal_lane_s32(accumx0b, vbl[1], vbl[3], 0); - accumx1b = vqdmlal_lane_s32(accumx1b, vbl[1], vbl[3], 1); - accumx2b = vqdmlal_lane_s32(accumx2b, vbl[2], vbl[3], 0); - accumx3b = vqdmlal_lane_s32(accumx3b, vbl[2], vbl[3], 1); - accumx0b = vmlal_lane_s32(accumx0b, vbl[2], vbl[2], 0); - accumx1b = vmlal_lane_s32(accumx1b, vbl[2], vbl[2], 1); - accumx2a += accumx2b; - accumx3a += accumx3b; - accumx2a = vqdmlal_lane_s32(accumx2a, vbl[0], vbl[1], 0); - accumx3a = vqdmlal_lane_s32(accumx3a, vbl[0], vbl[1], 1); - accumx0a += accumx0b; - accumx1a += accumx1b; - accumx0a = vmlal_lane_s32(accumx0a, vbl[0], vbl[0], 0); - accumx1a = vmlal_lane_s32(accumx1a, vbl[0], vbl[0], 1); - accumx0a = vqdmlsl_lane_s32(accumx0a, vbm[1], vbm[3], 0); - accumx1a = vqdmlsl_lane_s32(accumx1a, vbm[1], vbm[3], 1); - accumx0a = vmlsl_lane_s32(accumx0a, vbm[2], vbm[2], 0); - accumx1a = vmlsl_lane_s32(accumx1a, vbm[2], vbm[2], 1); - accumx2a = vqdmlsl_lane_s32(accumx2a, vbm[2], vbm[3], 0); - accumx3a = vqdmlsl_lane_s32(accumx3a, vbm[2], vbm[3], 1); - accumx0b += accumx0a; - accumx1b += accumx1a; - accumx0b = vmlsl_lane_s32(accumx0b, vbm[0], vbm[0], 0); - accumx1b = vmlsl_lane_s32(accumx1b, vbm[0], vbm[0], 1); - accumx2b += accumx2a; - accumx3b += accumx3a; - accumx2b = vqdmlsl_lane_s32(accumx2b, vbm[0], vbm[1], 0); - accumx3b = vqdmlsl_lane_s32(accumx3b, vbm[0], vbm[1], 1); - xx_vtrnq_s64(&accumx0b, &accumx0a); - xx_vtrnq_s64(&accumx1b, &accumx1a); - xx_vtrnq_s64(&accumx2b, &accumx2a); - xx_vtrnq_s64(&accumx3b, &accumx3a); - accumx0a += accumx1b; - accumx0a = vsraq_n_s64(accumx0a,accumx0b,28); - accumx1a = vsraq_n_s64(accumx1a,accumx0a,28); - accumx2b += accumx1a; - accumx2a += accumx3b; - accumx2a = vsraq_n_s64(accumx2a,accumx2b,28); - accumx3a = vsraq_n_s64(accumx3a,accumx2a,28); - trn_res = 
vtrn_s32(vmovn_s64(accumx0b), vmovn_s64(accumx0a)); - vcl[0] = trn_res.val[1] & vmask; - vch[0] = trn_res.val[0] & vmask; - trn_res = vtrn_s32(vmovn_s64(accumx2b), vmovn_s64(accumx2a)); - vcl[1] = trn_res.val[1] & vmask; - vch[1] = trn_res.val[0] & vmask; - carry = accumx3a; - - accumx4a = vmull_lane_s32( vbh[3], vbh[3], 0); - accumx5a = vmull_lane_s32( vbh[3], vbh[3], 1); - accumx6b = vqdmull_lane_s32( vbh[0], vbh[3], 0); - accumx7b = vqdmull_lane_s32( vbh[0], vbh[3], 1); - accumx4b = accumx4a; - accumx5b = accumx5a; - accumx4b = vqdmlal_lane_s32(accumx4b, vbh[0], vbh[2], 0); - accumx5b = vqdmlal_lane_s32(accumx5b, vbh[0], vbh[2], 1); - accumx6b = vqdmlal_lane_s32(accumx6b, vbh[1], vbh[2], 0); - accumx7b = vqdmlal_lane_s32(accumx7b, vbh[1], vbh[2], 1); - accumx4b = vmlal_lane_s32(accumx4b, vbh[1], vbh[1], 0); - accumx5b = vmlal_lane_s32(accumx5b, vbh[1], vbh[1], 1); - accumx4b = vmlal_lane_s32(accumx4b, vbl[3], vbl[3], 0); - accumx5b = vmlal_lane_s32(accumx5b, vbl[3], vbl[3], 1); - accumx6a = accumx6b; - accumx7a = accumx7b; - accumx6a = vqdmlal_lane_s32(accumx6a, vbl[0], vbl[3], 0); - accumx7a = vqdmlal_lane_s32(accumx7a, vbl[0], vbl[3], 1); - accumx4a += accumx4b; - accumx5a += accumx5b; - accumx4a = vqdmlal_lane_s32(accumx4a, vbl[0], vbl[2], 0); - accumx5a = vqdmlal_lane_s32(accumx5a, vbl[0], vbl[2], 1); - accumx6a = vqdmlal_lane_s32(accumx6a, vbl[1], vbl[2], 0); - accumx7a = vqdmlal_lane_s32(accumx7a, vbl[1], vbl[2], 1); - accumx4a = vmlal_lane_s32(accumx4a, vbl[1], vbl[1], 0); - accumx5a = vmlal_lane_s32(accumx5a, vbl[1], vbl[1], 1); - accumx4a = vmlsl_lane_s32(accumx4a, vbm[3], vbm[3], 0); - accumx5a = vmlsl_lane_s32(accumx5a, vbm[3], vbm[3], 1); - accumx6b += accumx6a; - accumx7b += accumx7a; - accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[0], vbm[3], 0); - accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[0], vbm[3], 1); - accumx4b += accumx4a; - accumx5b += accumx5a; - accumx4b = vqdmlsl_lane_s32(accumx4b, vbm[0], vbm[2], 0); - accumx5b = vqdmlsl_lane_s32(accumx5b, vbm[0], vbm[2], 1); - accumx4b = vmlsl_lane_s32(accumx4b, vbm[1], vbm[1], 0); - accumx5b = vmlsl_lane_s32(accumx5b, vbm[1], vbm[1], 1); - accumx6b = vqdmlsl_lane_s32(accumx6b, vbm[1], vbm[2], 0); - accumx7b = vqdmlsl_lane_s32(accumx7b, vbm[1], vbm[2], 1); - - xx_vtrnq_s64(&accumx4b, &accumx4a); - xx_vtrnq_s64(&accumx5b, &accumx5a); - xx_vtrnq_s64(&accumx6b, &accumx6a); - xx_vtrnq_s64(&accumx7b, &accumx7a); - accumx4b += carry; - accumx4a += accumx5b; - accumx4a = vsraq_n_s64(accumx4a,accumx4b,28); - accumx5a = vsraq_n_s64(accumx5a,accumx4a,28); - accumx6b += accumx5a; - accumx6a += accumx7b; - - trn_res = vtrn_s32(vmovn_s64(accumx4b), vmovn_s64(accumx4a)); - vcl[2] = trn_res.val[1] & vmask; - vch[2] = trn_res.val[0] & vmask; - accumx6a = vsraq_n_s64(accumx6a,accumx6b,28); - accumx7a = vsraq_n_s64(accumx7a,accumx6a,28); - trn_res = vtrn_s32(vmovn_s64(accumx6b), vmovn_s64(accumx6a)); - vcl[3] = trn_res.val[1] & vmask; - vch[3] = trn_res.val[0] & vmask; - - accumx7a = xx_vaddup_s64(accumx7a); - - int32x2_t t0 = vcl[0], t1 = vch[0]; - trn_res = vtrn_s32(t0,t1); - t0 = trn_res.val[0]; t1 = trn_res.val[1]; - - accumx7a = vaddw_s32(accumx7a, t0); - t0 = vmovn_s64(accumx7a) & vmask; - - accumx7a = vshrq_n_s64(accumx7a,28); - accumx7a = vaddw_s32(accumx7a, t1); - t1 = vmovn_s64(accumx7a) & vmask; - trn_res = vtrn_s32(t0,t1); - vcl[0] = trn_res.val[0]; - vch[0] = trn_res.val[1]; - accumx7a = vshrq_n_s64(accumx7a,28); - - t0 = vmovn_s64(accumx7a); - - uint32_t - c0 = vget_lane_s32(t0,0), - c1 = vget_lane_s32(t0,1); - c[2] += c0; - c[10] += 
c1; -} - -void -p448_mulw ( - p448_t *__restrict__ cs, - const p448_t *as, - uint64_t b -) { - const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); - - const uint32_t *a = as->limb; - uint32_t *c = cs->limb; - - uint64_t accum0, accum8; - uint32_t mask = (1ull<<28)-1; - - int i; - - uint32_t c0, c8, n0, n8; - accum0 = widemul_32(bhi, a[15]); - accum8 = widemul_32(bhi, a[15] + a[7]); - c0 = a[0]; c8 = a[8]; - smlal(&accum0, blo, c0); - smlal(&accum8, blo, c8); - - c[0] = accum0 & mask; accum0 >>= 28; - c[8] = accum8 & mask; accum8 >>= 28; - - i=1; - { - n0 = a[i]; n8 = a[i+8]; - smlal(&accum0, bhi, c0); - smlal(&accum8, bhi, c8); - smlal(&accum0, blo, n0); - smlal(&accum8, blo, n8); - - c[i] = accum0 & mask; accum0 >>= 28; - c[i+8] = accum8 & mask; accum8 >>= 28; - i++; - } - { - c0 = a[i]; c8 = a[i+8]; - smlal(&accum0, bhi, n0); - smlal(&accum8, bhi, n8); - smlal(&accum0, blo, c0); - smlal(&accum8, blo, c8); - - c[i] = accum0 & mask; accum0 >>= 28; - c[i+8] = accum8 & mask; accum8 >>= 28; - i++; - } - { - n0 = a[i]; n8 = a[i+8]; - smlal(&accum0, bhi, c0); - smlal(&accum8, bhi, c8); - smlal(&accum0, blo, n0); - smlal(&accum8, blo, n8); - - c[i] = accum0 & mask; accum0 >>= 28; - c[i+8] = accum8 & mask; accum8 >>= 28; - i++; - } - { - c0 = a[i]; c8 = a[i+8]; - smlal(&accum0, bhi, n0); - smlal(&accum8, bhi, n8); - smlal(&accum0, blo, c0); - smlal(&accum8, blo, c8); - - c[i] = accum0 & mask; accum0 >>= 28; - c[i+8] = accum8 & mask; accum8 >>= 28; - i++; - } - { - n0 = a[i]; n8 = a[i+8]; - smlal(&accum0, bhi, c0); - smlal(&accum8, bhi, c8); - smlal(&accum0, blo, n0); - smlal(&accum8, blo, n8); - - c[i] = accum0 & mask; accum0 >>= 28; - c[i+8] = accum8 & mask; accum8 >>= 28; - i++; - } - { - c0 = a[i]; c8 = a[i+8]; - smlal(&accum0, bhi, n0); - smlal(&accum8, bhi, n8); - smlal(&accum0, blo, c0); - smlal(&accum8, blo, c8); - - c[i] = accum0 & mask; accum0 >>= 28; - c[i+8] = accum8 & mask; accum8 >>= 28; - i++; - } - { - n0 = a[i]; n8 = a[i+8]; - smlal(&accum0, bhi, c0); - smlal(&accum8, bhi, c8); - smlal(&accum0, blo, n0); - smlal(&accum8, blo, n8); - - c[i] = accum0 & mask; accum0 >>= 28; - c[i+8] = accum8 & mask; accum8 >>= 28; - i++; - } - - accum0 += accum8 + c[8]; - c[8] = accum0 & mask; - c[9] += accum0 >> 28; - - accum8 += c[0]; - c[0] = accum8 & mask; - c[1] += accum8 >> 28; -} - -void -p448_strong_reduce ( - p448_t *a -) { - word_t mask = (1ull<<28)-1; - - /* first, clear high */ - a->limb[8] += a->limb[15]>>28; - a->limb[0] += a->limb[15]>>28; - a->limb[15] &= mask; - - /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ - - /* compute total_value - p. No need to reduce mod p. */ - - dsword_t scarry = 0; - int i; - for (i=0; i<16; i++) { - scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask); - a->limb[i] = scarry & mask; - scarry >>= 28; - } - - /* uncommon case: it was >= p, so now scarry = 0 and this = x - * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 - * so let's add back in p. will carry back off the top for 2^448. - */
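Aside: the add-back that follows is branch-free: after the subtraction loop, scarry is either 0 (the value was >= p) or -1 (it was < p), so it doubles as a mask selecting whether p is added back. A toy version with two 28-bit limbs and a made-up modulus p = 2^56 - 5, purely to illustrate the borrow-mask trick:

    #include <stdint.h>
    #include <assert.h>

    static void toy_strong_reduce(uint32_t limb[2]) {
        const uint32_t mask = (1u << 28) - 1;
        const uint32_t p[2] = { mask - 4, mask }; /* little-endian limbs of 2^56 - 5 */
        int64_t scarry = 0;
        uint64_t carry = 0;
        int i;
        for (i = 0; i < 2; i++) {                 /* compute x - p, limb by limb */
            scarry = scarry + limb[i] - p[i];
            limb[i] = (uint32_t)scarry & mask;
            scarry >>= 28;
        }
        for (i = 0; i < 2; i++) {                 /* add p back iff we borrowed */
            carry = carry + limb[i] + ((uint32_t)scarry & p[i]);
            limb[i] = (uint32_t)carry & mask;
            carry >>= 28;
        }
        assert((int64_t)carry + scarry == 0);     /* top carry cancels the borrow */
    }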
- - assert(is_zero(scarry) | is_zero(scarry+1)); - - word_t scarry_mask = scarry & mask; - dword_t carry = 0; - - /* add it back */ - for (i=0; i<16; i++) { - carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask); - a->limb[i] = carry & mask; - carry >>= 28; - } - - assert(is_zero(carry + scarry)); -} - -mask_t -p448_is_zero ( - const struct p448_t *a -) { - struct p448_t b; - p448_copy(&b,a); - p448_strong_reduce(&b); - - uint32_t any = 0; - int i; - for (i=0; i<16; i++) { - any |= b.limb[i]; - } - return is_zero(any); -} - -void -p448_serialize ( - uint8_t *serial, - const struct p448_t *x -) { - int i,j; - p448_t red; - p448_copy(&red, x); - p448_strong_reduce(&red); - for (i=0; i<8; i++) { - uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28); - for (j=0; j<7; j++) { - serial[7*i+j] = limb; - limb >>= 8; - } - assert(limb == 0); - } -} - -mask_t -p448_deserialize ( - p448_t *x, - const uint8_t serial[56] -) { - int i,j; - for (i=0; i<8; i++) { - uint64_t out = 0; - for (j=0; j<7; j++) { - out |= ((uint64_t)serial[7*i+j])<<(8*j); - } - x->limb[2*i] = out & ((1ull<<28)-1); - x->limb[2*i+1] = out >> 28; - } - - /* Check for reduction. - * - * The idea is to create a variable ge which is all ones (rather, 56 ones) - * if and only if the low $i$ words of $x$ are >= those of p. - * - * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) - */ - uint32_t ge = -1, mask = (1ull<<28)-1; - for (i=0; i<8; i++) { - ge &= x->limb[i]; - } - - /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ - ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask); - - /* Propagate the rest */ - for (i=9; i<16; i++) { - ge &= x->limb[i]; - } - - return ~is_zero(ge ^ mask); -}
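Aside: the ge accumulator in p448_deserialize rejects non-canonical encodings (x >= p) without branching: AND together the limbs that must be all-ones in p, and give limb 8, which is 2^28 - 2 in p, special treatment. A standalone sketch of the same check, with plain C comparisons standing in for the library's branch-free is_zero (names and return convention are illustrative, not the patch's API):

    #include <stdint.h>

    /* Returns 1 iff the 16 28-bit limbs encode a value < p448, whose
     * limbs are all 2^28-1 except limb 8 = 2^28-2. */
    static int sketch_is_reduced(const uint32_t limb[16]) {
        const uint32_t mask = (1u << 28) - 1;
        uint32_t ge = mask;
        int i;
        for (i = 0; i < 8; i++) ge &= limb[i];   /* low limbs vs all-ones */
        /* keep ge only if limb 8 reaches p's 1110... limb; force it if over */
        ge = (ge & (limb[8] + 1)) | (limb[8] == mask ? mask : 0);
        for (i = 9; i < 16; i++) ge &= limb[i];  /* high limbs vs all-ones */
        return ge != mask;                       /* ge == mask means x >= p */
    }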
diff --git a/src/p448/arch_neon/p448.h b/src/p448/arch_neon/p448.h deleted file mode 100644 index f0406cd..0000000 --- a/src/p448/arch_neon/p448.h +++ /dev/null @@ -1,241 +0,0 @@ -/* Copyright (c) 2014 Cryptography Research, Inc. - * Released under the MIT License. See LICENSE.txt for license information. - */ -#ifndef __P448_H__ -#define __P448_H__ 1 - -#include "word.h" - -#include <stdint.h> -#include <assert.h> - -typedef struct p448_t { - uint32_t limb[16]; -} __attribute__((aligned(32))) p448_t; - -#ifdef __cplusplus -extern "C" { -#endif - -static __inline__ void -p448_set_ui ( - p448_t *out, - uint64_t x -) __attribute__((unused,always_inline)); - -static __inline__ void -p448_add_RAW ( - p448_t *out, - const p448_t *a, - const p448_t *b -) __attribute__((unused,always_inline)); - -static __inline__ void -p448_sub_RAW ( - p448_t *out, - const p448_t *a, - const p448_t *b -) __attribute__((unused,always_inline)); - -static __inline__ void -p448_neg_RAW ( - p448_t *out, - const p448_t *a -) __attribute__((unused,always_inline)); - -static __inline__ void -p448_addw ( - p448_t *a, - uint32_t x -) __attribute__((unused,always_inline)); - -static __inline__ void -p448_subw ( - p448_t *a, - uint32_t x -) __attribute__((unused,always_inline)); - -static __inline__ void -p448_copy ( - p448_t *out, - const p448_t *a -) __attribute__((unused,always_inline)); - -static __inline__ void -p448_weak_reduce ( - p448_t *inout -) __attribute__((unused,always_inline)); - -void -p448_strong_reduce ( - p448_t *inout -); - -mask_t -p448_is_zero ( - const p448_t *in -); - -static __inline__ void -p448_bias ( - p448_t *inout, - int amount -) __attribute__((unused,always_inline)); - -void -p448_mul ( - p448_t *__restrict__ out, - const p448_t *a, - const p448_t *b -); - -void -p448_mulw ( - p448_t *__restrict__ out, - const p448_t *a, - uint64_t b -); - -void -p448_sqr ( - p448_t *__restrict__ out, - const p448_t *a -); - -void -p448_serialize ( - uint8_t *serial, - const struct p448_t *x -); - -mask_t -p448_deserialize ( - p448_t *x, - const uint8_t serial[56] -); - -/* -------------- Inline functions begin here -------------- */ - -void -p448_set_ui ( - p448_t *out, - uint64_t x -) { - int i; - out->limb[0] = x & ((1<<28)-1); - out->limb[1] = x>>28; - for (i=2; i<16; i++) { - out->limb[i] = 0; - } -} - -void -p448_add_RAW ( - p448_t *out, - const p448_t *a, - const p448_t *b -) { - unsigned int i; - for (i=0; i<sizeof(*out)/sizeof(uint32x4_t); i++) { - ((uint32x4_t*)out)[i] = ((const uint32x4_t*)a)[i] + ((const uint32x4_t*)b)[i]; - } - /* - unsigned int i; - for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { - out->limb[i] = a->limb[i] + b->limb[i]; - } - */ -} - -void -p448_sub_RAW ( - p448_t *out, - const p448_t *a, - const p448_t *b -) { - unsigned int i; - for (i=0; i<sizeof(*out)/sizeof(uint32x4_t); i++) { - ((uint32x4_t*)out)[i] = ((const uint32x4_t*)a)[i] - ((const uint32x4_t*)b)[i]; - } - /* - unsigned int i; - for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { - out->limb[i] = a->limb[i] - b->limb[i]; - } - */ -} - -void -p448_neg_RAW ( - p448_t *out, - const p448_t *a -) { - unsigned int i; - for (i=0; i<sizeof(*out)/sizeof(uint32x4_t); i++) { - ((uint32x4_t*)out)[i] = -((const uint32x4_t*)a)[i]; - } - /* - unsigned int i; - for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { - out->limb[i] = -a->limb[i]; - } - */ -} - -void -p448_addw ( - p448_t *a, - uint32_t x -) { - a->limb[0] += x; -} - -void -p448_subw ( - p448_t *a, - uint32_t x -) { - a->limb[0] -= x; -} - -void -p448_copy ( - p448_t *out, - const p448_t *a -) { - *out = *a; -} - -void -p448_bias ( - p448_t *a, - int amt -) { - uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; - uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; - uint32x4_t *aa = (uint32x4_t*) a; - aa[0] += lo; - aa[1] += lo; - aa[2] += hi; - aa[3] += lo; -} - -void -p448_weak_reduce ( - p448_t *a -) { - uint64_t mask = (1ull<<28) - 1; - uint64_t tmp = a->limb[15] >> 28; - int i; - a->limb[8] += tmp; - for (i=15; i>0; i--) { - a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); - } - a->limb[0] = (a->limb[0] & mask) + tmp; -} - -#ifdef __cplusplus -}; /* extern "C" */ -#endif - -#endif /* __P448_H__ */ diff --git a/src/p448/arch_neon_experimental/p448.c b/src/p448/arch_neon_experimental/p448.c index 0e2dc5d..6338d24 100644 --- a/src/p448/arch_neon_experimental/p448.c +++ 
b/src/p448/arch_neon_experimental/p448.c @@ -179,7 +179,7 @@ p448_mul ( VMAC(vmlsl.s32,_a1b,_al2_1,_bl0_1,1) VMAC(vmlal.s32,_a1b,_al0_0,_bs0_0,1) - VOP3(vsra.s64,_a0a,_a0b,"#28") + VOP3(vsra.u64,_a0a,_a0b,"#28") VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1) VOP2(vmovn.i64,_a0b_0,_a0b) @@ -190,7 +190,7 @@ p448_mul ( VMAC(vmull.s32,_a0a,_as2_0,_bs2_1,0) VOP2(vmovn.i64,_a0b_1,_a1b) VMAC(vmlal.s32,_a0a,_as2_1,_bs2_0,0) - VOP3(vsra.s64,_a1a,_a1b,"#28") + VOP3(vsra.u64,_a1a,_a1b,"#28") VMAC(vmlal.s32,_a0a,_as0_0,_bh0_1,0) VOP2(vbic.i32,_a0b,"#0xf0000000") VMAC(vmlal.s32,_a0a,_as0_1,_bh0_0,0) @@ -227,7 +227,7 @@ p448_mul ( VMAC(vmlal.s32,_a1b,_al0_0,_bs0_1,1) VMAC(vmlal.s32,_a1b,_al0_1,_bs0_0,1) - VOP3(vsra.s64,_a0a,_a0b,"#28") + VOP3(vsra.u64,_a0a,_a0b,"#28") VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0) VOP2(vmovn.i64,_a0b_0,_a0b) @@ -237,7 +237,7 @@ p448_mul ( VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0) VOP2(vmovn.i64,_a0b_1,_a1b) VMAC(vmlal.s32,_a0a,_as0_0,_bh2_0,0) - VOP3(vsra.s64,_a1a,_a1b,"#28") + VOP3(vsra.u64,_a1a,_a1b,"#28") VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0) VOP2(vbic.i32,_a0b,"#0xf0000000") VMAC(vmlal.s32,_a0a,_as2_0,_bh0_0,0) @@ -275,7 +275,7 @@ p448_mul ( VMAC(vmlal.s32,_a1b,_al2_0,_bs0_0,1) VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1) - VOP3(vsra.s64,_a0a,_a0b,"#28") + VOP3(vsra.u64,_a0a,_a0b,"#28") VOP2(vmovn.i64,_a0b_0,_a0b) VOP2(vswp,_a1b_1,_a1a_0) @@ -284,7 +284,7 @@ p448_mul ( VMAC(vmull.s32,_a0a,_as0_0,_bh2_1,0) VOP2(vmovn.i64,_a0b_1,_a1b) VMAC(vmlal.s32,_a0a,_as0_1,_bh2_0,0) - VOP3(vsra.s64,_a1a,_a1b,"#28") + VOP3(vsra.u64,_a1a,_a1b,"#28") VMAC(vmlal.s32,_a0a,_as2_0,_bh0_1,0) VOP2(vbic.i32,_a0b,"#0xf0000000") VMAC(vmlal.s32,_a0a,_as2_1,_bh0_0,0) @@ -321,14 +321,14 @@ p448_mul ( VMAC(vmlal.s32,_a1b,_al2_0,_bs0_1,1) VMAC(vmlal.s32,_a1b,_al2_1,_bs0_0,1) - VOP3(vsra.s64,_a0a,_a0b,"#28") + VOP3(vsra.u64,_a0a,_a0b,"#28") VOP2(vmovn.i64,_a0b_0,_a0b) VOP2(vswp,_a1b_1,_a1a_0) VOP3(vadd.i64,_a0a,_a0a,_a1b) VOP2(vmovn.i64,_a0b_1,_a0a) - VOP3(vsra.s64,_a1a,_a0a,"#28") + VOP3(vsra.u64,_a1a,_a0a,"#28") VOP2(vbic.i32,_a0b,"#0xf0000000") @@ -376,43 +376,43 @@ p448_sqr ( __asm__ __volatile__ ( "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t" - VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) - VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) - VOP3(vadd.i32,_as0,_bl0,_bh0) + VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) /* 0 .. 2^30 */ + VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) /* +- 2^29 */ + VOP3(vadd.i32,_as0,_bl0,_bh0) /* 0 .. 2^30 */ "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t" - VOP3(vadd.i32,_bs2,_bl2,_bh2) + VOP3(vadd.i32,_bs2,_bl2,_bh2) /* 0 .. 2^30 */ VOP2(vmov,_as2,_bs2) - VMAC(vqdmull.s32,_a0b,_as0_1,_bs2_1,0) - VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0) - VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0) + VMAC(vqdmull.s32,_a0b,_as0_1,_bs2_1,0) /* 0 .. 8 * 2^58. danger for vqdmlal is 32 */ + VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0) /* 0 .. 12 */ + VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0) /* 0 .. 14 */ - VMAC(vqdmull.s32,_a1b,_as0_1,_bs2_1,1) - VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1) - VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1) + VMAC(vqdmull.s32,_a1b,_as0_1,_bs2_1,1) /* 0 .. 8 */ + VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1) /* 0 .. 14 */ + VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1) /* 0 .. 16 */ - VOP2(vmov,_a0a,_a0b) - VMAC(vqdmlal.s32,_a0a,_bh0_1,_bh2_1,0) - VMAC(vmlal.s32,_a0a,_bh2_0,_bh2_0,0) - VMAC(vmlal.s32,_a0a,_bh0_0,_bl0_0,0) + VOP2(vmov,_a0a,_a0b) /* 0 .. 14 */ + VMAC(vqdmlal.s32,_a0a,_bh0_1,_bh2_1,0) /* 0 .. 16 */ + VMAC(vmlal.s32,_a0a,_bh2_0,_bh2_0,0) /* 0 .. 17 */ + VMAC(vmlal.s32,_a0a,_bh0_0,_bl0_0,0) /* 0 .. 
18 */ - VMAC(vqdmlsl.s32,_a0b,_bl0_1,_bl2_1,0) - VMAC(vmlsl.s32,_a0b,_bl2_0,_bl2_0,0) - VMAC(vmlal.s32,_a0b,_bl0_0,_bs0_0,0) + VMAC(vqdmlsl.s32,_a0b,_bl0_1,_bl2_1,0) /*-2 .. 14 */ + VMAC(vmlsl.s32,_a0b,_bl2_0,_bl2_0,0) /*-3 .. 14 */ + VMAC(vmlal.s32,_a0b,_bl0_0,_bs0_0,0) /*-4 .. 15 */ VOP2(vmov,_a1a,_a1b) - VMAC(vqdmlal.s32,_a1a,_bh0_1,_bh2_1,1) - VMAC(vmlal.s32,_a1a,_bh2_0,_bh2_0,1) - VMAC(vmlal.s32,_a1a,_bh0_0,_bl0_0,1) + VMAC(vqdmlal.s32,_a1a,_bh0_1,_bh2_1,1) /* 0 .. 18 */ + VMAC(vmlal.s32,_a1a,_bh2_0,_bh2_0,1) /* 0 .. 19 */ + VMAC(vmlal.s32,_a1a,_bh0_0,_bl0_0,1) /* 0 .. 20 */ VOP2(vswp,_a0b_1,_a0a_0) - VMAC(vqdmlsl.s32,_a1b,_bl0_1,_bl2_1,1) - VMAC(vmlsl.s32,_a1b,_bl2_0,_bl2_0,1) - VMAC(vmlal.s32,_a1b,_bl0_0,_bs0_0,1) + VMAC(vqdmlsl.s32,_a1b,_bl0_1,_bl2_1,1) /*-2 .. 16 */ + VMAC(vmlsl.s32,_a1b,_bl2_0,_bl2_0,1) /*-3 .. 16 */ + VMAC(vmlal.s32,_a1b,_bl0_0,_bs0_0,1) /*-4 .. 17 */ - VOP3(vsra.s64,_a0a,_a0b,"#28") + VOP3(vsra.u64,_a0a,_a0b,"#28") VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1) VOP2(vmovn.i64,_a0b_0,_a0b) @@ -420,35 +420,35 @@ p448_sqr ( VOP3(vadd.i64,_a1b,_a0a,_a1b) - VMAC(vqdmull.s32,_a0a,_as2_0,_bs2_1,0) + VMAC(vqdmull.s32,_a0a,_as2_0,_bs2_1,0) /* 0 .. 8 */ VOP2(vmovn.i64,_a0b_1,_a1b) - VOP3(vsra.s64,_a1a,_a1b,"#28") - VMAC(vqdmlal.s32,_a0a,_as0_0,_bh0_1,0) + VOP3(vsra.u64,_a1a,_a1b,"#28") + VMAC(vqdmlal.s32,_a0a,_as0_0,_bh0_1,0) /* 0 .. 12 */ VOP2(vbic.i32,_a0b,"#0xf0000000") "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t" - VMAC(vqdmull.s32,_a1b,_as2_0,_bs2_1,1) - VMAC(vqdmlal.s32,_a1b,_as0_0,_bh0_1,1) + VMAC(vqdmull.s32,_a1b,_as2_0,_bs2_1,1) /* 0 .. 8 */ + VMAC(vqdmlal.s32,_a1b,_as0_0,_bh0_1,1) /* 0 .. 12 */ - VOP2(vmov,_a0b_1,_a0a_1) - VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0) - VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1) - VMAC(vqdmlal.s32,_a0a,_bh2_0,_bh2_1,0) - VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl0_1,0) + VOP2(vmov,_a0b,_a0a) /* 0 .. 12 */ + VMAC(vqdmlal.s32,_a0a,_bh2_0,_bh2_1,0) /* 0 .. 14 */ + VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl0_1,0) /* 0 .. 16 */ - VMAC(vqdmlsl.s32,_a0b,_bl2_0,_bl2_1,0) - VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs0_1,0) + VMAC(vqdmlsl.s32,_a0b,_bl2_0,_bl2_1,0) /*-2 .. 12 */ + VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs0_1,0) /*-4 .. 14 */ + VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1) + VOP3(vadd.i64,_a0b_0,_a0b_0,_a1a_0) - VOP2(vmov,_a1a,_a1b) - VMAC(vqdmlal.s32,_a1a,_bh2_0,_bh2_1,1) - VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl0_1,1) + VOP2(vmov,_a1a,_a1b) /* 0 .. 12 */ + VMAC(vqdmlal.s32,_a1a,_bh2_0,_bh2_1,1) /* 0 .. 14 */ + VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl0_1,1) /* 0 .. 16 */ VOP2(vswp,_a0b_1,_a0a_0) - VMAC(vqdmlsl.s32,_a1b,_bl2_0,_bl2_1,1) - VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs0_1,1) + VMAC(vqdmlsl.s32,_a1b,_bl2_0,_bl2_1,1) /*-2 .. 12 */ + VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs0_1,1) /*-4 .. 
14 */ - VOP3(vsra.s64,_a0a,_a0b,"#28") + VOP3(vsra.u64,_a0a,_a0b,"#28") VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0) VOP2(vmovn.i64,_a0b_0,_a0b) @@ -458,7 +458,7 @@ p448_sqr ( VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0) VOP2(vmovn.i64,_a0b_1,_a1b) VMAC(vqdmlal.s32,_a0a,_as0_0,_bh2_0,0) - VOP3(vsra.s64,_a1a,_a1b,"#28") + VOP3(vsra.u64,_a1a,_a1b,"#28") VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0) VOP2(vbic.i32,_a0b,"#0xf0000000") "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t" @@ -490,7 +490,7 @@ p448_sqr ( VMAC(vmlal.s32,_a1b,_bl0_1,_bs0_1,1) VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1) - VOP3(vsra.s64,_a0a,_a0b,"#28") + VOP3(vsra.u64,_a0a,_a0b,"#28") VOP2(vmovn.i64,_a0b_0,_a0b) VOP2(vswp,_a1b_1,_a1a_0) @@ -498,7 +498,7 @@ p448_sqr ( VMAC(vqdmull.s32,_a0a,_as0_0,_bh2_1,0) VOP2(vmovn.i64,_a0b_1,_a1b) - VOP3(vsra.s64,_a1a,_a1b,"#28") + VOP3(vsra.u64,_a1a,_a1b,"#28") VMAC(vqdmlal.s32,_a0a,_as2_0,_bh0_1,0) VOP2(vbic.i32,_a0b,"#0xf0000000") "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t" @@ -524,14 +524,14 @@ p448_sqr ( VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_1,1) VMAC(vqdmlal.s32,_a1b,_bl2_0,_bs0_1,1) - VOP3(vsra.s64,_a0a,_a0b,"#28") + VOP3(vsra.u64,_a0a,_a0b,"#28") VOP2(vmovn.i64,_a0b_0,_a0b) VOP2(vswp,_a1b_1,_a1a_0) VOP3(vadd.i64,_a0a,_a0a,_a1b) VOP2(vmovn.i64,_a0b_1,_a0a) - VOP3(vsra.s64,_a1a,_a0a,"#28") + VOP3(vsra.u64,_a1a,_a0a,"#28") VOP2(vbic.i32,_a0b,"#0xf0000000") diff --git a/test/test_arithmetic.c b/test/test_arithmetic.c index d1bc3f2..ed88f66 100644 --- a/test/test_arithmetic.c +++ b/test/test_arithmetic.c @@ -132,12 +132,14 @@ static mask_t test_mul_sqr ( const mpz_t y, word_t word ) { - field_a_t xx,yy,tt; - mpz_t t; + ANALYZE_THIS_ROUTINE_CAREFULLY; + field_a_t xx,yy,tt,zz; + mpz_t t, z; mask_t succ = MASK_SUCCESS; succ = mpz_to_field(xx,x); succ &= mpz_to_field(yy,y); mpz_init(t); + mpz_init(z); field_mul(tt,xx,yy); mpz_mul(t,x,y); @@ -150,17 +152,26 @@ static mask_t test_mul_sqr ( field_sqr(tt,xx); mpz_mul(t,x,x); succ &= field_assert_eq_gmp("sqrx",xx,yy,tt,t,0,1.1); - + field_sqr(tt,yy); mpz_mul(t,y,y); succ &= field_assert_eq_gmp("sqy",xx,yy,tt,t,0,1.1); + field_add_nr(zz,xx,xx); + mpz_add(z,x,x); + mpz_mul(t,z,z); + field_mul(tt,zz,zz); + succ &= field_assert_eq_gmp("msr4",xx,yy,tt,t,0,1.1); + field_sqr(tt,zz); + succ &= field_assert_eq_gmp("sqr4",xx,yy,tt,t,0,1.1); + if (!succ) { field_print(" x", xx); field_print(" y", yy); } mpz_clear(t); + mpz_clear(z); return succ; } From 9ce5cbf53ca27a11f18b07f80b8c23ec938f0336 Mon Sep 17 00:00:00 2001 From: Mike Hamburg Date: Sun, 22 Mar 2015 18:47:14 -0700 Subject: [PATCH 15/15] perf improvement in keygen, sign --- src/include/constant_time.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/include/constant_time.h b/src/include/constant_time.h index 405c2f5..b114146 100644 --- a/src/include/constant_time.h +++ b/src/include/constant_time.h @@ -12,6 +12,7 @@ #define __CONSTANT_TIME_H__ 1 #include "word.h" +#include <string.h> /* * Constant-time operations on hopefully-compile-time-sized memory @@ -148,7 +149,7 @@ constant_time_lookup ( const unsigned char *table = (const unsigned char *)table_; word_t j,k; - really_memset(out, 0, elem_bytes); + memset(out, 0, elem_bytes); for (j=0; j
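Aside on the final hunk: constant_time_lookup first zeroes the output (the memset this patch switches to) and then ORs every table element into it under a mask that is all-ones only at the secret index, so the sequence of memory accesses is independent of that index. A hedged sketch of that shape with simplified types (a hypothetical ct_lookup_sketch, not the header's exact code):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void ct_lookup_sketch(uint8_t *out, const uint8_t *table,
                                 size_t n_elems, size_t elem_bytes, size_t idx) {
        size_t i, j;
        memset(out, 0, elem_bytes);
        for (i = 0; i < n_elems; i++) {
            size_t x = i ^ idx;  /* zero only at the wanted index */
            uint8_t mask = (uint8_t)(((x | (0 - x)) >> (8 * sizeof(size_t) - 1)) - 1);
            for (j = 0; j < elem_bytes; j++)
                out[j] |= table[i * elem_bytes + j] & mask; /* accumulate under mask */
        }
    }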