From 59ed8f566caa566285dc5a22a95c5362e2c44dde Mon Sep 17 00:00:00 2001
From: Michael Hamburg <mike@shiftleft.org>
Date: Mon, 16 Mar 2015 16:23:41 -0700
Subject: [PATCH] change gf to a struct so that its alignment works on earlier
 clang

---
 Makefile         |   3 +-
 include/decaf.h  |   9 ++--
 src/decaf.c      | 113 +++++++++++++++++++++++------------------------
 src/decaf_fast.c |  77 ++++++++++++++++----------------
 test/bench.c     |  10 ++---
 5 files changed, 108 insertions(+), 104 deletions(-)

diff --git a/Makefile b/Makefile
index ade95c9..107f4d9 100644
--- a/Makefile
+++ b/Makefile
@@ -70,7 +70,8 @@ LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
   build/$(FIELD).o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o \
 	build/f_arithmetic.o build/arithmetic.o
 
-DECAFCOMPONENTS= build/$(DECAF).o build/shake.o build/decaf_crypto.o build/decaf_tables.o
+DECAFCOMPONENTS= build/$(DECAF).o build/shake.o build/decaf_crypto.o build/decaf_tables.o \
+	build/$(FIELD).o build/f_arithmetic.o # TODO
 
 TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \
 	build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o build/magic.o \
diff --git a/include/decaf.h b/include/decaf.h
index b449b96..40203ea 100644
--- a/include/decaf.h
+++ b/include/decaf.h
@@ -60,10 +60,13 @@ typedef uint32_t decaf_word_t, decaf_bool_t;
 /** Number of bytes in a serialized scalar. */
 #define DECAF_448_SCALAR_BYTES 56
 
+/** Galois field element: DECAF_448_LIMBS limbs of decaf_word_t, 32-byte aligned; gf is a one-element array of gf_s. */
+typedef struct gf_s {
+    decaf_word_t limb[DECAF_448_LIMBS];
+} __attribute__((aligned(32))) gf_s, gf[1];
+
 /** Twisted Edwards (-1,d-1) extended homogeneous coordinates */
-typedef struct decaf_448_point_s {
-    decaf_word_t x[DECAF_448_LIMBS],y[DECAF_448_LIMBS],z[DECAF_448_LIMBS],t[DECAF_448_LIMBS];
-} __attribute__((aligned(32))) decaf_448_point_t[1];
+typedef struct decaf_448_point_s { gf x,y,z,t; } decaf_448_point_t[1];
 
 /** Precomputed table based on a point.  Can be trivial implementation. */
 struct decaf_448_precomputed_s;
diff --git a/src/decaf.c b/src/decaf.c
index 88585b6..d134447 100644
--- a/src/decaf.c
+++ b/src/decaf.c
@@ -33,15 +33,16 @@ typedef int64_t decaf_sdword_t;
 static const int QUADRATIC_NONRESIDUE = -1;
 
 #define sv static void
-typedef decaf_word_t gf[DECAF_448_LIMBS];
-static const gf ZERO = {0}, ONE = {1}, TWO = {2};
+#define snv static void __attribute__((noinline))
+#define siv static inline void __attribute__((always_inline))
+static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}};
 
 #define LMASK ((((decaf_word_t)1)<<LBITS)-1)
 #if WBITS == 64
-static const gf P = { LMASK, LMASK, LMASK, LMASK, LMASK-1, LMASK, LMASK, LMASK };
+static const gf P = {{{ LMASK, LMASK, LMASK, LMASK, LMASK-1, LMASK, LMASK, LMASK }}};
 #else
-static const gf P = { LMASK,   LMASK, LMASK, LMASK, LMASK, LMASK, LMASK, LMASK,
-		      LMASK-1, LMASK, LMASK, LMASK, LMASK, LMASK, LMASK, LMASK };
+static const gf P = {{{ LMASK,   LMASK, LMASK, LMASK, LMASK, LMASK, LMASK, LMASK,
+		      LMASK-1, LMASK, LMASK, LMASK, LMASK, LMASK, LMASK, LMASK }}};
 #endif
 static const int EDWARDS_D = -39081;
 
@@ -70,24 +71,22 @@ static const decaf_word_t DECAF_MONTGOMERY_FACTOR = (decaf_word_t)(0x3bd440fae91
 /** base = twist of Goldilocks base point (~,19). */
 
 const decaf_448_point_t decaf_448_point_base = {{
-    { LIMB(0xb39a2d57e08c7b),LIMB(0xb38639c75ff281),
-      LIMB(0x2ec981082b3288),LIMB(0x99fe8607e5237c),
-      LIMB(0x0e33fbb1fadd1f),LIMB(0xe714f67055eb4a),
-      LIMB(0xc9ae06d64067dd),LIMB(0xf7be45054760fa) },
-    { LIMB(0xbd8715f551617f),LIMB(0x8c17fbeca8f5fc),
-      LIMB(0xaae0eec209c06f),LIMB(0xce41ad80cbe6b8),
-      LIMB(0xdf360b5c828c00),LIMB(0xaf25b6bbb40e3b),
-      LIMB(0x8ed37f0ce4ed31),LIMB(0x72a1c3214557b9) },
-    { 1 },
-    { LIMB(0x97ca9c8ed8bde9),LIMB(0xf0b780da83304c),
-      LIMB(0x0d79c0a7729a69),LIMB(0xc18d3f24aebc1c),
-      LIMB(0x1fbb5389b3fda5),LIMB(0xbb24f674635948),
-      LIMB(0x723a55709a3983),LIMB(0xe1c0107a823dd4) }
+    {{{ LIMB(0xb39a2d57e08c7b),LIMB(0xb38639c75ff281),
+        LIMB(0x2ec981082b3288),LIMB(0x99fe8607e5237c),
+        LIMB(0x0e33fbb1fadd1f),LIMB(0xe714f67055eb4a),
+        LIMB(0xc9ae06d64067dd),LIMB(0xf7be45054760fa) }}},
+    {{{ LIMB(0xbd8715f551617f),LIMB(0x8c17fbeca8f5fc),
+        LIMB(0xaae0eec209c06f),LIMB(0xce41ad80cbe6b8),
+        LIMB(0xdf360b5c828c00),LIMB(0xaf25b6bbb40e3b),
+        LIMB(0x8ed37f0ce4ed31),LIMB(0x72a1c3214557b9) }}},
+    {{{ 1 }}},
+    {{{ LIMB(0x97ca9c8ed8bde9),LIMB(0xf0b780da83304c),
+        LIMB(0x0d79c0a7729a69),LIMB(0xc18d3f24aebc1c),
+        LIMB(0x1fbb5389b3fda5),LIMB(0xbb24f674635948),
+        LIMB(0x723a55709a3983),LIMB(0xe1c0107a823dd4) }}}
 }};
 
-struct decaf_448_precomputed_s {
-    decaf_448_point_t p[1];
-};
+struct decaf_448_precomputed_s { decaf_448_point_t p[1]; };
 
 /* FIXME: restore */
 // const struct decaf_448_precomputed_s *decaf_448_precomputed_base =
@@ -118,17 +117,17 @@ const size_t alignof_decaf_448_precomputed_s = 32;
 #endif
 
 /** Copy x = y */
-sv gf_cpy(gf x, const gf y) { FOR_LIMB(i, x[i] = y[i]); }
+siv gf_cpy(gf x, const gf y) { FOR_LIMB(i, x->limb[i] = y->limb[i]); }
 
 /** Mostly-unoptimized multiply (PERF), but at least it's unrolled. */
-sv gf_mul (gf c, const gf a, const gf b) {
+snv gf_mul (gf c, const gf a, const gf b) {
     gf aa;
     gf_cpy(aa,a);
     
     decaf_dword_t accum[DECAF_448_LIMBS] = {0};
     FOR_LIMB(i, {
-        FOR_LIMB(j,{ accum[(i+j)%DECAF_448_LIMBS] += (decaf_dword_t)b[i] * aa[j]; });
-        aa[(DECAF_448_LIMBS-1-i)^(DECAF_448_LIMBS/2)] += aa[DECAF_448_LIMBS-1-i];
+        FOR_LIMB(j,{ accum[(i+j)%DECAF_448_LIMBS] += (decaf_dword_t)b->limb[i] * aa->limb[j]; });
+        aa->limb[(DECAF_448_LIMBS-1-i)^(DECAF_448_LIMBS/2)] += aa->limb[DECAF_448_LIMBS-1-i];
     });
     
     accum[DECAF_448_LIMBS-1] += accum[DECAF_448_LIMBS-2] >> LBITS;
@@ -138,14 +137,14 @@ sv gf_mul (gf c, const gf a, const gf b) {
         accum[j] += accum[(j-1)%DECAF_448_LIMBS] >> LBITS;
         accum[(j-1)%DECAF_448_LIMBS] &= LMASK;
     });
-    FOR_LIMB(j, c[j] = accum[j] );
+    FOR_LIMB(j, c->limb[j] = accum[j] );
 }
 
 /** No dedicated square (PERF) */
 #define gf_sqr(c,a) gf_mul(c,a,a)
 
 /** Inverse square root using addition chain. */
-sv gf_isqrt(gf y, const gf x) {
+snv gf_isqrt(gf y, const gf x) {
     int i;
 #define STEP(s,m,n) gf_mul(s,m,c); gf_cpy(c,s); for (i=0;i<n;i++) gf_sqr(c,c);
     gf a, b, c;
@@ -165,44 +164,44 @@ sv gf_isqrt(gf y, const gf x) {
 }
 
 /** Weak reduce mod p. */
-sv gf_reduce(gf x) {
-    x[DECAF_448_LIMBS/2] += x[DECAF_448_LIMBS-1] >> LBITS;
+siv gf_reduce(gf x) {
+    x->limb[DECAF_448_LIMBS/2] += x->limb[DECAF_448_LIMBS-1] >> LBITS;
     FOR_LIMB(j,{
-        x[j] += x[(j-1)%DECAF_448_LIMBS] >> LBITS;
-        x[(j-1)%DECAF_448_LIMBS] &= LMASK;
+        x->limb[j] += x->limb[(j-1)%DECAF_448_LIMBS] >> LBITS;
+        x->limb[(j-1)%DECAF_448_LIMBS] &= LMASK;
     });
 }
 
 /** Add mod p.  Conservatively always weak-reduce. (PERF) */
 sv gf_add ( gf x, const gf y, const gf z ) {
-    FOR_LIMB(i, x[i] = y[i] + z[i] );
+    FOR_LIMB(i, x->limb[i] = y->limb[i] + z->limb[i] );
     gf_reduce(x);
 }
 
 /** Subtract mod p.  Conservatively always weak-reduce. (PERF) */
 sv gf_sub ( gf x, const gf y, const gf z ) {
-    FOR_LIMB(i, x[i] = y[i] - z[i] + 2*P[i] );
+    FOR_LIMB(i, x->limb[i] = y->limb[i] - z->limb[i] + 2*P->limb[i] );
     gf_reduce(x);
 }
 
 /** Constant time, x = is_z ? z : y */
 sv cond_sel(gf x, const gf y, const gf z, decaf_bool_t is_z) {
-    FOR_LIMB(i, x[i] = (y[i] & ~is_z) | (z[i] & is_z) );
+    FOR_LIMB(i, x->limb[i] = (y->limb[i] & ~is_z) | (z->limb[i] & is_z) );
 }
 
 /** Constant time, if (neg) x=-x; */
-sv cond_neg(gf x, decaf_bool_t neg) {
+siv cond_neg(gf x, decaf_bool_t neg) {
     gf y;
     gf_sub(y,ZERO,x);
     cond_sel(x,x,y,neg);
 }
 
 /** Constant time, if (swap) (x,y) = (y,x); */
-sv cond_swap(gf x, gf y, decaf_bool_t swap) {
+sv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
     FOR_LIMB(i, {
-        decaf_word_t s = (x[i] ^ y[i]) & swap;
-        x[i] ^= s;
-        y[i] ^= s;
+        decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
+        x->limb[i] ^= s;
+        y->limb[i] ^= s;
     });
 }
 
@@ -210,26 +209,26 @@ sv cond_swap(gf x, gf y, decaf_bool_t swap) {
  * Mul by signed int.  Not constant-time WRT the sign of that int.
  * Just uses a full mul (PERF)
  */
-sv gf_mlw(gf a, const gf b, int w) {
+siv gf_mlw(gf a, const gf b, int w) {
     if (w>0) {
-        gf ww = {w};
+        gf ww = {{{w}}};
         gf_mul(a,b,ww);
     } else {
-        gf ww = {-w};
+        gf ww = {{{-w}}};
         gf_mul(a,b,ww);
         gf_sub(a,ZERO,a);
     }
 }
 
 /** Canonicalize */
-sv gf_canon ( gf a ) {
+snv gf_canon ( gf a ) {
     gf_reduce(a);
 
     /* subtract p with borrow */
     decaf_sdword_t carry = 0;
     FOR_LIMB(i, {
-        carry = carry + a[i] - P[i];
-        a[i] = carry & LMASK;
+        carry = carry + a->limb[i] - P->limb[i];
+        a->limb[i] = carry & LMASK;
         carry >>= LBITS;
     });
     
@@ -238,8 +237,8 @@ sv gf_canon ( gf a ) {
 
     /* add it back */
     FOR_LIMB(i, {
-        carry = carry + a[i] + (P[i] & addback);
-        a[i] = carry & LMASK;
+        carry = carry + a->limb[i] + (P->limb[i] & addback);
+        a->limb[i] = carry & LMASK;
         carry >>= LBITS;
     });
 }
@@ -250,7 +249,7 @@ static decaf_word_t __attribute__((noinline)) gf_eq(const gf a, const gf b) {
     gf_sub(c,a,b);
     gf_canon(c);
     decaf_word_t ret=0;
-    FOR_LIMB(i, ret |= c[i] );
+    FOR_LIMB(i, ret |= c->limb[i] );
     /* Hope the compiler is too dumb to optimize this, thus noinline */
     return ((decaf_dword_t)ret - 1) >> WBITS;
 }
@@ -260,7 +259,7 @@ static decaf_word_t hibit(const gf x) {
     gf y;
     gf_add(y,x,x);
     gf_canon(y);
-    return -(y[0]&1);
+    return -(y->limb[0]&1);
 }
 
 /* a = use_c ? c : b */
@@ -279,7 +278,7 @@ sv decaf_448_cond_sel (
 /** {extra,accum} - sub +? p
  * Must have extra <= 1
  */
-sv decaf_448_subx(
+snv decaf_448_subx(
     decaf_448_scalar_t out,
     const decaf_word_t accum[DECAF_448_SCALAR_LIMBS],
     const decaf_448_scalar_t sub,
@@ -303,7 +302,7 @@ sv decaf_448_subx(
     }
 }
 
-sv decaf_448_montmul (
+snv decaf_448_montmul (
     decaf_448_scalar_t out,
     const decaf_448_scalar_t a,
     const decaf_448_scalar_t b,
@@ -400,7 +399,7 @@ decaf_bool_t decaf_448_scalar_eq (
 /* *** API begins here *** */    
 
 /** identity = (0,1) */
-const decaf_448_point_t decaf_448_point_identity = {{{0},{1},{1},{0}}};
+const decaf_448_point_t decaf_448_point_identity = {{{{{0}}},{{{1}}},{{{1}}},{{{0}}}}};
 
 void decaf_448_point_encode( unsigned char ser[DECAF_448_SER_BYTES], const decaf_448_point_t p ) {
     /* Can shave off one mul here; not important but makes consistent with paper */
@@ -428,7 +427,7 @@ void decaf_448_point_encode( unsigned char ser[DECAF_448_SER_BYTES], const decaf
     int i, k=0, bits=0;
     decaf_dword_t buf=0;
     for (i=0; i<DECAF_448_LIMBS; i++) {
-        buf |= (decaf_dword_t)a[i]<<bits;
+        buf |= (decaf_dword_t)a->limb[i]<<bits;
         for (bits += LBITS; (bits>=8 || i==DECAF_448_LIMBS-1) && k<DECAF_448_SER_BYTES; bits-=8, buf>>=8) {
             ser[k++]=buf;
         }
@@ -444,17 +443,17 @@ static decaf_bool_t gf_deser(gf s, const unsigned char ser[DECAF_448_SER_BYTES])
     for (i=0; i<DECAF_448_SER_BYTES; i++) {
         buf |= (decaf_dword_t)ser[i]<<bits;
         for (bits += 8; (bits>=LBITS || i==DECAF_448_SER_BYTES-1) && k<DECAF_448_LIMBS; bits-=LBITS, buf>>=LBITS) {
-            s[k++] = buf & LMASK;
+            s->limb[k++] = buf & LMASK;
         }
     }
     
     decaf_sdword_t accum = 0;
-    FOR_LIMB(i, accum = (accum + s[i] - P[i]) >> WBITS );
+    FOR_LIMB(i, accum = (accum + s->limb[i] - P->limb[i]) >> WBITS );
     return accum;
 }
     
 /* Constant-time add or subtract */
-sv decaf_448_point_add_sub (
+snv decaf_448_point_add_sub (
     decaf_448_point_t p,
     const decaf_448_point_t q,
     const decaf_448_point_t r,
@@ -512,7 +511,7 @@ decaf_bool_t decaf_448_point_decode (
     gf_mul ( a, b, c );
     gf_mul ( p->y,a,p->z );
     gf_mul ( p->t,p->x,a );
-    p->y[0] -= zero;
+    p->y->limb[0] -= zero;
     /* TODO: do something safe if ~succ? */
     return succ;
 }
diff --git a/src/decaf_fast.c b/src/decaf_fast.c
index 254f4b1..e4d4440 100644
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -40,15 +40,14 @@ static const int QUADRATIC_NONRESIDUE = -1;
 #define sv static void
 #define snv static void __attribute__((noinline))
 #define siv static inline void __attribute__((always_inline))
-typedef decaf_word_t gf[DECAF_448_LIMBS] __attribute__((aligned(32)));
-static const gf ZERO = {0}, ONE = {1}, TWO = {2};
+static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}};
 
 #define LMASK ((((decaf_word_t)1)<<LBITS)-1)
 #if WBITS == 64
-static const gf P = { LMASK, LMASK, LMASK, LMASK, LMASK-1, LMASK, LMASK, LMASK };
+static const gf P = {{{ LMASK, LMASK, LMASK, LMASK, LMASK-1, LMASK, LMASK, LMASK }}};
 #else
-static const gf P = { LMASK,   LMASK, LMASK, LMASK, LMASK, LMASK, LMASK, LMASK,
-		      LMASK-1, LMASK, LMASK, LMASK, LMASK, LMASK, LMASK, LMASK };
+static const gf P = {{{ LMASK,   LMASK, LMASK, LMASK, LMASK, LMASK, LMASK, LMASK,
+	LMASK-1, LMASK, LMASK, LMASK, LMASK, LMASK, LMASK, LMASK }}};
 #endif
 static const int EDWARDS_D = -39081;
 
@@ -77,19 +76,19 @@ static const decaf_word_t DECAF_MONTGOMERY_FACTOR = (decaf_word_t)(0x3bd440fae91
 /** base = twist of Goldilocks base point (~,19). */
 
 const decaf_448_point_t decaf_448_point_base = {{
-    { LIMB(0xb39a2d57e08c7b),LIMB(0xb38639c75ff281),
-      LIMB(0x2ec981082b3288),LIMB(0x99fe8607e5237c),
-      LIMB(0x0e33fbb1fadd1f),LIMB(0xe714f67055eb4a),
-      LIMB(0xc9ae06d64067dd),LIMB(0xf7be45054760fa) },
-    { LIMB(0xbd8715f551617f),LIMB(0x8c17fbeca8f5fc),
-      LIMB(0xaae0eec209c06f),LIMB(0xce41ad80cbe6b8),
-      LIMB(0xdf360b5c828c00),LIMB(0xaf25b6bbb40e3b),
-      LIMB(0x8ed37f0ce4ed31),LIMB(0x72a1c3214557b9) },
-    { 1 },
-    { LIMB(0x97ca9c8ed8bde9),LIMB(0xf0b780da83304c),
-      LIMB(0x0d79c0a7729a69),LIMB(0xc18d3f24aebc1c),
-      LIMB(0x1fbb5389b3fda5),LIMB(0xbb24f674635948),
-      LIMB(0x723a55709a3983),LIMB(0xe1c0107a823dd4) }
+    {{{ LIMB(0xb39a2d57e08c7b),LIMB(0xb38639c75ff281),
+        LIMB(0x2ec981082b3288),LIMB(0x99fe8607e5237c),
+        LIMB(0x0e33fbb1fadd1f),LIMB(0xe714f67055eb4a),
+        LIMB(0xc9ae06d64067dd),LIMB(0xf7be45054760fa) }}},
+    {{{ LIMB(0xbd8715f551617f),LIMB(0x8c17fbeca8f5fc),
+        LIMB(0xaae0eec209c06f),LIMB(0xce41ad80cbe6b8),
+        LIMB(0xdf360b5c828c00),LIMB(0xaf25b6bbb40e3b),
+        LIMB(0x8ed37f0ce4ed31),LIMB(0x72a1c3214557b9) }}},
+    {{{ 1 }}},
+    {{{ LIMB(0x97ca9c8ed8bde9),LIMB(0xf0b780da83304c),
+        LIMB(0x0d79c0a7729a69),LIMB(0xc18d3f24aebc1c),
+        LIMB(0x1fbb5389b3fda5),LIMB(0xbb24f674635948),
+        LIMB(0x723a55709a3983),LIMB(0xe1c0107a823dd4) }}}
 }};
 
 /* Projective Niels coordinates */
@@ -122,7 +121,7 @@ const size_t alignof_decaf_448_precomputed_s = 32;
 #endif
 
 /** Copy x = y */
-siv gf_cpy(gf x, const gf y) { FOR_LIMB(i, x[i] = y[i]); }
+siv gf_cpy(gf x, const gf y) { FOR_LIMB(i, x->limb[i] = y->limb[i]); }
 
 /** Mostly-unoptimized multiply, but at least it's unrolled. */
 siv gf_mul (gf c, const gf a, const gf b) {
@@ -188,18 +187,21 @@ sv cond_neg(gf x, decaf_bool_t neg) {
 }
 
 /** Constant time, if (swap) (x,y) = (y,x); */
-siv cond_swap(gf x, decaf_word_t *__restrict__ y, decaf_bool_t swap) {
+siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
     int i;
 #ifdef __clang__
 #if 10*__clang_major__ + __clang_minor__ > 35
-    _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(4) interleave_count(2)")
+    _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)")
 #endif
 #endif
     for (i=0; i<DECAF_448_LIMBS; i++) {
-        decaf_word_t s = (x[i] ^ y[i]) & swap;
-        x[i] ^= s;
-        y[i] ^= s;
+        decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
+        x->limb[i] ^= s;
+        y->limb[i] ^= s;
     }
+    /* Disabled alternative to the loop above, kept for reference:
+    constant_time_cond_swap(x,y,sizeof(gf),swap);
+    */
 }
 
 /**
@@ -226,7 +228,7 @@ static decaf_word_t __attribute__((noinline)) gf_eq(const gf a, const gf b) {
     gf_sub(c,a,b);
     gf_canon(c);
     decaf_word_t ret=0;
-    FOR_LIMB(i, ret |= c[i] );
+    FOR_LIMB(i, ret |= c->limb[i] );
     /* Hope the compiler is too dumb to optimize this, thus noinline */
     return ((decaf_dword_t)ret - 1) >> WBITS;
 }
@@ -236,15 +238,13 @@ static decaf_word_t hibit(const gf x) {
     gf y;
     gf_add(y,x,x);
     gf_canon(y);
-    return -(y[0]&1);
+    return -(y->limb[0]&1);
 }
 
 /** Return high bit of x/2 = low bit of x mod p */
-static decaf_word_t lobit(const gf x) {
-    gf y;
-    gf_cpy(y,x);
-    gf_canon(y);
-    return -(y[0]&1);
+static inline decaf_word_t lobit(gf x) {
+    gf_canon(x); /* NOTE: x is no longer copied first, so the caller's element is canonicalized in place */
+    return -(x->limb[0]&1);
 }
 
 /* a = use_c ? c : b */
@@ -403,14 +403,14 @@ decaf_bool_t decaf_448_scalar_eq (
 /* *** API begins here *** */    
 
 /** identity = (0,1) */
-const decaf_448_point_t decaf_448_point_identity = {{{0},{1},{1},{0}}};
+const decaf_448_point_t decaf_448_point_identity = {{{{{0}}},{{{1}}},{{{1}}},{{{0}}}}};
 
 static void gf_encode ( unsigned char ser[DECAF_448_SER_BYTES], gf a ) {
     gf_canon(a);
     int i, k=0, bits=0;
     decaf_dword_t buf=0;
     for (i=0; i<DECAF_448_LIMBS; i++) {
-        buf |= (decaf_dword_t)a[i]<<bits;
+        buf |= (decaf_dword_t)a->limb[i]<<bits;
         for (bits += LBITS; (bits>=8 || i==DECAF_448_LIMBS-1) && k<DECAF_448_SER_BYTES; bits-=8, buf>>=8) {
             ser[k++]=buf;
         }
@@ -450,12 +450,12 @@ static decaf_bool_t gf_deser(gf s, const unsigned char ser[DECAF_448_SER_BYTES])
     for (i=0; i<DECAF_448_SER_BYTES; i++) {
         buf |= (decaf_dword_t)ser[i]<<bits;
         for (bits += 8; (bits>=LBITS || i==DECAF_448_SER_BYTES-1) && k<DECAF_448_LIMBS; bits-=LBITS, buf>>=LBITS) {
-            s[k++] = buf & LMASK;
+            s->limb[k++] = buf & LMASK;
         }
     }
     
     decaf_sdword_t accum = 0;
-    FOR_LIMB(i, accum = (accum + s[i] - P[i]) >> WBITS );
+    FOR_LIMB(i, accum = (accum + s->limb[i] - P->limb[i]) >> WBITS );
     return accum;
 }
     
@@ -518,7 +518,7 @@ decaf_bool_t decaf_448_point_decode (
     gf_mul ( a, b, c );
     gf_mul ( p->y,a,p->z );
     gf_mul ( p->t,p->x,a );
-    p->y[0] -= zero;
+    p->y->limb[0] -= zero;
     /* TODO: do something safe if ~succ? */
     return succ;
 }
@@ -902,7 +902,8 @@ void decaf_448_point_from_hash_nonuniform (
     (void)gf_deser(r,ser);
     gf_canon(r);
     gf_sqr(a,r);
-    gf_mlw(urr,a,QUADRATIC_NONRESIDUE);
+    /* QUADRATIC_NONRESIDUE is -1, so negate directly instead of gf_mlw(urr,a,QUADRATIC_NONRESIDUE) */
+    gf_sub(urr,ZERO,a);
     gf_mlw(dee,ONE,EDWARDS_D);
     gf_add(a,urr,ONE);
     gf_sub(ur2_d,dee,urr);
@@ -1185,7 +1186,7 @@ decaf_bool_t decaf_448_direct_scalarmul (
     gf_mul(xz_d, xd, zd);
     gf_mul(xz_a, xa, za);
     output_zero = gf_eq(xz_d, ZERO);
-    xz_d[0] -= output_zero; /* make xz_d always nonzero */
+    xz_d->limb[0] -= output_zero; /* make xz_d always nonzero */
     zcase = output_zero | gf_eq(xz_a, ZERO);
     za_zero = gf_eq(za, ZERO);
 
diff --git a/test/bench.c b/test/bench.c
index c2812f7..6a1bed2 100644
--- a/test/bench.c
+++ b/test/bench.c
@@ -704,7 +704,7 @@ int main(int argc, char **argv) {
     unsigned char dshared[2][32];
     
     when = now();
-    for (i=0; i<nbase/10; i++) {
+    for (i=0; i<nbase; i++) {
         decaf_448_derive_private_key(dpriv[i&1], sym[i&1]);
     }
     when = now() - when;
@@ -714,7 +714,7 @@ int main(int argc, char **argv) {
     decaf_448_private_to_public(dpub[1], dpriv[1]);
     
     when = now();
-    for (i=0; i<nbase/10; i++) {
+    for (i=0; i<nbase; i++) {
         decaf_bool_t ret = decaf_448_shared_secret(dshared[i&1], 32, dpriv[i&1], dpub[(i+1)&1]);
         if (ret != DECAF_SUCCESS) {
             printf("BUG: shared secret returns failure on %d.\n", i);
@@ -732,7 +732,7 @@ int main(int argc, char **argv) {
     const char *dmessage = "hello world";
     const char *dnessage = "Jello world";
     when = now();
-    for (i=0; i<nbase/10; i++) {
+    for (i=0; i<nbase; i++) {
         decaf_448_sign(dsig, dpriv[0], (const unsigned char *)dmessage, 11);
     }
     when = now() - when;
@@ -743,7 +743,7 @@ int main(int argc, char **argv) {
     }
     
     when = now();
-    for (i=0; i<nbase/10; i++) {
+    for (i=0; i<nbase; i++) {
         decaf_bool_t ret = decaf_448_verify(dsig, dpub[0],
             (const unsigned char *)((i&1) ? dmessage : dnessage), 11);
         if ((i&1) && ~ret) {
@@ -762,7 +762,7 @@ int main(int argc, char **argv) {
         alignof_decaf_448_precomputed_s, sizeof_decaf_448_precomputed_s));
     assert(dpre);
     when = now();
-    for (i=0; i<nbase/10; i++) {
+    for (i=0; i<nbase; i++) {
         decaf_448_precompute(dpre, Da);
     }
     when = now() - when;