From 59ed8f566caa566285dc5a22a95c5362e2c44dde Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Mon, 16 Mar 2015 16:23:41 -0700 Subject: [PATCH] change gf to a struct so that its alignment works on earlier clang --- Makefile | 3 +- include/decaf.h | 9 ++-- src/decaf.c | 113 +++++++++++++++++++++++------------------------ src/decaf_fast.c | 77 ++++++++++++++++---------------- test/bench.c | 10 ++--- 5 files changed, 108 insertions(+), 104 deletions(-) diff --git a/Makefile b/Makefile index ade95c9..107f4d9 100644 --- a/Makefile +++ b/Makefile @@ -70,7 +70,8 @@ LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \ build/$(FIELD).o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o \ build/f_arithmetic.o build/arithmetic.o -DECAFCOMPONENTS= build/$(DECAF).o build/shake.o build/decaf_crypto.o build/decaf_tables.o +DECAFCOMPONENTS= build/$(DECAF).o build/shake.o build/decaf_crypto.o build/decaf_tables.o \ + build/$(FIELD).o build/f_arithmetic.o # TODO TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \ build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o build/magic.o \ diff --git a/include/decaf.h b/include/decaf.h index b449b96..40203ea 100644 --- a/include/decaf.h +++ b/include/decaf.h @@ -60,10 +60,13 @@ typedef uint32_t decaf_word_t, decaf_bool_t; /** Number of bytes in a serialized scalar. */ #define DECAF_448_SCALAR_BYTES 56 +/** Galois field element internal structure */ +typedef struct gf_s { + decaf_word_t limb[DECAF_448_LIMBS]; +} __attribute__((aligned(32))) gf_s, gf[1]; + /** Twisted Edwards (-1,d-1) extended homogeneous coordinates */ -typedef struct decaf_448_point_s { - decaf_word_t x[DECAF_448_LIMBS],y[DECAF_448_LIMBS],z[DECAF_448_LIMBS],t[DECAF_448_LIMBS]; -} __attribute__((aligned(32))) decaf_448_point_t[1]; +typedef struct decaf_448_point_s { gf x,y,z,t; } decaf_448_point_t[1]; /** Precomputed table based on a point. Can be trivial implementation. */ struct decaf_448_precomputed_s; diff --git a/src/decaf.c b/src/decaf.c index 88585b6..d134447 100644 --- a/src/decaf.c +++ b/src/decaf.c @@ -33,15 +33,16 @@ typedef int64_t decaf_sdword_t; static const int QUADRATIC_NONRESIDUE = -1; #define sv static void -typedef decaf_word_t gf[DECAF_448_LIMBS]; -static const gf ZERO = {0}, ONE = {1}, TWO = {2}; +#define snv static void __attribute__((noinline)) +#define siv static inline void __attribute__((always_inline)) +static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}}; #define LMASK ((((decaf_word_t)1)<limb[i] = y->limb[i]); } /** Mostly-unoptimized multiply (PERF), but at least it's unrolled. */ -sv gf_mul (gf c, const gf a, const gf b) { +snv gf_mul (gf c, const gf a, const gf b) { gf aa; gf_cpy(aa,a); decaf_dword_t accum[DECAF_448_LIMBS] = {0}; FOR_LIMB(i, { - FOR_LIMB(j,{ accum[(i+j)%DECAF_448_LIMBS] += (decaf_dword_t)b[i] * aa[j]; }); - aa[(DECAF_448_LIMBS-1-i)^(DECAF_448_LIMBS/2)] += aa[DECAF_448_LIMBS-1-i]; + FOR_LIMB(j,{ accum[(i+j)%DECAF_448_LIMBS] += (decaf_dword_t)b->limb[i] * aa->limb[j]; }); + aa->limb[(DECAF_448_LIMBS-1-i)^(DECAF_448_LIMBS/2)] += aa->limb[DECAF_448_LIMBS-1-i]; }); accum[DECAF_448_LIMBS-1] += accum[DECAF_448_LIMBS-2] >> LBITS; @@ -138,14 +137,14 @@ sv gf_mul (gf c, const gf a, const gf b) { accum[j] += accum[(j-1)%DECAF_448_LIMBS] >> LBITS; accum[(j-1)%DECAF_448_LIMBS] &= LMASK; }); - FOR_LIMB(j, c[j] = accum[j] ); + FOR_LIMB(j, c->limb[j] = accum[j] ); } /** No dedicated square (PERF) */ #define gf_sqr(c,a) gf_mul(c,a,a) /** Inverse square root using addition chain. */ -sv gf_isqrt(gf y, const gf x) { +snv gf_isqrt(gf y, const gf x) { int i; #define STEP(s,m,n) gf_mul(s,m,c); gf_cpy(c,s); for (i=0;i> LBITS; +siv gf_reduce(gf x) { + x->limb[DECAF_448_LIMBS/2] += x->limb[DECAF_448_LIMBS-1] >> LBITS; FOR_LIMB(j,{ - x[j] += x[(j-1)%DECAF_448_LIMBS] >> LBITS; - x[(j-1)%DECAF_448_LIMBS] &= LMASK; + x->limb[j] += x->limb[(j-1)%DECAF_448_LIMBS] >> LBITS; + x->limb[(j-1)%DECAF_448_LIMBS] &= LMASK; }); } /** Add mod p. Conservatively always weak-reduce. (PERF) */ sv gf_add ( gf x, const gf y, const gf z ) { - FOR_LIMB(i, x[i] = y[i] + z[i] ); + FOR_LIMB(i, x->limb[i] = y->limb[i] + z->limb[i] ); gf_reduce(x); } /** Subtract mod p. Conservatively always weak-reduce. (PERF) */ sv gf_sub ( gf x, const gf y, const gf z ) { - FOR_LIMB(i, x[i] = y[i] - z[i] + 2*P[i] ); + FOR_LIMB(i, x->limb[i] = y->limb[i] - z->limb[i] + 2*P->limb[i] ); gf_reduce(x); } /** Constant time, x = is_z ? z : y */ sv cond_sel(gf x, const gf y, const gf z, decaf_bool_t is_z) { - FOR_LIMB(i, x[i] = (y[i] & ~is_z) | (z[i] & is_z) ); + FOR_LIMB(i, x->limb[i] = (y->limb[i] & ~is_z) | (z->limb[i] & is_z) ); } /** Constant time, if (neg) x=-x; */ -sv cond_neg(gf x, decaf_bool_t neg) { +siv cond_neg(gf x, decaf_bool_t neg) { gf y; gf_sub(y,ZERO,x); cond_sel(x,x,y,neg); } /** Constant time, if (swap) (x,y) = (y,x); */ -sv cond_swap(gf x, gf y, decaf_bool_t swap) { +sv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { FOR_LIMB(i, { - decaf_word_t s = (x[i] ^ y[i]) & swap; - x[i] ^= s; - y[i] ^= s; + decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; + x->limb[i] ^= s; + y->limb[i] ^= s; }); } @@ -210,26 +209,26 @@ sv cond_swap(gf x, gf y, decaf_bool_t swap) { * Mul by signed int. Not constant-time WRT the sign of that int. * Just uses a full mul (PERF) */ -sv gf_mlw(gf a, const gf b, int w) { +siv gf_mlw(gf a, const gf b, int w) { if (w>0) { - gf ww = {w}; + gf ww = {{{w}}}; gf_mul(a,b,ww); } else { - gf ww = {-w}; + gf ww = {{{-w}}}; gf_mul(a,b,ww); gf_sub(a,ZERO,a); } } /** Canonicalize */ -sv gf_canon ( gf a ) { +snv gf_canon ( gf a ) { gf_reduce(a); /* subtract p with borrow */ decaf_sdword_t carry = 0; FOR_LIMB(i, { - carry = carry + a[i] - P[i]; - a[i] = carry & LMASK; + carry = carry + a->limb[i] - P->limb[i]; + a->limb[i] = carry & LMASK; carry >>= LBITS; }); @@ -238,8 +237,8 @@ sv gf_canon ( gf a ) { /* add it back */ FOR_LIMB(i, { - carry = carry + a[i] + (P[i] & addback); - a[i] = carry & LMASK; + carry = carry + a->limb[i] + (P->limb[i] & addback); + a->limb[i] = carry & LMASK; carry >>= LBITS; }); } @@ -250,7 +249,7 @@ static decaf_word_t __attribute__((noinline)) gf_eq(const gf a, const gf b) { gf_sub(c,a,b); gf_canon(c); decaf_word_t ret=0; - FOR_LIMB(i, ret |= c[i] ); + FOR_LIMB(i, ret |= c->limb[i] ); /* Hope the compiler is too dumb to optimize this, thus noinline */ return ((decaf_dword_t)ret - 1) >> WBITS; } @@ -260,7 +259,7 @@ static decaf_word_t hibit(const gf x) { gf y; gf_add(y,x,x); gf_canon(y); - return -(y[0]&1); + return -(y->limb[0]&1); } /* a = use_c ? c : b */ @@ -279,7 +278,7 @@ sv decaf_448_cond_sel ( /** {extra,accum} - sub +? p * Must have extra <= 1 */ -sv decaf_448_subx( +snv decaf_448_subx( decaf_448_scalar_t out, const decaf_word_t accum[DECAF_448_SCALAR_LIMBS], const decaf_448_scalar_t sub, @@ -303,7 +302,7 @@ sv decaf_448_subx( } } -sv decaf_448_montmul ( +snv decaf_448_montmul ( decaf_448_scalar_t out, const decaf_448_scalar_t a, const decaf_448_scalar_t b, @@ -400,7 +399,7 @@ decaf_bool_t decaf_448_scalar_eq ( /* *** API begins here *** */ /** identity = (0,1) */ -const decaf_448_point_t decaf_448_point_identity = {{{0},{1},{1},{0}}}; +const decaf_448_point_t decaf_448_point_identity = {{{{{0}}},{{{1}}},{{{1}}},{{{0}}}}}; void decaf_448_point_encode( unsigned char ser[DECAF_448_SER_BYTES], const decaf_448_point_t p ) { /* Can shave off one mul here; not important but makes consistent with paper */ @@ -428,7 +427,7 @@ void decaf_448_point_encode( unsigned char ser[DECAF_448_SER_BYTES], const decaf int i, k=0, bits=0; decaf_dword_t buf=0; for (i=0; ilimb[i]<=8 || i==DECAF_448_LIMBS-1) && k>=8) { ser[k++]=buf; } @@ -444,17 +443,17 @@ static decaf_bool_t gf_deser(gf s, const unsigned char ser[DECAF_448_SER_BYTES]) for (i=0; i=LBITS || i==DECAF_448_SER_BYTES-1) && k>=LBITS) { - s[k++] = buf & LMASK; + s->limb[k++] = buf & LMASK; } } decaf_sdword_t accum = 0; - FOR_LIMB(i, accum = (accum + s[i] - P[i]) >> WBITS ); + FOR_LIMB(i, accum = (accum + s->limb[i] - P->limb[i]) >> WBITS ); return accum; } /* Constant-time add or subtract */ -sv decaf_448_point_add_sub ( +snv decaf_448_point_add_sub ( decaf_448_point_t p, const decaf_448_point_t q, const decaf_448_point_t r, @@ -512,7 +511,7 @@ decaf_bool_t decaf_448_point_decode ( gf_mul ( a, b, c ); gf_mul ( p->y,a,p->z ); gf_mul ( p->t,p->x,a ); - p->y[0] -= zero; + p->y->limb[0] -= zero; /* TODO: do something safe if ~succ? */ return succ; } diff --git a/src/decaf_fast.c b/src/decaf_fast.c index 254f4b1..e4d4440 100644 --- a/src/decaf_fast.c +++ b/src/decaf_fast.c @@ -40,15 +40,14 @@ static const int QUADRATIC_NONRESIDUE = -1; #define sv static void #define snv static void __attribute__((noinline)) #define siv static inline void __attribute__((always_inline)) -typedef decaf_word_t gf[DECAF_448_LIMBS] __attribute__((aligned(32))); -static const gf ZERO = {0}, ONE = {1}, TWO = {2}; +static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}}; #define LMASK ((((decaf_word_t)1)<limb[i] = y->limb[i]); } /** Mostly-unoptimized multiply, but at least it's unrolled. */ siv gf_mul (gf c, const gf a, const gf b) { @@ -188,18 +187,21 @@ sv cond_neg(gf x, decaf_bool_t neg) { } /** Constant time, if (swap) (x,y) = (y,x); */ -siv cond_swap(gf x, decaf_word_t *__restrict__ y, decaf_bool_t swap) { +siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { int i; #ifdef __clang__ #if 10*__clang_major__ + __clang_minor__ > 35 - _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(4) interleave_count(2)") + _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)") #endif #endif for (i=0; ilimb[i] ^ y->limb[i]) & swap; + x->limb[i] ^= s; + y->limb[i] ^= s; } + /* + constant_time_cond_swap(x,y,sizeof(gf),swap); + */ } /** @@ -226,7 +228,7 @@ static decaf_word_t __attribute__((noinline)) gf_eq(const gf a, const gf b) { gf_sub(c,a,b); gf_canon(c); decaf_word_t ret=0; - FOR_LIMB(i, ret |= c[i] ); + FOR_LIMB(i, ret |= c->limb[i] ); /* Hope the compiler is too dumb to optimize this, thus noinline */ return ((decaf_dword_t)ret - 1) >> WBITS; } @@ -236,15 +238,13 @@ static decaf_word_t hibit(const gf x) { gf y; gf_add(y,x,x); gf_canon(y); - return -(y[0]&1); + return -(y->limb[0]&1); } /** Return high bit of x/2 = low bit of x mod p */ -static decaf_word_t lobit(const gf x) { - gf y; - gf_cpy(y,x); - gf_canon(y); - return -(y[0]&1); +static inline decaf_word_t lobit(gf x) { + gf_canon(x); + return -(x->limb[0]&1); } /* a = use_c ? c : b */ @@ -403,14 +403,14 @@ decaf_bool_t decaf_448_scalar_eq ( /* *** API begins here *** */ /** identity = (0,1) */ -const decaf_448_point_t decaf_448_point_identity = {{{0},{1},{1},{0}}}; +const decaf_448_point_t decaf_448_point_identity = {{{{{0}}},{{{1}}},{{{1}}},{{{0}}}}}; static void gf_encode ( unsigned char ser[DECAF_448_SER_BYTES], gf a ) { gf_canon(a); int i, k=0, bits=0; decaf_dword_t buf=0; for (i=0; ilimb[i]<=8 || i==DECAF_448_LIMBS-1) && k>=8) { ser[k++]=buf; } @@ -450,12 +450,12 @@ static decaf_bool_t gf_deser(gf s, const unsigned char ser[DECAF_448_SER_BYTES]) for (i=0; i=LBITS || i==DECAF_448_SER_BYTES-1) && k>=LBITS) { - s[k++] = buf & LMASK; + s->limb[k++] = buf & LMASK; } } decaf_sdword_t accum = 0; - FOR_LIMB(i, accum = (accum + s[i] - P[i]) >> WBITS ); + FOR_LIMB(i, accum = (accum + s->limb[i] - P->limb[i]) >> WBITS ); return accum; } @@ -518,7 +518,7 @@ decaf_bool_t decaf_448_point_decode ( gf_mul ( a, b, c ); gf_mul ( p->y,a,p->z ); gf_mul ( p->t,p->x,a ); - p->y[0] -= zero; + p->y->limb[0] -= zero; /* TODO: do something safe if ~succ? */ return succ; } @@ -902,7 +902,8 @@ void decaf_448_point_from_hash_nonuniform ( (void)gf_deser(r,ser); gf_canon(r); gf_sqr(a,r); - gf_mlw(urr,a,QUADRATIC_NONRESIDUE); + /* gf_mlw(urr,a,QUADRATIC_NONRESIDUE); */ + gf_sub(urr,ZERO,a); gf_mlw(dee,ONE,EDWARDS_D); gf_add(a,urr,ONE); gf_sub(ur2_d,dee,urr); @@ -1185,7 +1186,7 @@ decaf_bool_t decaf_448_direct_scalarmul ( gf_mul(xz_d, xd, zd); gf_mul(xz_a, xa, za); output_zero = gf_eq(xz_d, ZERO); - xz_d[0] -= output_zero; /* make xz_d always nonzero */ + xz_d->limb[0] -= output_zero; /* make xz_d always nonzero */ zcase = output_zero | gf_eq(xz_a, ZERO); za_zero = gf_eq(za, ZERO); diff --git a/test/bench.c b/test/bench.c index c2812f7..6a1bed2 100644 --- a/test/bench.c +++ b/test/bench.c @@ -704,7 +704,7 @@ int main(int argc, char **argv) { unsigned char dshared[2][32]; when = now(); - for (i=0; i