| @@ -109,13 +109,6 @@ cond_neg(gf x, mask_t neg) { | |||||
| static INLINE void | static INLINE void | ||||
| cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) { | cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) { | ||||
| constant_time_cond_swap(x,y,sizeof(gf_s),swap); | constant_time_cond_swap(x,y,sizeof(gf_s),swap); | ||||
| /* | |||||
| UNROLL for (unsigned int i=0; i<sizeof(x->limb)/sizeof(x->limb[0]); i++) { | |||||
| decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; | |||||
| x->limb[i] ^= s; | |||||
| y->limb[i] ^= s; | |||||
| } | |||||
| */ | |||||
| } | } | ||||
| /** Inverse square root using addition chain. */ | /** Inverse square root using addition chain. */ | ||||
| @@ -133,7 +126,7 @@ static void | |||||
| gf_invert(gf y, const gf x) { | gf_invert(gf y, const gf x) { | ||||
| gf t1, t2; | gf t1, t2; | ||||
| gf_sqr(t1, x); // o^2 | gf_sqr(t1, x); // o^2 | ||||
| decaf_bool_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o | |||||
| mask_t ret = gf_isqrt_chk(t2, t1, 0); // +-1/sqrt(o^2) = +-1/o | |||||
| (void)ret; assert(ret); | (void)ret; assert(ret); | ||||
| gf_sqr(t1, t2); | gf_sqr(t1, t2); | ||||
| gf_mul(t2, t1, x); // not direct to y in case of alias. | gf_mul(t2, t1, x); // not direct to y in case of alias. | ||||
| @@ -142,7 +135,7 @@ gf_invert(gf y, const gf x) { | |||||
| /** Mul by signed int. Not constant-time WRT the sign of that int. */ | /** Mul by signed int. Not constant-time WRT the sign of that int. */ | ||||
| static INLINE void | static INLINE void | ||||
| gf_mulw_sgn(gf c, const gf a, int w) { | |||||
| gf_mulw_sgn(gf c, const gf a, int32_t w) { | |||||
| if (w>0) { | if (w>0) { | ||||
| gf_mulw(c, a, w); | gf_mulw(c, a, w); | ||||
| } else { | } else { | ||||
| @@ -152,7 +145,7 @@ gf_mulw_sgn(gf c, const gf a, int w) { | |||||
| } | } | ||||
| /** Return high bit of x = low bit of 2x mod p */ | /** Return high bit of x = low bit of 2x mod p */ | ||||
| static decaf_word_t hibit(const gf x) { | |||||
| static mask_t hibit(const gf x) { | |||||
| gf y; | gf y; | ||||
| gf_add(y,x,x); | gf_add(y,x,x); | ||||
| gf_strong_reduce(y); | gf_strong_reduce(y); | ||||
| @@ -161,7 +154,7 @@ static decaf_word_t hibit(const gf x) { | |||||
| #if COFACTOR==8 | #if COFACTOR==8 | ||||
| /** Return low bit of x (after strong reduction mod p) */ | /** Return low bit of x (after strong reduction mod p) */ | ||||
| static decaf_word_t lobit(const gf x) { | |||||
| static mask_t lobit(const gf x) { | |||||
| gf y; | gf y; | ||||
| gf_copy(y,x); | gf_copy(y,x); | ||||
| gf_strong_reduce(y); | gf_strong_reduce(y); | ||||
| @@ -873,9 +866,9 @@ static INLINE void | |||||
| constant_time_lookup_xx ( | constant_time_lookup_xx ( | ||||
| void *__restrict__ out_, | void *__restrict__ out_, | ||||
| const void *table_, | const void *table_, | ||||
| decaf_word_t elem_bytes, | |||||
| decaf_word_t n_table, | |||||
| decaf_word_t idx | |||||
| word_t elem_bytes, | |||||
| word_t n_table, | |||||
| word_t idx | |||||
| ) { | ) { | ||||
| constant_time_lookup(out_,table_,elem_bytes,n_table,idx); | constant_time_lookup(out_,table_,elem_bytes,n_table,idx); | ||||
| } | } | ||||
| @@ -928,12 +921,12 @@ void API_NS(point_scalarmul) ( | |||||
| for (; i>=0; i-=WINDOW) { | for (; i>=0; i-=WINDOW) { | ||||
| /* Fetch another block of bits */ | /* Fetch another block of bits */ | ||||
| decaf_word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS); | |||||
| word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS); | |||||
| if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | ||||
| bits ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | bits ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | ||||
| } | } | ||||
| bits &= WINDOW_MASK; | bits &= WINDOW_MASK; | ||||
| decaf_word_t inv = (bits>>(WINDOW-1))-1; | |||||
| mask_t inv = (bits>>(WINDOW-1))-1; | |||||
| bits ^= inv; | bits ^= inv; | ||||
| /* Add in from table. Compute t only on last iteration. */ | /* Add in from table. Compute t only on last iteration. */ | ||||
| @@ -993,7 +986,7 @@ void API_NS(point_double_scalarmul) ( | |||||
| for (; i>=0; i-=WINDOW) { | for (; i>=0; i-=WINDOW) { | ||||
| /* Fetch another block of bits */ | /* Fetch another block of bits */ | ||||
| decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), | |||||
| word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), | |||||
| bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); | bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); | ||||
| if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | ||||
| bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | ||||
| @@ -1001,8 +994,8 @@ void API_NS(point_double_scalarmul) ( | |||||
| } | } | ||||
| bits1 &= WINDOW_MASK; | bits1 &= WINDOW_MASK; | ||||
| bits2 &= WINDOW_MASK; | bits2 &= WINDOW_MASK; | ||||
| decaf_word_t inv1 = (bits1>>(WINDOW-1))-1; | |||||
| decaf_word_t inv2 = (bits2>>(WINDOW-1))-1; | |||||
| mask_t inv1 = (bits1>>(WINDOW-1))-1; | |||||
| mask_t inv2 = (bits2>>(WINDOW-1))-1; | |||||
| bits1 ^= inv1; | bits1 ^= inv1; | ||||
| bits2 ^= inv2; | bits2 ^= inv2; | ||||
| @@ -1079,16 +1072,16 @@ void API_NS(point_dual_scalarmul) ( | |||||
| } | } | ||||
| /* Fetch another block of bits */ | /* Fetch another block of bits */ | ||||
| decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), | |||||
| bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); | |||||
| word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS), | |||||
| bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS); | |||||
| if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) { | ||||
| bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | ||||
| bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS)); | ||||
| } | } | ||||
| bits1 &= WINDOW_MASK; | bits1 &= WINDOW_MASK; | ||||
| bits2 &= WINDOW_MASK; | bits2 &= WINDOW_MASK; | ||||
| decaf_word_t inv1 = (bits1>>(WINDOW-1))-1; | |||||
| decaf_word_t inv2 = (bits2>>(WINDOW-1))-1; | |||||
| mask_t inv1 = (bits1>>(WINDOW-1))-1; | |||||
| mask_t inv2 = (bits2>>(WINDOW-1))-1; | |||||
| bits1 ^= inv1; | bits1 ^= inv1; | ||||
| bits2 ^= inv2; | bits2 ^= inv2; | ||||
| @@ -66,7 +66,7 @@ void gf_strong_reduce (gf inout); | |||||
| void gf_add (gf out, const gf a, const gf b); | void gf_add (gf out, const gf a, const gf b); | ||||
| void gf_sub (gf out, const gf a, const gf b); | void gf_sub (gf out, const gf a, const gf b); | ||||
| void gf_mul (gf_s *__restrict__ out, const gf a, const gf b); | void gf_mul (gf_s *__restrict__ out, const gf a, const gf b); | ||||
| void gf_mulw (gf_s *__restrict__ out, const gf a, uint64_t b); | |||||
| void gf_mulw (gf_s *__restrict__ out, const gf a, uint32_t b); | |||||
| void gf_sqr (gf_s *__restrict__ out, const gf a); | void gf_sqr (gf_s *__restrict__ out, const gf a); | ||||
| void gf_serialize (uint8_t *serial, const gf x); | void gf_serialize (uint8_t *serial, const gf x); | ||||
| void gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0 */ | void gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0 */ | ||||
| @@ -51,7 +51,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
| c[1] += accum; | c[1] += accum; | ||||
| } | } | ||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
| const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1); | const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1); | ||||
| uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi; | uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi; | ||||
| uint32_t *c = cs->limb; | uint32_t *c = cs->limb; | ||||
| @@ -34,7 +34,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
| c[1] += accum; | c[1] += accum; | ||||
| } | } | ||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
| const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | ||||
| int i; | int i; | ||||
| @@ -4,6 +4,7 @@ | |||||
| #include "f_field.h" | #include "f_field.h" | ||||
| /** Requires: input limbs < 9*2^51 */ | |||||
| void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | ||||
| const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | ||||
| uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
| @@ -65,6 +66,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
| ai = a[4]; | ai = a[4]; | ||||
| mac_rm(&accum1, ai, &b[0]); | mac_rm(&accum1, ai, &b[0]); | ||||
| /* Here accum1 < 5*(9*2^51)^2 */ | |||||
| c[3] = accum0 & mask; | c[3] = accum0 & mask; | ||||
| accum1 += shrld(accum0, 51); | accum1 += shrld(accum0, 51); | ||||
| @@ -72,13 +74,16 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
| /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | ||||
| * = 2^(-13 + <13) | * = 2^(-13 + <13) | ||||
| * PERF: good enough to fit into uint64_t? | |||||
| * PERF: good enough to fit into uint64_t. | |||||
| */ | */ | ||||
| uint64_t a1 = shrld(accum1,51); | uint64_t a1 = shrld(accum1,51); | ||||
| accum1 = (__uint128_t)a1 * 19 + c0; | |||||
| /* Here a1 < (5*(9*2^51)^2 + small) >> 51 = 405 * 2^51 + small | |||||
| * a1 * 19 + c0 < (405*19+1)*2^51 + small < 2^13 * 2^51. | |||||
| */ | |||||
| accum1 = a1 * 19 + c0; | |||||
| c[0] = accum1 & mask; | c[0] = accum1 & mask; | ||||
| c[1] = c1 + shrld(accum1,51); | |||||
| c[1] = c1 + (accum1>>51); | |||||
| } | } | ||||
| void gf_sqr (gf_s *__restrict__ cs, const gf as) { | void gf_sqr (gf_s *__restrict__ cs, const gf as) { | ||||
| @@ -132,16 +137,15 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { | |||||
| /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 | ||||
| * = 2^(-13 + <13) | * = 2^(-13 + <13) | ||||
| * PERF: good enough to fit into uint64_t? | |||||
| */ | */ | ||||
| uint64_t a1 = shrld(accum1,51); | uint64_t a1 = shrld(accum1,51); | ||||
| accum1 = (__uint128_t)a1 * 19 + c0; | |||||
| accum1 = a1 * 19 + c0; | |||||
| c[0] = accum1 & mask; | c[0] = accum1 & mask; | ||||
| c[1] = c1 + shrld(accum1,51); | |||||
| c[1] = c1 + (accum1>>51); | |||||
| } | } | ||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
| const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | ||||
| uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
| @@ -164,9 +168,9 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
| mac_rm(&accum, b, &a[4]); | mac_rm(&accum, b, &a[4]); | ||||
| c[4] = accum & mask; | c[4] = accum & mask; | ||||
| accum = shrld(accum,51); | |||||
| accum = accum * 19 + c0; | |||||
| uint64_t a1 = shrld(accum,51); | |||||
| a1 = a1*19+c0; | |||||
| c[0] = accum & mask; | |||||
| c[1] = c1 + shrld(accum,51); | |||||
| c[0] = a1 & mask; | |||||
| c[1] = c1 + (a1>>51); | |||||
| } | } | ||||
| @@ -60,8 +60,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
| c[1] += ((uint32_t)(accum1)); | c[1] += ((uint32_t)(accum1)); | ||||
| } | } | ||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
| const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); | |||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
| assert(b<1<<28); | |||||
| const uint32_t *a = as->limb; | const uint32_t *a = as->limb; | ||||
| uint32_t *c = cs->limb; | uint32_t *c = cs->limb; | ||||
| @@ -71,20 +71,15 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
| int i; | int i; | ||||
| accum0 = widemul(blo, a[0]); | |||||
| accum8 = widemul(blo, a[8]); | |||||
| accum0 += widemul(bhi, a[15]); | |||||
| accum8 += widemul(bhi, a[15] + a[7]); | |||||
| accum0 = widemul(b, a[0]); | |||||
| accum8 = widemul(b, a[8]); | |||||
| c[0] = accum0 & mask; accum0 >>= 28; | c[0] = accum0 & mask; accum0 >>= 28; | ||||
| c[8] = accum8 & mask; accum8 >>= 28; | c[8] = accum8 & mask; accum8 >>= 28; | ||||
| for (i=1; i<8; i++) { | for (i=1; i<8; i++) { | ||||
| accum0 += widemul(blo, a[i]); | |||||
| accum8 += widemul(blo, a[i+8]); | |||||
| accum0 += widemul(bhi, a[i-1]); | |||||
| accum8 += widemul(bhi, a[i+7]); | |||||
| accum0 += widemul(b, a[i]); | |||||
| accum8 += widemul(b, a[i+8]); | |||||
| c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
| c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
| @@ -43,8 +43,8 @@ void gf_bias (gf a, int amt) { | |||||
| } | } | ||||
| void gf_weak_reduce (gf a) { | void gf_weak_reduce (gf a) { | ||||
| uint64_t mask = (1ull<<28) - 1; | |||||
| uint64_t tmp = a->limb[15] >> 28; | |||||
| uint32_t mask = (1ull<<28) - 1; | |||||
| uint32_t tmp = a->limb[15] >> 28; | |||||
| a->limb[8] += tmp; | a->limb[8] += tmp; | ||||
| for (unsigned int i=15; i>0; i--) { | for (unsigned int i=15; i>0; i--) { | ||||
| a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); | a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); | ||||
| @@ -724,10 +724,10 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { | |||||
| void gf_mulw ( | void gf_mulw ( | ||||
| gf_s *__restrict__ cs, | gf_s *__restrict__ cs, | ||||
| const gf as, | const gf as, | ||||
| uint64_t b | |||||
| uint32_t b | |||||
| ) { | ) { | ||||
| uint32_t mask = (1ull<<28)-1; | uint32_t mask = (1ull<<28)-1; | ||||
| const uint32_t bhi = b>>28, blo = b & mask; | |||||
| assert(b <= mask); | |||||
| const uint32_t *a = as->limb; | const uint32_t *a = as->limb; | ||||
| uint32_t *c = cs->limb; | uint32_t *c = cs->limb; | ||||
| @@ -737,11 +737,9 @@ void gf_mulw ( | |||||
| int i; | int i; | ||||
| uint32_t c0, c8, n0, n8; | uint32_t c0, c8, n0, n8; | ||||
| accum0 = widemul(bhi, a[15]); | |||||
| accum8 = widemul(bhi, a[15] + a[7]); | |||||
| c0 = a[0]; c8 = a[8]; | c0 = a[0]; c8 = a[8]; | ||||
| smlal(&accum0, blo, c0); | |||||
| smlal(&accum8, blo, c8); | |||||
| accum0 = widemul(b, c0); | |||||
| accum8 = widemul(b, c8); | |||||
| c[0] = accum0 & mask; accum0 >>= 28; | c[0] = accum0 & mask; accum0 >>= 28; | ||||
| c[8] = accum8 & mask; accum8 >>= 28; | c[8] = accum8 & mask; accum8 >>= 28; | ||||
| @@ -749,10 +747,8 @@ void gf_mulw ( | |||||
| i=1; | i=1; | ||||
| { | { | ||||
| n0 = a[i]; n8 = a[i+8]; | n0 = a[i]; n8 = a[i+8]; | ||||
| smlal(&accum0, bhi, c0); | |||||
| smlal(&accum8, bhi, c8); | |||||
| smlal(&accum0, blo, n0); | |||||
| smlal(&accum8, blo, n8); | |||||
| smlal(&accum0, b, n0); | |||||
| smlal(&accum8, b, n8); | |||||
| c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
| c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
| @@ -760,10 +756,8 @@ void gf_mulw ( | |||||
| } | } | ||||
| { | { | ||||
| c0 = a[i]; c8 = a[i+8]; | c0 = a[i]; c8 = a[i+8]; | ||||
| smlal(&accum0, bhi, n0); | |||||
| smlal(&accum8, bhi, n8); | |||||
| smlal(&accum0, blo, c0); | |||||
| smlal(&accum8, blo, c8); | |||||
| smlal(&accum0, b, c0); | |||||
| smlal(&accum8, b, c8); | |||||
| c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
| c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
| @@ -771,10 +765,8 @@ void gf_mulw ( | |||||
| } | } | ||||
| { | { | ||||
| n0 = a[i]; n8 = a[i+8]; | n0 = a[i]; n8 = a[i+8]; | ||||
| smlal(&accum0, bhi, c0); | |||||
| smlal(&accum8, bhi, c8); | |||||
| smlal(&accum0, blo, n0); | |||||
| smlal(&accum8, blo, n8); | |||||
| smlal(&accum0, b, n0); | |||||
| smlal(&accum8, b, n8); | |||||
| c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
| c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
| @@ -782,10 +774,8 @@ void gf_mulw ( | |||||
| } | } | ||||
| { | { | ||||
| c0 = a[i]; c8 = a[i+8]; | c0 = a[i]; c8 = a[i+8]; | ||||
| smlal(&accum0, bhi, n0); | |||||
| smlal(&accum8, bhi, n8); | |||||
| smlal(&accum0, blo, c0); | |||||
| smlal(&accum8, blo, c8); | |||||
| smlal(&accum0, b, c0); | |||||
| smlal(&accum8, b, c8); | |||||
| c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
| c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
| @@ -793,10 +783,8 @@ void gf_mulw ( | |||||
| } | } | ||||
| { | { | ||||
| n0 = a[i]; n8 = a[i+8]; | n0 = a[i]; n8 = a[i+8]; | ||||
| smlal(&accum0, bhi, c0); | |||||
| smlal(&accum8, bhi, c8); | |||||
| smlal(&accum0, blo, n0); | |||||
| smlal(&accum8, blo, n8); | |||||
| smlal(&accum0, b, n0); | |||||
| smlal(&accum8, b, n8); | |||||
| c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
| c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
| @@ -804,10 +792,8 @@ void gf_mulw ( | |||||
| } | } | ||||
| { | { | ||||
| c0 = a[i]; c8 = a[i+8]; | c0 = a[i]; c8 = a[i+8]; | ||||
| smlal(&accum0, bhi, n0); | |||||
| smlal(&accum8, bhi, n8); | |||||
| smlal(&accum0, blo, c0); | |||||
| smlal(&accum8, blo, c8); | |||||
| smlal(&accum0, b, c0); | |||||
| smlal(&accum8, b, c8); | |||||
| c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
| c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
| @@ -815,10 +801,8 @@ void gf_mulw ( | |||||
| } | } | ||||
| { | { | ||||
| n0 = a[i]; n8 = a[i+8]; | n0 = a[i]; n8 = a[i+8]; | ||||
| smlal(&accum0, bhi, c0); | |||||
| smlal(&accum8, bhi, c8); | |||||
| smlal(&accum0, blo, n0); | |||||
| smlal(&accum8, blo, n8); | |||||
| smlal(&accum0, b, n0); | |||||
| smlal(&accum8, b, n8); | |||||
| c[i] = accum0 & mask; accum0 >>= 28; | c[i] = accum0 & mask; accum0 >>= 28; | ||||
| c[i+8] = accum8 & mask; accum8 >>= 28; | c[i+8] = accum8 & mask; accum8 >>= 28; | ||||
| @@ -549,20 +549,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf bs) { | |||||
| ); | ); | ||||
| } | } | ||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
| uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; | uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; | ||||
| assert(b<(1<<28)); | |||||
| uint64x2_t accum; | uint64x2_t accum; | ||||
| const uint32x2_t *va = (const uint32x2_t *) as->limb; | const uint32x2_t *va = (const uint32x2_t *) as->limb; | ||||
| uint32x2_t *vo = (uint32x2_t *) cs->limb; | uint32x2_t *vo = (uint32x2_t *) cs->limb; | ||||
| uint32x2_t vc, vn; | uint32x2_t vc, vn; | ||||
| uint32x2_t vb = {b & ((1<<28)-1), b>>28}; | |||||
| accum = vmull_lane_u32(va[7], vb, 1); | |||||
| accum = xx_vaddup_u64(vrev128_u64(accum)); | |||||
| uint32x2_t vb = {b, 0}; | |||||
| vc = va[0]; | vc = va[0]; | ||||
| accum = vmlal_lane_u32(accum, vc, vb, 0); | |||||
| accum = vmull_lane_u32(accum, vc, vb, 0); | |||||
| vo[0] = vmovn_u64(accum) & vmask; | vo[0] = vmovn_u64(accum) & vmask; | ||||
| accum = vshrq_n_u64(accum,28); | accum = vshrq_n_u64(accum,28); | ||||
| @@ -579,7 +577,6 @@ void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
| int i; | int i; | ||||
| for (i=1; i<8; i++) { | for (i=1; i<8; i++) { | ||||
| vn = va[i]; | vn = va[i]; | ||||
| accum = vmlal_lane_u32(accum, vc, vb, 1); | |||||
| accum = vmlal_lane_u32(accum, vn, vb, 0); | accum = vmlal_lane_u32(accum, vn, vb, 0); | ||||
| vo[i] = vmovn_u64(accum) & vmask; | vo[i] = vmovn_u64(accum) & vmask; | ||||
| accum = vshrq_n_u64(accum,28); | accum = vshrq_n_u64(accum,28); | ||||
| @@ -165,7 +165,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
| c[1] += ((uint64_t)(accum1)); | c[1] += ((uint64_t)(accum1)); | ||||
| } | } | ||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
| const uint64_t *a = as->limb; | const uint64_t *a = as->limb; | ||||
| uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
| @@ -139,7 +139,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { | |||||
| c[0] += ((uint64_t)(accum1)); | c[0] += ((uint64_t)(accum1)); | ||||
| } | } | ||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { | |||||
| void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { | |||||
| const uint64_t *a = as->limb; | const uint64_t *a = as->limb; | ||||
| uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
| @@ -1,6 +1,6 @@ | |||||
| /** | /** | ||||
| * @cond internal | * @cond internal | ||||
| * @file decaf_crypto.c | |||||
| * @file per_field.c | |||||
| * @copyright | * @copyright | ||||
| * Copyright (c) 2015-2016 Cryptography Research, Inc. \n | * Copyright (c) 2015-2016 Cryptography Research, Inc. \n | ||||
| * Released under the MIT License. See LICENSE.txt for license information. | * Released under the MIT License. See LICENSE.txt for license information. | ||||