diff --git a/src/goldilocks.c b/src/goldilocks.c
index 11ccdfb..c3a4ca3 100644
--- a/src/goldilocks.c
+++ b/src/goldilocks.c
@@ -34,6 +34,27 @@
 
 #define GOLDI_DIVERSIFY_BYTES 8
 
+
+#if FIELD_BYTES <= SHA512_OUTPUT_BYTES
+#define FIELD_HASH_BYTES SHA512_OUTPUT_BYTES
+#define field_hash_final sha512_final
+#else
+#define FIELD_HASH_BYTES (SHA512_OUTPUT_BYTES * ((FIELD_BYTES-1)/SHA512_OUTPUT_BYTES + 1))
+static inline void field_hash_final (
+    struct sha512_ctx_t *ctx,
+    unsigned char out[FIELD_HASH_BYTES]
+) {
+    /* Iterated SHA-512, used as a PRG to stretch the digest; SHAKE would be a better fit. */
+    int i;
+    for (i=0; i<= (FIELD_BYTES-1) / SHA512_OUTPUT_BYTES; i++) {
+        if (i)
+            sha512_update(ctx, &out[(i-1)*SHA512_OUTPUT_BYTES], SHA512_OUTPUT_BYTES);
+        sha512_final(ctx, &out[i*SHA512_OUTPUT_BYTES]);
+    }
+}
+#endif
+
+
 /* These are just unique identifiers */
 static const char *G_INITING = "initializing";
 static const char *G_INITED = "initialized";
@@ -135,7 +156,7 @@ goldilocks_derive_private_key (
 
     memcpy(&privkey->opaque[2*GOLDI_FIELD_BYTES], proto, GOLDI_SYMKEY_BYTES);
 
-    unsigned char skb[SHA512_OUTPUT_BYTES];
+    unsigned char skb[FIELD_HASH_BYTES];
     word_t sk[GOLDI_FIELD_WORDS];
     assert(sizeof(skb) >= sizeof(sk));
 
@@ -146,9 +167,9 @@ goldilocks_derive_private_key (
     sha512_init(&ctx);
     sha512_update(&ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES);
     sha512_update(&ctx, proto, GOLDI_SYMKEY_BYTES);
-    sha512_final(&ctx, (unsigned char *)skb);
+    field_hash_final(&ctx, (unsigned char *)skb);
 
-    barrett_deserialize_and_reduce(sk, skb, SHA512_OUTPUT_BYTES, &curve_prime_order);
+    barrett_deserialize_and_reduce(sk, skb, sizeof(skb), &curve_prime_order);
     barrett_serialize(privkey->opaque, sk, GOLDI_FIELD_BYTES);
 
     scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base);
@@ -316,13 +337,13 @@ goldilocks_derive_challenge(
     uint64_t message_len
 ) {
     /* challenge = H(pk, [nonceG], message). */
-    unsigned char sha_out[SHA512_OUTPUT_BYTES];
+    unsigned char sha_out[FIELD_HASH_BYTES];
     struct sha512_ctx_t ctx;
     sha512_init(&ctx);
     sha512_update(&ctx, pubkey, GOLDI_FIELD_BYTES);
     sha512_update(&ctx, gnonce, GOLDI_FIELD_BYTES);
     sha512_update(&ctx, message, message_len);
-    sha512_final(&ctx, sha_out);
+    field_hash_final(&ctx, sha_out);
     barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &curve_prime_order);
 }
 
@@ -346,7 +367,7 @@ goldilocks_sign (
     }
 
     /* Derive a nonce. TODO: use HMAC. FUTURE: factor. */
-    unsigned char sha_out[SHA512_OUTPUT_BYTES];
+    unsigned char sha_out[FIELD_HASH_BYTES];
     word_t tk[GOLDI_FIELD_WORDS];
     struct sha512_ctx_t ctx;
     sha512_init(&ctx);
@@ -354,8 +375,8 @@ goldilocks_sign (
     sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES);
     sha512_update(&ctx, message, message_len);
     sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES);
-    sha512_final(&ctx, sha_out);
-    barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &curve_prime_order);
+    field_hash_final(&ctx, sha_out);
+    barrett_deserialize_and_reduce(tk, sha_out, sizeof(sha_out), &curve_prime_order);
 
     /* 4[nonce]G */
     uint8_t signature_tmp[GOLDI_FIELD_BYTES];
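The stretching branch only kicks in when FIELD_BYTES > SHA512_OUTPUT_BYTES (p521's 66-byte field). Here is a standalone sketch of the same construction against OpenSSL's SHA-512, for illustration only; it assumes, as the loop's reuse of ctx after sha512_final suggests, that finalization resets the context, so block i is simply the hash of block i-1:

    #include <openssl/sha.h>
    #include <stddef.h>

    /* Stretch a seed to `blocks` 64-byte blocks:
     *   out[0] = SHA512(seed), out[i] = SHA512(out[i-1]).
     * Illustrative stand-in for field_hash_final; not the library's API. */
    static void hash_chain(const unsigned char *seed, size_t seed_len,
                           unsigned char *out, size_t blocks) {
        SHA512(seed, seed_len, out);
        for (size_t i = 1; i < blocks; i++) {
            SHA512(out + (i-1)*SHA512_DIGEST_LENGTH, SHA512_DIGEST_LENGTH,
                   out + i*SHA512_DIGEST_LENGTH);
        }
    }

For p521 this yields 128 bytes, which barrett_deserialize_and_reduce then reduces modulo the group order, keeping the sampling bias negligible.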
diff --git a/src/include/constant_time.h b/src/include/constant_time.h
index 686a508..405c2f5 100644
--- a/src/include/constant_time.h
+++ b/src/include/constant_time.h
@@ -127,7 +127,9 @@ constant_time_cond_swap (
 /**
  * @brief Constant-time equivalent of memcpy(out, table + elem_bytes*idx, elem_bytes);
  *
- * The table must be at least as aligned as elem_bytes. The output must be vector aligned.
+ * The table must be at least as aligned as elem_bytes. The output must be word aligned;
+ * if elem_bytes is a multiple of the vector size, the output must be vector aligned too.
+ *
  * The table and output must not alias.
  */
 static __inline__ void
@@ -151,8 +153,9 @@ constant_time_lookup (
         big_register_t br_mask = br_is_zero(big_i);
         for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
             if (elem_bytes % sizeof(big_register_t)) {
-                /* input unaligned, output aligned */
-                *(big_register_t *)(out+k) |= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned;
+                /* unaligned */
+                ((unaligned_br_t *)(out+k))->unaligned
+                    |= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned;
             } else {
                 /* aligned */
                 *(big_register_t *)(out+k) |= br_mask & *(const big_register_t*)(&table[k+j*elem_bytes]);
diff --git a/src/include/word.h b/src/include/word.h
index 644edc7..31af42c 100644
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -102,6 +102,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
 #endif
 
 #if __AVX2__
+    #define VECTOR_ALIGNED __attribute__((aligned(32)))
     typedef uint32x8_t big_register_t;
     typedef uint64x4_t uint64xn_t;
     typedef uint32x8_t uint32xn_t;
@@ -113,6 +114,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
         return ret;
     }
 #elif __SSE2__
+    #define VECTOR_ALIGNED __attribute__((aligned(16)))
     typedef uint32x4_t big_register_t;
     typedef uint64x2_t uint64xn_t;
     typedef uint32x4_t uint32xn_t;
@@ -124,6 +126,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
         return ret;
     }
 #elif __ARM_NEON__
+    #define VECTOR_ALIGNED __attribute__((aligned(16)))
     typedef uint32x4_t big_register_t;
     typedef uint64x2_t uint64xn_t;
     typedef uint32x4_t uint32xn_t;
@@ -132,6 +135,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
         return vdupq_n_u32(x);
     }
 #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
+    #define VECTOR_ALIGNED __attribute__((aligned(8)))
     typedef uint64_t big_register_t, uint64xn_t;
 
     typedef uint32_t uint32xn_t;
@@ -140,6 +144,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
         return (big_register_t)x;
     }
 #else
+    #define VECTOR_ALIGNED __attribute__((aligned(4)))
     typedef uint64_t uint64xn_t;
     typedef uint32_t uint32xn_t;
     typedef uint32_t big_register_t;
diff --git a/src/p521/arch_ref64/arch_config.h b/src/p521/arch_ref64/arch_config.h
new file mode 100644
index 0000000..58758cc
--- /dev/null
+++ b/src/p521/arch_ref64/arch_config.h
@@ -0,0 +1 @@
+#define WORD_BITS 64
diff --git a/src/p521/arch_ref64/p521.c b/src/p521/arch_ref64/p521.c
new file mode 100644
index 0000000..8238699
--- /dev/null
+++ b/src/p521/arch_ref64/p521.c
@@ -0,0 +1,417 @@
+/* Copyright (c) 2014 Cryptography Research, Inc.
+ * Released under the MIT License. See LICENSE.txt for license information.
+ */
+
+#include "p521.h"
+
+static __inline__ __uint128_t widemul(
+    const uint64_t a,
+    const uint64_t b
+) {
+    return ((__uint128_t)a) * ((__uint128_t)b);
+}
+
+static __inline__ uint64_t is_zero(uint64_t a) {
+    /* let's hope the compiler isn't clever enough to optimize this.
*/ + return (((__uint128_t)a)-1)>>64; +} + +void +p521_mul ( + p521_t *__restrict__ cs, + const p521_t *as, + const p521_t *bs +) { + uint64_t *c = cs->limb; + const uint64_t *a = as->limb, *b = bs->limb; + __uint128_t accum0, accum1; + + accum0 = widemul(2*a[8], b[8]); + accum1 = widemul(a[0], b[7]); + accum0 += widemul(a[1], b[6]); + accum1 += widemul(a[2], b[5]); + accum0 += widemul(a[3], b[4]); + accum1 += widemul(a[4], b[3]); + accum0 += widemul(a[5], b[2]); + accum1 += widemul(a[6], b[1]); + accum0 += widemul(a[7], b[0]); + accum1 += accum0; + c[7] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum1 += widemul(a[0], b[8-0]); + accum0 += widemul(a[1], b[8-1]); + accum1 += widemul(a[2], b[8-2]); + accum0 += widemul(a[3], b[8-3]); + accum1 += widemul(a[4], b[8-4]); + accum0 += widemul(a[5], b[8-5]); + accum1 += widemul(a[6], b[8-6]); + accum0 += widemul(a[7], b[8-7]); + accum1 += widemul(a[8], b[8-8]); + accum1 += accum0; + c[8] = accum1 & ((1ull<<57)-1); + accum1 >>= 57; + + accum0 = 0; + accum0 += widemul(a[1], b[0+9-1]); + accum0 += widemul(a[2], b[0+9-2]); + accum0 += widemul(a[3], b[0+9-3]); + accum0 += widemul(a[4], b[0+9-4]); + accum1 += widemul(a[0], b[0-0]); + accum0 += widemul(a[5], b[0+9-5]); + accum0 += widemul(a[6], b[0+9-6]); + accum0 += widemul(a[7], b[0+9-7]); + accum0 += widemul(a[8], b[0+9-8]); + accum1 += accum0 << 1; + c[0] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum0 += widemul(a[2], b[1+9-2]); + accum0 += widemul(a[3], b[1+9-3]); + accum1 += widemul(a[0], b[1-0]); + accum0 += widemul(a[4], b[1+9-4]); + accum0 += widemul(a[5], b[1+9-5]); + accum1 += widemul(a[1], b[1-1]); + accum0 += widemul(a[6], b[1+9-6]); + accum0 += widemul(a[7], b[1+9-7]); + accum0 += widemul(a[8], b[1+9-8]); + accum1 += accum0 << 1; + c[1] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum0 += widemul(a[3], b[2+9-3]); + accum1 += widemul(a[0], b[2-0]); + accum0 += widemul(a[4], b[2+9-4]); + accum0 += widemul(a[5], b[2+9-5]); + accum1 += widemul(a[1], b[2-1]); + accum0 += widemul(a[6], b[2+9-6]); + accum0 += widemul(a[7], b[2+9-7]); + accum1 += widemul(a[2], b[2-2]); + accum0 += widemul(a[8], b[2+9-8]); + accum1 += accum0 << 1; + c[2] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum0 += widemul(a[4], b[3+9-4]); + accum1 += widemul(a[0], b[3-0]); + accum0 += widemul(a[5], b[3+9-5]); + accum1 += widemul(a[1], b[3-1]); + accum0 += widemul(a[6], b[3+9-6]); + accum1 += widemul(a[2], b[3-2]); + accum0 += widemul(a[7], b[3+9-7]); + accum1 += widemul(a[3], b[3-3]); + accum0 += widemul(a[8], b[3+9-8]); + accum1 += accum0 << 1; + c[3] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum1 += widemul(a[0], b[4-0]); + accum0 += widemul(a[5], b[4+9-5]); + accum1 += widemul(a[1], b[4-1]); + accum0 += widemul(a[6], b[4+9-6]); + accum1 += widemul(a[2], b[4-2]); + accum0 += widemul(a[7], b[4+9-7]); + accum1 += widemul(a[3], b[4-3]); + accum0 += widemul(a[8], b[4+9-8]); + accum1 += widemul(a[4], b[4-4]); + accum1 += accum0 << 1; + c[4] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum1 += widemul(a[0], b[5-0]); + accum0 += widemul(a[6], b[5+9-6]); + accum1 += widemul(a[1], b[5-1]); + accum1 += widemul(a[2], b[5-2]); + accum0 += widemul(a[7], b[5+9-7]); + accum1 += widemul(a[3], b[5-3]); + accum1 += widemul(a[4], b[5-4]); + accum0 += widemul(a[8], b[5+9-8]); + accum1 += widemul(a[5], b[5-5]); + accum1 += accum0 << 1; + c[5] = accum1 & ((1ull<<58)-1); + accum1 >>= 58; + + accum0 = 0; + accum1 += 
widemul(a[0], b[6-0]);
+    accum1 += widemul(a[1], b[6-1]);
+    accum0 += widemul(a[7], b[6+9-7]);
+    accum1 += widemul(a[2], b[6-2]);
+    accum1 += widemul(a[3], b[6-3]);
+    accum1 += widemul(a[4], b[6-4]);
+    accum0 += widemul(a[8], b[6+9-8]);
+    accum1 += widemul(a[5], b[6-5]);
+    accum1 += widemul(a[6], b[6-6]);
+    accum1 += accum0 << 1;
+    c[6] = accum1 & ((1ull<<58)-1);
+    accum1 >>= 58;
+
+    accum1 += c[7];
+    c[7] = accum1 & ((1ull<<58)-1);
+
+    c[8] += accum1 >> 58;
+}
+
+void
+p521_mulw (
+    p521_t *__restrict__ cs,
+    const p521_t *as,
+    uint64_t b
+) {
+    const uint64_t *a = as->limb;
+    uint64_t *c = cs->limb;
+
+    __uint128_t accum0 = 0, accum3 = 0, accum6 = 0;
+    uint64_t mask = (1ull<<58) - 1;
+
+    int i;
+    for (i=0; i<3; i++) {
+        accum0 += widemul(b, a[i]);
+        accum3 += widemul(b, a[i+3]);
+        accum6 += widemul(b, a[i+6]);
+        c[i]   = accum0 & mask; accum0 >>= 58;
+        c[i+3] = accum3 & mask; accum3 >>= 58;
+        if (i==2) {
+            c[i+6] = accum6 & (mask>>1); accum6 >>= 57;
+        } else {
+            c[i+6] = accum6 & mask; accum6 >>= 58;
+        }
+    }
+
+    accum0 += c[3];
+    c[3] = accum0 & mask;
+    c[4] += accum0 >> 58;
+
+    accum3 += c[6];
+    c[6] = accum3 & mask;
+    c[7] += accum3 >> 58;
+
+    accum6 += c[0];
+    c[0] = accum6 & mask;
+    c[1] += accum6 >> 58;
+}
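+
+/* Note on the reduction in p521_mul above and p521_sqr below: limbs are
+ * radix 2^58 (the top limb is 57 bits), so 9 limbs cover 8*58 + 57 = 521
+ * bits exactly.  A partial product a[i]*b[j] has weight 2^(58*(i+j)); for
+ * i+j >= 9 this is 2 * 2^(58*(i+j-9)) mod p, since 2^521 == 1 (mod p) and
+ * 58*(i+j) - 521 = 58*(i+j-9) + 1.  That wrap-around factor of 2 is why
+ * the folded terms are accumulated separately and added in via
+ * "accum0 << 1", and why a[8]*b[8] is computed as widemul(2*a[8], b[8]). */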
+void
+p521_sqr (
+    p521_t *__restrict__ cs,
+    const p521_t *as
+) {
+    uint64_t *c = cs->limb;
+    const uint64_t *a = as->limb;
+    __uint128_t accum0, accum1;
+
+    accum0 = widemul(a[8], a[8]);
+    accum1 = widemul(a[0], a[7]);
+    accum0 += widemul(a[1], a[6]);
+    accum1 += widemul(a[2], a[5]);
+    accum0 += widemul(a[3], a[4]);
+    accum1 += accum0;
+    c[7] = 2 * (accum1 & ((1ull<<57)-1));
+    accum1 >>= 57;
+
+    accum0 = 0;
+    accum1 += widemul(a[4], a[4]);
+    accum0 += widemul(a[1], a[7]);
+    accum1 += widemul(2*a[2], a[6]);
+    accum0 += widemul(a[3], a[5]);
+    accum1 += widemul(2*a[0], a[8]);
+    accum1 += 2*accum0;
+    c[8] = accum1 & ((1ull<<57)-1);
+    accum1 >>= 57;
+
+    accum0 = 0;
+    accum1 += widemul(a[0], a[0]);
+    accum0 += widemul(a[1], a[8]);
+    accum0 += widemul(a[2], a[7]);
+    accum0 += widemul(a[3], a[6]);
+    accum0 += widemul(a[4], a[5]);
+    accum1 += accum0 << 2;
+    c[0] = accum1 & ((1ull<<58)-1);
+    accum1 >>= 58;
+
+    accum0 = 0;
+    accum0 += widemul(a[2], a[8]);
+    accum0 += widemul(a[3], a[7]);
+    accum0 += widemul(a[4], a[6]);
+    accum0 <<= 1;
+    accum0 += widemul(a[5], a[5]);
+    accum0 += widemul(a[0], a[1]);
+    accum1 += accum0 << 1;
+    c[1] = accum1 & ((1ull<<58)-1);
+    accum1 >>= 58;
+
+    accum0 = 0;
+    accum1 += widemul(a[1], a[1]);
+
+    accum0 += widemul(a[3], a[8]);
+    accum0 += widemul(a[4], a[7]);
+    accum0 += widemul(a[5], a[6]);
+    accum0 <<= 1;
+    accum0 += widemul(a[0], a[2]);
+    accum1 += accum0 << 1;
+    c[2] = accum1 & ((1ull<<58)-1);
+    accum1 >>= 58;
+
+    accum0 = 0;
+    accum0 += widemul(a[6], a[6]);
+    accum0 += widemul(2*a[5], a[7]);
+    accum0 += widemul(2*a[4], a[8]);
+    accum0 += widemul(a[0], a[3]);
+    accum0 += widemul(a[1], a[2]);
+    accum1 += accum0 << 1;
+    c[3] = accum1 & ((1ull<<58)-1);
+    accum1 >>= 58;
+
+    accum0 = 0;
+    accum0 += widemul(a[6], a[7]);
+    accum0 += widemul(a[5], a[8]);
+    accum0 <<= 1;
+    accum1 += widemul(a[2], a[2]);
+    accum0 += widemul(a[0], a[4]);
+    accum0 += widemul(a[1], a[3]);
+    accum1 += accum0 << 1;
+    c[4] = accum1 & ((1ull<<58)-1);
+    accum1 >>= 58;
+
+    accum0 = 0;
+    accum0 += widemul(2*a[6], a[8]);
+    accum0 += widemul(a[7], a[7]);
+    accum0 += widemul(a[0], a[5]);
+    accum0 += widemul(a[1], a[4]);
+    accum0 += widemul(a[2], a[3]);
+    accum1 += accum0 << 1;
+    c[5] = accum1 & ((1ull<<58)-1);
+    accum1 >>= 58;
+
+    accum0 = 0;
+    accum1 += widemul(a[3], a[3]);
+    accum0 += widemul(a[0], a[6]);
+    accum0 += widemul(a[1], a[5]);
+    accum0 += widemul(2*a[7], a[8]);
+    accum0 += widemul(a[2], a[4]);
+    accum1 += accum0 << 1;
+    c[6] = accum1 & ((1ull<<58)-1);
+    accum1 >>= 58;
+
+    accum1 += c[7];
+    c[7] = accum1 & ((1ull<<58)-1);
+
+    c[8] += accum1 >> 58;
+}
+
+void
+p521_strong_reduce (
+    p521_t *a
+) {
+    uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1;
+
+    /* first, clear high */
+    __int128_t scarry = a->limb[8]>>57;
+    a->limb[8] &= mask2;
+
+    /* now the total is less than 2p */
+
+    /* compute total_value - p.  No need to reduce mod p. */
+
+    int i;
+    for (i=0; i<9; i++) {
+        scarry = scarry + a->limb[i] - ((i==8) ? mask2 : mask);
+        a->limb[i] = scarry & ((i==8) ? mask2 : mask);
+        scarry >>= (i==8) ? 57 : 58;
+    }
+
+    /* uncommon case: it was >= p, so now scarry = 0 and this = x
+     * common case: it was < p, so now scarry = -1 and this = x - p + 2^521
+     * so let's add back in p.  will carry back off the top for 2^521.
+     */
+
+    assert(is_zero(scarry) | is_zero(scarry+1));
+
+    uint64_t scarry_mask = scarry & mask;
+    __uint128_t carry = 0;
+
+    /* add it back */
+    for (i=0; i<9; i++) {
+        carry = carry + a->limb[i] + ((i==8)?(scarry_mask>>1):scarry_mask);
+        a->limb[i] = carry & ((i==8) ? mask>>1 : mask);
+        carry >>= (i==8) ? 57 : 58;
+    }
+
+    assert(is_zero(carry + scarry));
+}
+
+mask_t
+p521_is_zero (
+    const struct p521_t *a
+) {
+    struct p521_t b;
+    p521_copy(&b,a);
+    p521_strong_reduce(&b);
+
+    uint64_t any = 0;
+    int i;
+    for (i=0; i<9; i++) {
+        any |= b.limb[i];
+    }
+    return is_zero(any);
+}
+
+void
+p521_serialize (
+    uint8_t *serial,
+    const struct p521_t *x
+) {
+    int i,k=0;
+    p521_t red;
+    p521_copy(&red, x);
+    p521_strong_reduce(&red);
+
+    uint64_t r=0;
+    int bits = 0;
+    for (i=0; i<9; i++) {
+        r |= red.limb[i] << bits;
+        for (bits += 58; bits >= 8; bits -= 8) {
+            serial[k++] = r;
+            r >>= 8;
+        }
+        assert(bits <= 6);
+    }
+    assert(bits);
+    serial[k++] = r;
+}
+
+mask_t
+p521_deserialize (
+    p521_t *x,
+    const uint8_t serial[66]
+) {
+    int i,k=0,bits=0;
+    __uint128_t out = 0;
+    uint64_t mask = (1ull<<58)-1;
+    for (i=0; i<9; i++) {
+        out >>= 58;
+        for (; bits<58; bits+=8) {
+            out |= ((__uint128_t)serial[k++])<<bits;
+        }
+        x->limb[i] = out & mask;
+        bits -= 58;
+    }
+
+    /* Check for reduction.  First, high has to be < 2^57 */
+    mask_t good = is_zero(out>>57);
+
+    uint64_t and = -1ull;
+    for (i=0; i<8; i++) {
+        and &= x->limb[i];
+    }
+    and &= (2*out+1);
+    good &= is_zero((and+1)>>58);
+
+    return good;
+}
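A cheap sanity check for the new serialization pair: serialize/deserialize should round-trip on reduced elements, and deserialize should accept exactly the canonical encodings (since p = 2^521 - 1, the only in-range non-canonical pattern is the all-ones encoding of p itself, which the `and` accumulation above rejects). A test sketch, with illustrative harness names, compiled against the arch_ref64 files:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>
    #include "p521.h"

    /* Round-trip property: strong-reduce, serialize, deserialize,
     * then require acceptance and bitwise equality. */
    static void p521_roundtrip_check(const p521_t *x) {
        uint8_t ser[66];
        p521_t red, back;
        p521_copy(&red, x);
        p521_strong_reduce(&red);
        p521_serialize(ser, &red);
        assert(p521_deserialize(&back, ser));  /* canonical => accepted */
        assert(memcmp(&red, &back, sizeof(red)) == 0);
    }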
+ */ +#ifndef __P521_H__ +#define __P521_H__ 1 + +#include +#include +#include + +#include "word.h" + +typedef struct p521_t { + uint64_t limb[9]; +} p521_t; + +#ifdef __cplusplus +extern "C" { +#endif + +static __inline__ void +p521_set_ui ( + p521_t *out, + uint64_t x +) __attribute__((unused)); + +static __inline__ void +p521_add ( + p521_t *out, + const p521_t *a, + const p521_t *b +) __attribute__((unused)); + +static __inline__ void +p521_sub ( + p521_t *out, + const p521_t *a, + const p521_t *b +) __attribute__((unused)); + +static __inline__ void +p521_neg ( + p521_t *out, + const p521_t *a +) __attribute__((unused)); + +static __inline__ void +p521_addw ( + p521_t *a, + uint64_t x +) __attribute__((unused)); + +static __inline__ void +p521_subw ( + p521_t *a, + uint64_t x +) __attribute__((unused)); + +static __inline__ void +p521_copy ( + p521_t *out, + const p521_t *a +) __attribute__((unused)); + +static __inline__ void +p521_weak_reduce ( + p521_t *inout +) __attribute__((unused)); + +void +p521_strong_reduce ( + p521_t *inout +); + +mask_t +p521_is_zero ( + const p521_t *in +); + +static __inline__ void +p521_bias ( + p521_t *inout, + int amount +) __attribute__((unused)); + +static __inline__ void +p521_really_bias ( + p521_t *inout, + int amount +) __attribute__((unused)); + +void +p521_mul ( + p521_t *__restrict__ out, + const p521_t *a, + const p521_t *b +); + +void +p521_mulw ( + p521_t *__restrict__ out, + const p521_t *a, + uint64_t b +); + +void +p521_sqr ( + p521_t *__restrict__ out, + const p521_t *a +); + +void +p521_serialize ( + uint8_t *serial, + const struct p521_t *x +); + +mask_t +p521_deserialize ( + p521_t *x, + const uint8_t serial[66] +); + +/* -------------- Inline functions begin here -------------- */ + +void +p521_set_ui ( + p521_t *out, + uint64_t x +) { + int i; + out->limb[0] = x; + for (i=1; i<9; i++) { + out->limb[i] = 0; + } +} + +void +p521_add ( + p521_t *out, + const p521_t *a, + const p521_t *b +) { + unsigned int i; + for (i=0; i<9; i++) { + out->limb[i] = a->limb[i] + b->limb[i]; + } + p521_weak_reduce(out); +} + +void +p521_sub ( + p521_t *out, + const p521_t *a, + const p521_t *b +) { + unsigned int i; + uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4; + for (i=0; i<9; i++) { + out->limb[i] = a->limb[i] - b->limb[i] + ((i==8) ? co2 : co1); + } + p521_weak_reduce(out); +} + +void +p521_neg ( + struct p521_t *out, + const p521_t *a +) { + unsigned int i; + uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4; + for (i=0; i<9; i++) { + out->limb[i] = ((i==8) ? co2 : co1) - a->limb[i]; + } + p521_weak_reduce(out); +} + +void +p521_addw ( + p521_t *a, + uint64_t x +) { + a->limb[0] += x; + a->limb[1] += a->limb[0]>>58; + a->limb[0] &= (1ull<<58)-1; +} + +void +p521_subw ( + p521_t *a, + uint64_t x +) { + a->limb[0] -= x; + p521_really_bias(a, 1); + p521_weak_reduce(a); +} + +void +p521_copy ( + p521_t *out, + const p521_t *a +) { + memcpy(out,a,sizeof(*a)); +} + +void +p521_really_bias ( + p521_t *a, + int amt +) { + uint64_t co1 = ((1ull<<58)-1)*2*amt, co2 = ((1ull<<57)-1)*2*amt; + int i; + for (i=0; i<9; i++) { + a->limb[i] += (i==8) ? co2 : co1; + } +} + +void +p521_bias ( + p521_t *a, + int amt +) { + (void) a; + (void) amt; +} + +void +p521_weak_reduce ( + p521_t *a +) { + uint64_t mask = (1ull<<58) - 1; + uint64_t tmp = a->limb[8] >> 57; + int i; + for (i=8; i>0; i--) { + a->limb[i] = (a->limb[i] & ((i==8) ? 
diff --git a/src/p521/arch_x86_64/arch_config.h b/src/p521/arch_x86_64/arch_config.h
new file mode 100644
index 0000000..58758cc
--- /dev/null
+++ b/src/p521/arch_x86_64/arch_config.h
@@ -0,0 +1 @@
+#define WORD_BITS 64
diff --git a/src/p521/arch_x86_64/p521.c b/src/p521/arch_x86_64/p521.c
new file mode 100644
index 0000000..55d82fc
--- /dev/null
+++ b/src/p521/arch_x86_64/p521.c
@@ -0,0 +1,439 @@
+/* Copyright (c) 2014 Cryptography Research, Inc.
+ * Released under the MIT License. See LICENSE.txt for license information.
+ */
+
+#include "p521.h"
+
+typedef uint64x4_t uint64x3_t; /* fit it in a vector register */
+static const uint64x3_t mask58 = { (1ull<<58) - 1, (1ull<<58) - 1, (1ull<<58) - 1, 0 };
+
+typedef struct {
+    uint64x3_t lo, hi;
+} hexad_t;
+
+/* Currently requires CLANG. Sorry. */
+static inline uint64x3_t timesW (uint64x3_t u) {
+    return u.zxyw + u.zwww;
+}
+
+/* Store three vectors. Currently requires AVX2 (TODO: remove) */
+static const uint64x4_t ls_mask_3 = { -1ull, -1ull, -1ull, 0 };
+static void store3 (uint64_t *x, uint64x3_t v) {
+    _mm256_maskstore_epi64((long long *) x, ls_mask_3, v);
+}
+
+static __inline__ uint64_t is_zero(uint64_t a) {
+    /* let's hope the compiler isn't clever enough to optimize this. */
+    return (((__uint128_t)a)-1)>>64;
+}
+
+static __inline__ __uint128_t widemul(
+    const uint64_t a,
+    const uint64_t b
+) {
+    return ((__uint128_t)a) * ((__uint128_t)b);
+}
+
+static inline __uint128_t widemulu(const uint64_t a, const uint64_t b) {
+    return ((__uint128_t)(a)) * b;
+}
+
+static inline __int128_t widemuls(const int64_t a, const int64_t b) {
+    return ((__int128_t)(a)) * b;
+}
+
+/* This is a trick to prevent terrible register allocation by hiding things from clang's optimizer */
+static inline uint64_t opacify(uint64_t x) {
+    __asm__ volatile("" : "+r"(x));
+    return x;
+}
+
+static inline void hexad_mul (
+    hexad_t *hex,
+    const uint64_t *a,
+    const uint64_t *b
+) {
+    __uint128_t xu, xv, xw;
+
+    uint64_t tmp = opacify(a[2]);
+    xw = widemulu(tmp, b[0]);
+    tmp <<= 1;
+    xu = widemulu(tmp, b[1]);
+    xv = widemulu(tmp, b[2]);
+
+    tmp = opacify(a[1]);
+    xw += widemulu(tmp, b[1]);
+    xv += widemulu(tmp, b[0]);
+    tmp <<= 1;
+    xu += widemulu(tmp, b[2]);
+
+    tmp = opacify(a[0]);
+    xu += widemulu(tmp, b[0]);
+    xv += widemulu(tmp, b[1]);
+    xw += widemulu(tmp, b[2]);
+
+    uint64x3_t
+        lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 },
+        hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 };
+
+    hi = hi<<6 | lo>>58;
+    lo &= mask58;
+
+    hex->lo = lo;
+    hex->hi = hi;
+}
+
+static inline void hexad_mul_signed (
+    hexad_t *hex,
+    const int64_t *a,
+    const int64_t *b
+) {
+    __int128_t xu, xv, xw;
+
+    int64_t tmp = opacify(a[2]);
+    xw = widemuls(tmp, b[0]);
+    tmp <<= 1;
+    xu = widemuls(tmp, b[1]);
+    xv = widemuls(tmp, b[2]);
+
+    tmp = opacify(a[1]);
+    xw += widemuls(tmp, b[1]);
+    xv += widemuls(tmp, b[0]);
+    tmp <<= 1;
+    xu += widemuls(tmp, b[2]);
+
+    tmp = opacify(a[0]);
+    xu += widemuls(tmp, b[0]);
+    xv += widemuls(tmp, b[1]);
+    xw += widemuls(tmp, b[2]);
+
+    uint64x3_t
+        lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 },
+        hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 };
+
+    hi = hi<<6 | lo>>58;
+    lo &= mask58;
+
+    hex->lo = lo;
+    hex->hi = hi;
+}
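+
+/* Note on the hexad_* helpers in this file: with the transposed layout
+ * (LIMBPERM in p521.h), each aligned group of three physical limbs holds
+ * one residue class mod 3 of the logical limbs, i.e. a quadratic
+ * A0 + A1*W + A2*W^2 in W = 2^174.  Because W^3 = 2^522 == 2 (mod p),
+ * multiplying two such quadratics wraps the W^3 and W^4 terms back down
+ * with a factor of 2 -- that is what the pre-doubled operands
+ * (tmp <<= 1, 2*a[2]) implement.  Each column is returned as
+ * lo + 2^58*hi. */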
+static inline void hexad_sqr (
+    hexad_t *hex,
+    const uint64_t *a
+) {
+    __uint128_t xu, xv, xw;
+
+    uint64_t tmp = a[2];
+    tmp <<= 1;
+    xw = widemulu(tmp, a[0]);
+    xv = widemulu(tmp, a[2]);
+    tmp <<= 1;
+    xu = widemulu(tmp, a[1]);
+
+    tmp = a[1];
+    xw += widemulu(tmp, a[1]);
+    tmp <<= 1;
+    xv += widemulu(tmp, a[0]);
+
+    tmp = a[0];
+    xu += widemulu(tmp, a[0]);
+
+    uint64x3_t
+        lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 },
+        hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 };
+
+    hi = hi<<6 | lo>>58;
+    lo &= mask58;
+
+    hex->lo = lo;
+    hex->hi = hi;
+}
+
+static inline void hexad_sqr_signed (
+    hexad_t *hex,
+    const int64_t *a
+) {
+    __uint128_t xu, xv, xw;
+
+    int64_t tmp = a[2];
+    tmp <<= 1;
+    xw = widemuls(tmp, a[0]);
+    xv = widemuls(tmp, a[2]);
+    tmp <<= 1;
+    xu = widemuls(tmp, a[1]);
+
+    tmp = a[1];
+    xw += widemuls(tmp, a[1]);
+    tmp <<= 1;
+    xv += widemuls(tmp, a[0]);
+
+    tmp = a[0];
+    xu += widemuls(tmp, a[0]);
+
+    uint64x3_t
+        lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 },
+        hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 };
+
+    hi = hi<<6 | lo>>58;
+    lo &= mask58;
+
+    hex->lo = lo;
+    hex->hi = hi;
+}
+
+
+
+void
+p521_mul (
+    p521_t *__restrict__ cs,
+    const p521_t *as,
+    const p521_t *bs
+) {
+    uint64_t *c = cs->limb;
+    const uint64_t *a = as->limb, *b = bs->limb;
+
+    hexad_t ad, be, cf, abde, bcef, acdf;
+    hexad_mul(&ad, &a[0], &b[0]);
+    hexad_mul(&be, &a[3], &b[3]);
+    hexad_mul(&cf, &a[6], &b[6]);
+
+    uint64_t amt = 32;
+    uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 },
+        vhi2 = { 0, 0, -amt<<57, 0 };
+
+    uint64x3_t t0 = cf.lo + be.hi, t1 = ad.lo + timesW(cf.hi) + vhi, t2 = ad.hi + be.lo;
+
+    int64_t ta[3], tb[3];
+    // it seems to be faster not to vectorize these loops
+    for (int i=0; i<3; i++) {
+        ta[i] = a[i]-a[i+3];
+        tb[i] = b[i]-b[i+3];
+    }
+    hexad_mul_signed(&abde,ta,tb);
+
+    for (int i=0; i<3; i++) {
+        ta[i] = a[i+3]-a[i+6];
+        tb[i] = b[i+3]-b[i+6];
+    }
+    hexad_mul_signed(&bcef,ta,tb);
+
+    for (int i=0; i<3; i++) {
+        ta[i] = a[i]-a[i+6];
+        tb[i] = b[i]-b[i+6];
+    }
+    hexad_mul_signed(&acdf,ta,tb);
+
+    uint64x3_t ot0 = t1 + timesW(t0 + t2 - acdf.hi - bcef.lo);
+    uint64x3_t ot1 = t1 + t2 - abde.lo + timesW(t0 - bcef.hi);
+    uint64x3_t ot2 = t1 + t2 + t0 - abde.hi - acdf.lo + vhi2;
+
+    uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
+    uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
+    uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);
+
+    store3(&c[0], out0);
+    store3(&c[3], out1);
+    store3(&c[6], out2);
+}
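+
+/* Note on the recombination in p521_mul above (and p521_sqr below):
+ * grouping logical limbs by index mod 3 writes a field element as
+ * A0 + A1*u + A2*u^2 with u = 2^58 and u^3 = W, where each A_i is a hexad
+ * column.  The full product needs A0*B0, A1*B1, A2*B2 plus cross terms;
+ * the cross terms come Karatsuba-style from the difference products abde,
+ * bcef and acdf, e.g. A0*B1 + A1*B0 = ad + be - abde.  timesW multiplies
+ * a column vector by W: since W^3 == 2 (mod p), (x, y, z) maps to
+ * (2z, x, y), done with clang vector swizzles as u.zxyw + u.zwww.  The
+ * vhi/vhi2 offsets add a multiple of p up front so every lane stays
+ * non-negative, leaving the result unchanged mod p. */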
+void
+p521_sqr (
+    p521_t *__restrict__ cs,
+    const p521_t *as
+) {
+    uint64_t *c = cs->limb;
+    const uint64_t *a = as->limb;
+
+    hexad_t ad, be, cf, abde, bcef, acdf;
+    hexad_sqr(&ad, &a[0]);
+    hexad_sqr(&be, &a[3]);
+    hexad_sqr(&cf, &a[6]);
+
+    uint64_t amt = 32;
+    uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 },
+        vhi2 = { 0, 0, -amt<<57, 0 };
+
+    uint64x3_t t0 = cf.lo + be.hi, t1 = ad.lo + timesW(cf.hi) + vhi, t2 = ad.hi + be.lo;
+
+    int64_t ta[3];
+    // it seems to be faster not to vectorize these loops
+    for (int i=0; i<3; i++) {
+        ta[i] = a[i]-a[i+3];
+    }
+    hexad_sqr_signed(&abde,ta);
+
+    for (int i=0; i<3; i++) {
+        ta[i] = a[i+3]-a[i+6];
+    }
+    hexad_sqr_signed(&bcef,ta);
+
+    for (int i=0; i<3; i++) {
+        ta[i] = a[i]-a[i+6];
+    }
+    hexad_sqr_signed(&acdf,ta);
+
+    uint64x3_t ot0 = t1 + timesW(t0 + t2 - acdf.hi - bcef.lo);
+    uint64x3_t ot1 = t1 + t2 - abde.lo + timesW(t0 - bcef.hi);
+    uint64x3_t ot2 = t1 + t2 + t0 - abde.hi - acdf.lo + vhi2;
+
+    uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
+    uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
+    uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);
+
+    store3(&c[0], out0);
+    store3(&c[3], out1);
+    store3(&c[6], out2);
+}
+
+void
+p521_mulw (
+    p521_t *__restrict__ cs,
+    const p521_t *as,
+    uint64_t b
+) {
+    const uint64_t *a = as->limb;
+    uint64_t *c = cs->limb;
+
+    __uint128_t accum0 = 0, accum3 = 0, accum6 = 0;
+    uint64_t mask = (1ull<<58) - 1;
+
+    int i;
+    for (i=0; i<3; i++) {
+        accum0 += widemul(b, a[LIMBPERM(i)]);
+        accum3 += widemul(b, a[LIMBPERM(i+3)]);
+        accum6 += widemul(b, a[LIMBPERM(i+6)]);
+        c[LIMBPERM(i)]   = accum0 & mask; accum0 >>= 58;
+        c[LIMBPERM(i+3)] = accum3 & mask; accum3 >>= 58;
+        if (i==2) {
+            c[LIMBPERM(i+6)] = accum6 & (mask>>1); accum6 >>= 57;
+        } else {
+            c[LIMBPERM(i+6)] = accum6 & mask; accum6 >>= 58;
+        }
+    }
+
+    accum0 += c[LIMBPERM(3)];
+    c[LIMBPERM(3)] = accum0 & mask;
+    c[LIMBPERM(4)] += accum0 >> 58;
+
+    accum3 += c[LIMBPERM(6)];
+    c[LIMBPERM(6)] = accum3 & mask;
+    c[LIMBPERM(7)] += accum3 >> 58;
+
+    accum6 += c[LIMBPERM(0)];
+    c[LIMBPERM(0)] = accum6 & mask;
+    c[LIMBPERM(1)] += accum6 >> 58;
+}
+
+
+void
+p521_strong_reduce (
+    p521_t *a
+) {
+    uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1;
+
+    /* first, clear high */
+    __int128_t scarry = a->limb[LIMBPERM(8)]>>57;
+    a->limb[LIMBPERM(8)] &= mask2;
+
+    /* now the total is less than 2p */
+
+    /* compute total_value - p.  No need to reduce mod p. */
+
+    int i;
+    for (i=0; i<9; i++) {
+        scarry = scarry + a->limb[LIMBPERM(i)] - ((i==8) ? mask2 : mask);
+        a->limb[LIMBPERM(i)] = scarry & ((i==8) ? mask2 : mask);
+        scarry >>= (i==8) ? 57 : 58;
+    }
+
+    /* uncommon case: it was >= p, so now scarry = 0 and this = x
+     * common case: it was < p, so now scarry = -1 and this = x - p + 2^521
+     * so let's add back in p.  will carry back off the top for 2^521.
+     */
+
+    assert(is_zero(scarry) | is_zero(scarry+1));
+
+    uint64_t scarry_mask = scarry & mask;
+    __uint128_t carry = 0;
+
+    /* add it back */
+    for (i=0; i<9; i++) {
+        carry = carry + a->limb[LIMBPERM(i)] + ((i==8)?(scarry_mask>>1):scarry_mask);
+        a->limb[LIMBPERM(i)] = carry & ((i==8) ? mask>>1 : mask);
+        carry >>= (i==8) ? 57 : 58;
+    }
+
+    assert(is_zero(carry + scarry));
+}
+
+mask_t
+p521_is_zero (
+    const struct p521_t *a
+) {
+    struct p521_t b;
+    p521_copy(&b,a);
+    p521_strong_reduce(&b);
+
+    uint64_t any = 0;
+    int i;
+    for (i=0; i<9; i++) {
+        any |= b.limb[i];
+    }
+    return is_zero(any);
+}
+
+void
+p521_serialize (
+    uint8_t *serial,
+    const struct p521_t *x
+) {
+    int i,k=0;
+    p521_t red;
+    p521_copy(&red, x);
+    p521_strong_reduce(&red);
+
+    uint64_t r=0;
+    int bits = 0;
+    for (i=0; i<9; i++) {
+        r |= red.limb[LIMBPERM(i)] << bits;
+        for (bits += 58; bits >= 8; bits -= 8) {
+            serial[k++] = r;
+            r >>= 8;
+        }
+        assert(bits <= 6);
+    }
+    assert(bits);
+    serial[k++] = r;
+}
+
+mask_t
+p521_deserialize (
+    p521_t *x,
+    const uint8_t serial[66]
+) {
+    int i,k=0,bits=0;
+    __uint128_t out = 0;
+    uint64_t mask = (1ull<<58)-1;
+    for (i=0; i<9; i++) {
+        out >>= 58;
+        for (; bits<58; bits+=8) {
+            out |= ((__uint128_t)serial[k++])<<bits;
+        }
+        x->limb[LIMBPERM(i)] = out & mask;
+        bits -= 58;
+    }
+
+    /* Check for reduction.  First, high has to be < 2^57 */
+    mask_t good = is_zero(out>>57);
+
+    uint64_t and = -1ull;
+    for (i=0; i<8; i++) {
+        and &= x->limb[i];
+    }
+    and &= (2*out+1);
+    good &= is_zero((and+1)>>58);
+
+    return good;
+}
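The LIMBPERM transpose used throughout this file is easiest to see as the transpose of a 3x3 grid of limbs. A tiny standalone check (illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdio.h>

    /* Same macro as in arch_x86_64/p521.h: logical limb x lives at
     * physical index (x%3)*3 + x/3, i.e. the transpose of a 3x3 grid. */
    #define LIMBPERM(x) (((x)%3)*3 + (x)/3)

    int main(void) {
        /* Expected mapping: 0 3 6 1 4 7 2 5 8 */
        int seen[9] = {0};
        for (int x = 0; x < 9; x++) {
            int p = LIMBPERM(x);
            printf("logical %d -> physical %d\n", x, p);
            seen[p]++;
        }
        for (int p = 0; p < 9; p++) assert(seen[p] == 1);  /* a permutation */
        return 0;
    }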
diff --git a/src/p521/arch_x86_64/p521.h b/src/p521/arch_x86_64/p521.h
new file mode 100644
index 0000000..c173b9e
--- /dev/null
+++ b/src/p521/arch_x86_64/p521.h
@@ -0,0 +1,247 @@
+/* Copyright (c) 2014 Cryptography Research, Inc.
+ * Released under the MIT License. See LICENSE.txt for license information.
+ */
+#ifndef __P521_H__
+#define __P521_H__ 1
+
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#include "word.h"
+
+#define LIMBPERM(x) (((x)%3)*3 + (x)/3)
+#define USE_P521_3x3_TRANSPOSE
+
+typedef struct p521_t {
+    uint64_t limb[9];
+} p521_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static __inline__ void
+p521_set_ui (
+    p521_t *out,
+    uint64_t x
+) __attribute__((unused));
+
+static __inline__ void
+p521_add (
+    p521_t *out,
+    const p521_t *a,
+    const p521_t *b
+) __attribute__((unused));
+
+static __inline__ void
+p521_sub (
+    p521_t *out,
+    const p521_t *a,
+    const p521_t *b
+) __attribute__((unused));
+
+static __inline__ void
+p521_neg (
+    p521_t *out,
+    const p521_t *a
+) __attribute__((unused));
+
+static __inline__ void
+p521_addw (
+    p521_t *a,
+    uint64_t x
+) __attribute__((unused));
+
+static __inline__ void
+p521_subw (
+    p521_t *a,
+    uint64_t x
+) __attribute__((unused));
+
+static __inline__ void
+p521_copy (
+    p521_t *out,
+    const p521_t *a
+) __attribute__((unused));
+
+static __inline__ void
+p521_weak_reduce (
+    p521_t *inout
+) __attribute__((unused));
+
+void
+p521_strong_reduce (
+    p521_t *inout
+);
+
+mask_t
+p521_is_zero (
+    const p521_t *in
+);
+
+static __inline__ void
+p521_bias (
+    p521_t *inout,
+    int amount
+) __attribute__((unused));
+
+static __inline__ void
+p521_really_bias (
+    p521_t *inout,
+    int amount
+) __attribute__((unused));
+
+void
+p521_mul (
+    p521_t *__restrict__ out,
+    const p521_t *a,
+    const p521_t *b
+);
+
+void
+p521_mulw (
+    p521_t *__restrict__ out,
+    const p521_t *a,
+    uint64_t b
+);
+
+void
+p521_sqr (
+    p521_t *__restrict__ out,
+    const p521_t *a
+);
+
+void
+p521_serialize (
+    uint8_t *serial,
+    const struct p521_t *x
+);
+
+mask_t
+p521_deserialize (
+    p521_t *x,
+    const uint8_t serial[66]
+);
+
+/* -------------- Inline functions begin here -------------- */
+
+void
+p521_set_ui (
+    p521_t *out,
+    uint64_t x
+) {
+    int i;
+    out->limb[0] = x;
+    for (i=1; i<9; i++) {
+        out->limb[i] = 0;
+    }
+}
+
+void
+p521_add (
+    p521_t *out,
+    const p521_t *a,
+    const p521_t *b
+) {
+    unsigned int i;
+    for (i=0; i<9; i++) {
+        out->limb[i] = a->limb[i] + b->limb[i];
+    }
+    p521_weak_reduce(out);
+}
+
+void
+p521_sub (
+    p521_t *out,
+    const p521_t *a,
+    const p521_t *b
+) {
+    unsigned int i;
+    uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4;
+    for (i=0; i<9; i++) {
+        out->limb[i] = a->limb[i] - b->limb[i] + ((i==8) ? co2 : co1);
+    }
+    p521_weak_reduce(out);
+}
+
+void
+p521_neg (
+    struct p521_t *out,
+    const p521_t *a
+) {
+    unsigned int i;
+    uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4;
+    for (i=0; i<9; i++) {
+        out->limb[i] = ((i==8) ? co2 : co1) - a->limb[i];
+    }
+    p521_weak_reduce(out);
+}
+
+void
+p521_addw (
+    p521_t *a,
+    uint64_t x
+) {
+    a->limb[0] += x;
+    a->limb[LIMBPERM(1)] += a->limb[0]>>58;
+    a->limb[0] &= (1ull<<58)-1;
+}
+
+void
+p521_subw (
+    p521_t *a,
+    uint64_t x
+) {
+    a->limb[0] -= x;
+    p521_really_bias(a, 1);
+    p521_weak_reduce(a);
+}
+
+void
+p521_copy (
+    p521_t *out,
+    const p521_t *a
+) {
+    memcpy(out,a,sizeof(*a));
+}
+
+void
+p521_really_bias (
+    p521_t *a,
+    int amt
+) {
+    uint64_t co1 = ((1ull<<58)-1)*2*amt, co2 = ((1ull<<57)-1)*2*amt;
+    int i;
+    for (i=0; i<9; i++) {
+        a->limb[i] += (i==8) ? co2 : co1;
+    }
+}
+
+void
+p521_bias (
+    p521_t *a,
+    int amt
+) {
+    (void) a;
+    (void) amt;
+}
+
+void
+p521_weak_reduce (
+    p521_t *a
+) {
+    uint64_t mask = (1ull<<58) - 1;
+    uint64_t tmp = a->limb[8] >> 57;
+    int i;
+    for (i=8; i>0; i--) {
+        a->limb[LIMBPERM(i)] = (a->limb[LIMBPERM(i)] & ((i==8) ? mask>>1 : mask)) + (a->limb[LIMBPERM(i-1)]>>58);
+    }
+    a->limb[0] = (a->limb[0] & mask) + tmp;
+}
+
+#ifdef __cplusplus
+}; /* extern "C" */
+#endif
+
+#endif /* __P521_H__ */
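Because the x86_64 representation stores limbs transposed, precomputed field constants must be stored in permuted order as well; the magic.c hunks below do exactly that, guarding the reordered tables with USE_P521_3x3_TRANSPOSE (defined in arch_x86_64/p521.h above). The reordered list is just the plain one read off through LIMBPERM: physical slots 0..8 hold logical limbs (0, 3, 6, 1, 4, 7, 2, 5, 8) — e.g. the second transposed base-point entry, 0x3331c90d2c6ba52, is the plain list's logical limb 3.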
diff --git a/src/p521/magic.c b/src/p521/magic.c
index 7a34886..75d93c0 100644
--- a/src/p521/magic.c
+++ b/src/p521/magic.c
@@ -18,13 +18,13 @@ const uint8_t FIELD_MODULUS[FIELD_BYTES] = {
 const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = {
     U64LE(0xbf15dbca0ae7f294),
-    U60LE(0x04273ba96570e0ba),
-    U60LE(0xc94750a1813ac0fb),
-    U60LE(0xea4939b8b9037a08),
-    U60LE(0x0000000000000002),
-    U60LE(0x0000000000000000),
-    U60LE(0x0000000000000000),
-    U60LE(0x0000000000000000),
+    U64LE(0x04273ba96570e0ba),
+    U64LE(0xc94750a1813ac0fb),
+    U64LE(0xea4939b8b9037a08),
+    U64LE(0x0000000000000002),
+    U64LE(0x0000000000000000),
+    U64LE(0x0000000000000000),
+    U64LE(0x0000000000000000),
     0x80,
 
     U64LE(0x7e2bb79415cfe529),
@@ -40,6 +40,17 @@ const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = {
 
 const struct affine_t goldilocks_base_point = {
     {{
+#ifdef USE_P521_3x3_TRANSPOSE
+        U58LE(0x02a940a2f19ba6c),
+        U58LE(0x3331c90d2c6ba52),
+        U58LE(0x2878a3bfd9f42fc),
+        U58LE(0x03ec4cd920e2a8c),
+        U58LE(0x0c6203913f6ecc5),
+        U58LE(0x06277e432c8a5ac),
+        U58LE(0x1d568fc99c6059d),
+        U58LE(0x1b2063b22fcf270),
+        U58LE(0x0752cb45c48648b)
+#else
         U58LE(0x02a940a2f19ba6c),
         U58LE(0x03ec4cd920e2a8c),
         U58LE(0x1d568fc99c6059d),
@@ -49,6 +60,7 @@ const struct affine_t goldilocks_base_point = {
         U58LE(0x2878a3bfd9f42fc),
         U58LE(0x06277e432c8a5ac),
         U58LE(0x0752cb45c48648b)
+#endif
     }},
     {{ 12 }}
 };
@@ -69,6 +81,17 @@ const struct barrett_prime_t curve_prime_order = {
 
 const struct field_t
 sqrt_d_minus_1 = {{
+#ifdef USE_P521_3x3_TRANSPOSE
+    U58LE(0x1e2be72c1c81990),
+    U58LE(0x207dfc238a33e46),
+    U58LE(0x2264cfb418c4c30),
+    U58LE(0x1135002ad596c69),
+    U58LE(0x0e30107cd79d1f6),
+    U58LE(0x0524b9e715937f5),
+    U58LE(0x2ab3a257a22666d),
+    U58LE(0x2d80cc2936a1824),
+    U58LE(0x0a9ea3ac10d6aed)
+#else
     U58LE(0x1e2be72c1c81990),
     U58LE(0x1135002ad596c69),
     U58LE(0x2ab3a257a22666d),
@@ -78,4 +101,5 @@
 sqrt_d_minus_1 = {{
     U58LE(0x2264cfb418c4c30),
     U58LE(0x0524b9e715937f5),
     U58LE(0x0a9ea3ac10d6aed)
+#endif
 }};
diff --git a/src/scalarmul.c b/src/scalarmul.c
index 502dd3f..b85a42c 100644
--- a/src/scalarmul.c
+++ b/src/scalarmul.c
@@ -163,7 +163,9 @@ scalarmul (
     copy_tw_extensible(&tabulator, working);
     double_tw_extensible(&tabulator);
 
-    struct tw_pniels_t pn, multiples[NTABLE];
+    struct tw_pniels_t
+        pn VECTOR_ALIGNED,
+        multiples[NTABLE] VECTOR_ALIGNED;
     convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
     convert_tw_extensible_to_tw_pniels(&multiples[0], working);
 
@@ -225,7 +227,9 @@ scalarmul_vlook (
     copy_tw_extensible(&tabulator, working);
     double_tw_extensible(&tabulator);
 
-    struct tw_pniels_t pn, multiples[NTABLE];
+    struct tw_pniels_t
+        pn VECTOR_ALIGNED,
+        multiples[NTABLE] VECTOR_ALIGNED;
     convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
     convert_tw_extensible_to_tw_pniels(&multiples[0], working);
diff --git a/test/bench.c b/test/bench.c
index 399337d..69a5dd0 100644
--- a/test/bench.c
+++ b/test/bench.c
@@ -535,11 +535,11 @@ int main(int argc, char **argv) {
         printf("%02x", hsk.opaque[i]);
     }
    printf("\nss1 = ");
-    for (i=0; i<sizeof(ss1); i++) {
[... remainder of this hunk lost in extraction ...]
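The test_arithmetic.c change below replaces a floor with a ceiling division: with p521's nine 8-byte limbs in a 72-byte struct, the per-limb radix is 58 bits, and the old floor formula computed 57, tripping the limb-bounds sanity check. A tiny illustration of the idiom (standalone, illustrative values):

    #include <assert.h>

    /* Ceiling division for positive integers: 1 + (a-1)/b. */
    #define CEIL_DIV(a, b) (1 + ((a) - 1) / (b))

    int main(void) {
        /* p521 as 9x8-byte limbs: sizeof(limb)*FIELD_BITS/sizeof(field) */
        assert(8 * 521 / 72 == 57);           /* floor: underestimates the radix */
        assert(CEIL_DIV(8 * 521, 72) == 58);  /* ceiling: matches the 58-bit limbs */
        /* p448 as 8x8-byte limbs divides evenly, so both formulas agree. */
        assert(8 * 448 / 64 == 56 && CEIL_DIV(8 * 448, 64) == 56);
        return 0;
    }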
diff --git a/test/test_arithmetic.c b/test/test_arithmetic.c
--- a/test/test_arithmetic.c
+++ b/test/test_arithmetic.c
@@ ... @@
     for (i=0; i<sizeof(*x)/sizeof(x->limb[0]); i++) {
-        int radix_bits = sizeof(x->limb[0]) * FIELD_BITS / sizeof(*x);
+        int radix_bits = 1 + (sizeof(x->limb[0]) * FIELD_BITS - 1) / sizeof(*x);
         word_t yardstick = (i==sizeof(*x)/sizeof(x->limb[0])/2) ? (1ull<<radix_bits) - 2 : (1ull<<radix_bits) - 1;
         if (x->limb[i] < yardstick * lowBound || x->limb[i] > yardstick * highBound) {
@@ -184,6 +184,11 @@ static mask_t test_isr (
     return succ;
 }
 
+void dbg_gmp_printf(const mpz_t x);
+void dbg_gmp_printf(const mpz_t x) {
+    gmp_printf("DEBUG: 0x%Zx\n", x);
+}
+
 int test_arithmetic (void) {
     int j, ntests = 100000;
diff --git a/test/test_pointops.c b/test/test_pointops.c
index 6d4230d..4f29868 100644
--- a/test/test_pointops.c
+++ b/test/test_pointops.c
@@ -270,6 +270,11 @@ int test_pointops (void) {
     for (i=0; i<1000; i++) {
         uint8_t ser[FIELD_BYTES];
         crandom_generate(&crand, ser, sizeof(ser));
+
+
+        #if (FIELD_BITS % 8)
+            ser[FIELD_BYTES-1] &= (1<<(FIELD_BITS%8)) - 1;
+        #endif
 
         /* TODO: we need a field generate, which can return random or pathological. */
         mask_t succ = field_deserialize(&serf, ser);
diff --git a/test/test_scalarmul.c b/test/test_scalarmul.c
index 80636cf..b1a9c41 100644
--- a/test/test_scalarmul.c
+++ b/test/test_scalarmul.c
@@ -283,6 +283,19 @@ single_scalarmul_commutativity_test (
     }
 }
 
+static void crandom_generate_f(struct crandom_state_t *crand, uint8_t *scalar, int n) {
+    crandom_generate(crand, scalar, n);
+    int i;
+    for (i = FIELD_BYTES; i<n; i++) {
+        scalar[i] = 0;
+    }
+#if (FIELD_BITS % 8)
+    if (n >= FIELD_BYTES) {
+        scalar[FIELD_BYTES-1] &= (1<<(FIELD_BITS%8)) - 1;
+    }
+#endif
+}
+
 int test_scalarmul_commutativity (void) {
     int i,j,k,got;
@@ -296,7 +309,7 @@ int test_scalarmul_commutativity (void) {
         for (k=0; k<128 && !got; k++) {
             uint8_t ser[FIELD_BYTES];
             word_t scalar1[SCALAR_WORDS], scalar2[SCALAR_WORDS];
-            crandom_generate(&crand, ser, sizeof(ser));
+            crandom_generate_f(&crand, ser, sizeof(ser));
             crandom_generate(&crand, (uint8_t *)scalar1, sizeof(scalar1));
             crandom_generate(&crand, (uint8_t *)scalar2, sizeof(scalar2));
 
@@ -338,7 +351,7 @@ int test_linear_combo (void) {
         crandom_generate(&crand, (uint8_t *)scalar2, sizeof(scalar2));
 
         field_t base1;
-        crandom_generate(&crand, ser, sizeof(ser));
+        crandom_generate_f(&crand, ser, sizeof(ser));
         mask_t succ = field_deserialize(&base1, ser);
         if (!succ) continue;
 
@@ -377,7 +390,7 @@ int test_scalarmul_compatibility (void) {
     for (k=0; k<128 && !got; k++) {
         uint8_t ser[FIELD_BYTES];
         word_t scalar[SCALAR_WORDS];
-        crandom_generate(&crand, ser, sizeof(ser));
+        crandom_generate_f(&crand, ser, sizeof(ser));
         crandom_generate(&crand, (uint8_t *)scalar, sizeof(scalar));
 
         field_t base;
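The crandom_generate_f helper and the test_pointops masking implement the same idea: when FIELD_BITS is not a multiple of 8 (521 = 8*65 + 1), clear the unused high bits of the top byte so a random candidate has a reasonable chance of deserializing below the modulus. A standalone sketch of the pattern — rand_bytes is a hypothetical stand-in for crandom_generate; any byte-oriented RNG works:

    #include <stdint.h>
    #include <stddef.h>

    #define FIELD_BITS  521
    #define FIELD_BYTES ((FIELD_BITS + 7) / 8)   /* 66 */

    extern void rand_bytes(uint8_t *buf, size_t n);  /* assumed RNG */

    /* Mask a fresh candidate down to FIELD_BITS bits; the caller still
     * rejects candidates >= p via the deserialize return mask. */
    static void field_candidate(uint8_t ser[FIELD_BYTES]) {
        rand_bytes(ser, FIELD_BYTES);
    #if (FIELD_BITS % 8)
        ser[FIELD_BYTES-1] &= (1 << (FIELD_BITS % 8)) - 1;  /* keep 1 bit for p521 */
    #endif
    }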