| @@ -19,13 +19,13 @@ ASM ?= $(CC) | |||||
| DECAF ?= decaf_fast | DECAF ?= decaf_fast | ||||
| ifneq (,$(findstring x86_64,$(MACHINE))) | ifneq (,$(findstring x86_64,$(MACHINE))) | ||||
| ARCH ?= arch_x86_64 | |||||
| ARCH ?= arch_ref64 | |||||
| else | else | ||||
| # no i386 port yet | # no i386 port yet | ||||
| ARCH ?= arch_arm_32 | |||||
| ARCH ?= arch_ref32 | |||||
| endif | endif | ||||
| FIELD ?= p255 | |||||
| FIELD ?= p25519 | |||||
| WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ | WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ | ||||
| -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN) | -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN) | ||||
| @@ -35,7 +35,7 @@ INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH) | |||||
| LANGFLAGS = -std=c99 -fno-strict-aliasing | LANGFLAGS = -std=c99 -fno-strict-aliasing | ||||
| LANGXXFLAGS = -fno-strict-aliasing | LANGXXFLAGS = -fno-strict-aliasing | ||||
| GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC | GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC | ||||
| OFLAGS ?= -O3 | |||||
| OFLAGS ?= -O2 | |||||
| TODAY = $(shell date "+%Y-%m-%d") | TODAY = $(shell date "+%Y-%m-%d") | ||||
| @@ -61,7 +61,7 @@ typedef uint64_t decaf_dword_t; | |||||
| /** Galois field element internal structure */ | /** Galois field element internal structure */ | ||||
| typedef struct gf_s { | typedef struct gf_s { | ||||
| decaf_word_t limb[DECAF_255_LIMBS]; | decaf_word_t limb[DECAF_255_LIMBS]; | ||||
| } __attribute__((aligned(32))) gf_s, gf[1]; | |||||
| } gf_s, gf[1]; | |||||
| /** @endcond */ | /** @endcond */ | ||||
| /** Number of bytes in a serialized point. */ | /** Number of bytes in a serialized point. */ | ||||
| @@ -18,7 +18,7 @@ | |||||
| #include "shake.h" | #include "shake.h" | ||||
| /** Number of bytes for a symmetric key (expanded to full key) */ | /** Number of bytes for a symmetric key (expanded to full key) */ | ||||
| #define DECAF_448_SYMMETRIC_KEY_BYTES 32 | |||||
| #define DECAF_255_SYMMETRIC_KEY_BYTES 32 | |||||
| /** @cond internal */ | /** @cond internal */ | ||||
| #define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h | #define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h | ||||
| @@ -31,29 +31,29 @@ | |||||
| /** @endcond */ | /** @endcond */ | ||||
| /** A symmetric key, the compressed point of a private key. */ | /** A symmetric key, the compressed point of a private key. */ | ||||
| typedef unsigned char decaf_448_symmetric_key_t[DECAF_448_SYMMETRIC_KEY_BYTES]; | |||||
| typedef unsigned char decaf_255_symmetric_key_t[DECAF_255_SYMMETRIC_KEY_BYTES]; | |||||
| /** An encoded public key. */ | /** An encoded public key. */ | ||||
| typedef unsigned char decaf_448_public_key_t[DECAF_448_SER_BYTES]; | |||||
| typedef unsigned char decaf_255_public_key_t[DECAF_255_SER_BYTES]; | |||||
| /** A signature. */ | /** A signature. */ | ||||
| typedef unsigned char decaf_448_signature_t[DECAF_448_SER_BYTES + DECAF_448_SCALAR_BYTES]; | |||||
| typedef unsigned char decaf_255_signature_t[DECAF_255_SER_BYTES + DECAF_255_SCALAR_BYTES]; | |||||
| typedef struct { | typedef struct { | ||||
| /** @cond internal */ | /** @cond internal */ | ||||
| /** The symmetric key from which everything is expanded */ | /** The symmetric key from which everything is expanded */ | ||||
| decaf_448_symmetric_key_t sym; | |||||
| decaf_255_symmetric_key_t sym; | |||||
| /** The scalar x */ | /** The scalar x */ | ||||
| decaf_448_scalar_t secret_scalar; | |||||
| decaf_255_scalar_t secret_scalar; | |||||
| /** x*Base */ | /** x*Base */ | ||||
| decaf_448_public_key_t pub; | |||||
| decaf_255_public_key_t pub; | |||||
| /** @endcond */ | /** @endcond */ | ||||
| } /** Private key structure for pointers. */ | } /** Private key structure for pointers. */ | ||||
| decaf_448_private_key_s, | |||||
| decaf_255_private_key_s, | |||||
| /** A private key (gmp array[1] style). */ | /** A private key (gmp array[1] style). */ | ||||
| decaf_448_private_key_t[1]; | |||||
| decaf_255_private_key_t[1]; | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| extern "C" { | extern "C" { | ||||
| @@ -64,16 +64,16 @@ extern "C" { | |||||
| * @param [out] priv The derived private key. | * @param [out] priv The derived private key. | ||||
| * @param [in] proto The compressed or proto-key, which must be 32 random bytes. | * @param [in] proto The compressed or proto-key, which must be 32 random bytes. | ||||
| */ | */ | ||||
| void decaf_448_derive_private_key ( | |||||
| decaf_448_private_key_t priv, | |||||
| const decaf_448_symmetric_key_t proto | |||||
| void decaf_255_derive_private_key ( | |||||
| decaf_255_private_key_t priv, | |||||
| const decaf_255_symmetric_key_t proto | |||||
| ) NONNULL2 API_VIS; | ) NONNULL2 API_VIS; | ||||
| /** | /** | ||||
| * @brief Destroy a private key. | * @brief Destroy a private key. | ||||
| */ | */ | ||||
| void decaf_448_destroy_private_key ( | |||||
| decaf_448_private_key_t priv | |||||
| void decaf_255_destroy_private_key ( | |||||
| decaf_255_private_key_t priv | |||||
| ) NONNULL1 API_VIS; | ) NONNULL1 API_VIS; | ||||
| /** | /** | ||||
| @@ -81,9 +81,9 @@ void decaf_448_destroy_private_key ( | |||||
| * @param [out] pub The extracted public key. | * @param [out] pub The extracted public key. | ||||
| * @param [in] priv The private key. | * @param [in] priv The private key. | ||||
| */ | */ | ||||
| void decaf_448_private_to_public ( | |||||
| decaf_448_public_key_t pub, | |||||
| const decaf_448_private_key_t priv | |||||
| void decaf_255_private_to_public ( | |||||
| decaf_255_public_key_t pub, | |||||
| const decaf_255_private_key_t priv | |||||
| ) NONNULL2 API_VIS; | ) NONNULL2 API_VIS; | ||||
| /** | /** | ||||
| @@ -104,11 +104,11 @@ void decaf_448_private_to_public ( | |||||
| * and will almost definitely change in the future. | * and will almost definitely change in the future. | ||||
| */ | */ | ||||
| decaf_bool_t | decaf_bool_t | ||||
| decaf_448_shared_secret ( | |||||
| decaf_255_shared_secret ( | |||||
| uint8_t *shared, | uint8_t *shared, | ||||
| size_t shared_bytes, | size_t shared_bytes, | ||||
| const decaf_448_private_key_t my_privkey, | |||||
| const decaf_448_public_key_t your_pubkey | |||||
| const decaf_255_private_key_t my_privkey, | |||||
| const decaf_255_public_key_t your_pubkey | |||||
| ) NONNULL134 WARN_UNUSED API_VIS; | ) NONNULL134 WARN_UNUSED API_VIS; | ||||
| /** | /** | ||||
| @@ -119,9 +119,9 @@ decaf_448_shared_secret ( | |||||
| * @param [in] shake A SHAKE256 context with the message. | * @param [in] shake A SHAKE256 context with the message. | ||||
| */ | */ | ||||
| void | void | ||||
| decaf_448_sign_shake ( | |||||
| decaf_448_signature_t sig, | |||||
| const decaf_448_private_key_t priv, | |||||
| decaf_255_sign_shake ( | |||||
| decaf_255_signature_t sig, | |||||
| const decaf_255_private_key_t priv, | |||||
| const keccak_sponge_t shake | const keccak_sponge_t shake | ||||
| ) NONNULL3 API_VIS; | ) NONNULL3 API_VIS; | ||||
| @@ -134,9 +134,9 @@ decaf_448_sign_shake ( | |||||
| * @param [in] message_len The message's length. | * @param [in] message_len The message's length. | ||||
| */ | */ | ||||
| void | void | ||||
| decaf_448_sign ( | |||||
| decaf_448_signature_t sig, | |||||
| const decaf_448_private_key_t priv, | |||||
| decaf_255_sign ( | |||||
| decaf_255_signature_t sig, | |||||
| const decaf_255_private_key_t priv, | |||||
| const unsigned char *message, | const unsigned char *message, | ||||
| size_t message_len | size_t message_len | ||||
| ) NONNULL3 API_VIS; | ) NONNULL3 API_VIS; | ||||
| @@ -149,9 +149,9 @@ decaf_448_sign ( | |||||
| * @param [in] shake A SHAKE256 context with the message. | * @param [in] shake A SHAKE256 context with the message. | ||||
| */ | */ | ||||
| decaf_bool_t | decaf_bool_t | ||||
| decaf_448_verify_shake ( | |||||
| const decaf_448_signature_t sig, | |||||
| const decaf_448_public_key_t pub, | |||||
| decaf_255_verify_shake ( | |||||
| const decaf_255_signature_t sig, | |||||
| const decaf_255_public_key_t pub, | |||||
| const keccak_sponge_t shake | const keccak_sponge_t shake | ||||
| ) NONNULL3 API_VIS WARN_UNUSED; | ) NONNULL3 API_VIS WARN_UNUSED; | ||||
| @@ -164,9 +164,9 @@ decaf_448_verify_shake ( | |||||
| * @param [in] message_len The message's length. | * @param [in] message_len The message's length. | ||||
| */ | */ | ||||
| decaf_bool_t | decaf_bool_t | ||||
| decaf_448_verify ( | |||||
| const decaf_448_signature_t sig, | |||||
| const decaf_448_public_key_t pub, | |||||
| decaf_255_verify ( | |||||
| const decaf_255_signature_t sig, | |||||
| const decaf_255_public_key_t pub, | |||||
| const unsigned char *message, | const unsigned char *message, | ||||
| size_t message_len | size_t message_len | ||||
| ) NONNULL3 API_VIS WARN_UNUSED; | ) NONNULL3 API_VIS WARN_UNUSED; | ||||
| @@ -192,18 +192,18 @@ private: | |||||
| }; | }; | ||||
| /**@cond internal*/ | /**@cond internal*/ | ||||
| inline Ed448::Scalar::Scalar(SpongeRng &rng) NOEXCEPT { | |||||
| inline Ed255::Scalar::Scalar(SpongeRng &rng) NOEXCEPT { | |||||
| *this = rng.read(SER_BYTES); | *this = rng.read(SER_BYTES); | ||||
| } | } | ||||
| inline Ed448::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT { | |||||
| inline Ed255::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT { | |||||
| SecureBuffer buffer((uniform ? 2 : 1) * HASH_BYTES); | SecureBuffer buffer((uniform ? 2 : 1) * HASH_BYTES); | ||||
| rng.read(buffer); | rng.read(buffer); | ||||
| set_to_hash(buffer); | set_to_hash(buffer); | ||||
| } | } | ||||
| inline SecureBuffer Ed448::Point::steg_encode(SpongeRng &rng) const NOEXCEPT { | |||||
| inline SecureBuffer Ed255::Point::steg_encode(SpongeRng &rng) const NOEXCEPT { | |||||
| SecureBuffer out(STEG_BYTES); | SecureBuffer out(STEG_BYTES); | ||||
| bool done; | bool done; | ||||
| do { | do { | ||||
| @@ -45,14 +45,22 @@ typedef int64_t decaf_sdword_t; | |||||
| #define siv static inline void __attribute__((always_inline)) | #define siv static inline void __attribute__((always_inline)) | ||||
| static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}}; | static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}}; | ||||
| static const int EDWARDS_D = 121665; | |||||
| static const int EDWARDS_D = -89747; | |||||
| // Gonna test with PinkBikeShed until the math works... | |||||
| // Curve25519: 121665; | |||||
| static const scalar_t sc_p = {{{ | static const scalar_t sc_p = {{{ | ||||
| // Gonna test with PinkBikeShed until the math works... | |||||
| SC_LIMB(0xb6b98fd8849faf35), | |||||
| SC_LIMB(0x16241e6093b2ce59), | |||||
| SC_LIMB(0), | |||||
| SC_LIMB(0x2000000000000000) | |||||
| /* Curve25519: | |||||
| SC_LIMB(0x5812631a5cf5d3ed), | SC_LIMB(0x5812631a5cf5d3ed), | ||||
| SC_LIMB(0x14def9dea2f79cd6), | SC_LIMB(0x14def9dea2f79cd6), | ||||
| SC_LIMB(0), | SC_LIMB(0), | ||||
| SC_LIMB(0), | |||||
| SC_LIMB(0x1000000000000000) | SC_LIMB(0x1000000000000000) | ||||
| */ | |||||
| }}}; | }}}; | ||||
| const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}}; | const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}}; | ||||
| @@ -61,7 +69,7 @@ extern const decaf_word_t MONTGOMERY_FACTOR; | |||||
| /* sqrt(9) = 3 from the curve spec. Not exported, but used by pregen tool. */ | /* sqrt(9) = 3 from the curve spec. Not exported, but used by pregen tool. */ | ||||
| const unsigned char base_point_ser_for_pregen[SER_BYTES] = { | const unsigned char base_point_ser_for_pregen[SER_BYTES] = { | ||||
| 3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 | |||||
| 5 /*PinkBikeShed. Curve25519: 3*/, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 | |||||
| }; | }; | ||||
| extern const point_t API_NS(point_base); | extern const point_t API_NS(point_base); | ||||
| @@ -82,16 +90,16 @@ const size_t API_NS2(alignof,precomputed_s) = 32; | |||||
| #ifdef __clang__ | #ifdef __clang__ | ||||
| #if 100*__clang_major__ + __clang_minor__ > 305 | #if 100*__clang_major__ + __clang_minor__ > 305 | ||||
| #define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)") | |||||
| #define UNROLL _Pragma("clang loop unroll(full)") // FIXME: vectorize? | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifndef VECTORIZE | |||||
| #define VECTORIZE | |||||
| #ifndef UNROLL | |||||
| #define UNROLL | |||||
| #endif | #endif | ||||
| #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++) { op; }} | #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++) { op; }} | ||||
| #define FOR_LIMB_V(i,op) { unsigned int i=0; VECTORIZE for (i=0; i<NLIMBS; i++) { op; }} | |||||
| #define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++) { op; }} | |||||
| /** Copy x = y */ | /** Copy x = y */ | ||||
| siv gf_cpy(gf x, const gf y) { x[0] = y[0]; } | siv gf_cpy(gf x, const gf y) { x[0] = y[0]; } | ||||
| @@ -138,7 +146,7 @@ siv gf_bias ( gf c, int amt) { | |||||
| /** Subtract mod p. Bias by 2 and don't reduce */ | /** Subtract mod p. Bias by 2 and don't reduce */ | ||||
| siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) { | siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) { | ||||
| // FOR_LIMB_V(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] ); | |||||
| // FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] ); | |||||
| ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO | ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO | ||||
| field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b); | field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b); | ||||
| gf_bias(c, 2); | gf_bias(c, 2); | ||||
| @@ -155,7 +163,7 @@ siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) { | |||||
| /** Add mod p. Don't reduce. */ | /** Add mod p. Don't reduce. */ | ||||
| siv gf_add_nr ( gf c, const gf a, const gf b ) { | siv gf_add_nr ( gf c, const gf a, const gf b ) { | ||||
| // FOR_LIMB_V(i, c->limb[i] = a->limb[i] + b->limb[i]); | |||||
| // FOR_LIMB_U(i, c->limb[i] = a->limb[i] + b->limb[i]); | |||||
| ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO | ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO | ||||
| field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b); | field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b); | ||||
| } | } | ||||
| @@ -183,7 +191,7 @@ sv cond_neg(gf x, decaf_bool_t neg) { | |||||
| /** Constant time, if (swap) (x,y) = (y,x); */ | /** Constant time, if (swap) (x,y) = (y,x); */ | ||||
| siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { | siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) { | ||||
| FOR_LIMB_V(i, { | |||||
| FOR_LIMB_U(i, { | |||||
| decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; | decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap; | ||||
| x->limb[i] ^= s; | x->limb[i] ^= s; | ||||
| y->limb[i] ^= s; | y->limb[i] ^= s; | ||||
| @@ -371,9 +379,27 @@ decaf_bool_t API_NS(scalar_invert) ( | |||||
| } | } | ||||
| return ~API_NS(scalar_eq)(out,API_NS(scalar_zero)); | return ~API_NS(scalar_eq)(out,API_NS(scalar_zero)); | ||||
| #else | #else | ||||
| (void)out; | |||||
| (void)a; | |||||
| return 0; | |||||
| decaf_255_scalar_t b, ma; | |||||
| int i; | |||||
| sc_montmul(b,API_NS(scalar_one),sc_r2); | |||||
| sc_montmul(ma,a,sc_r2); | |||||
| for (i=SCALAR_BITS-1; i>=0; i--) { | |||||
| sc_montsqr(b,b); | |||||
| decaf_word_t w = sc_p->limb[i/WBITS]; | |||||
| if (i<WBITS) { | |||||
| assert(w >= 2); | |||||
| w-=2; | |||||
| } | |||||
| if (1 & w>>(i%WBITS)) { | |||||
| sc_montmul(b,b,ma); | |||||
| } | |||||
| } | |||||
| sc_montmul(out,b,decaf_255_scalar_one); | |||||
| API_NS(scalar_destroy)(b); | |||||
| API_NS(scalar_destroy)(ma); | |||||
| return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero); | |||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -0,0 +1,50 @@ | |||||
| /** | |||||
| * @file decaf_config.h | |||||
| * @author Mike Hamburg | |||||
| * | |||||
| * @copyright | |||||
| * Copyright (c) 2015 Cryptography Research, Inc. \n | |||||
| * Released under the MIT License. See LICENSE.txt for license information. | |||||
| * | |||||
| * @brief Compile-time configuration and performance-tuning constants for decaf_fast.c | |||||
| */ | |||||
| #ifndef __DECAF_255_CONFIG_H__ | |||||
| #define __DECAF_255_CONFIG_H__ 1 | |||||
| /** | |||||
| * Use the Montgomery ladder for direct scalarmul. | |||||
| * | |||||
| * The Montgomery ladder is faster than Edwards scalarmul, but providing | |||||
| * the features Decaf supports (cofactor elimination, twist rejection) | |||||
| * makes it complicated and adds code. Removing the ladder saves a few | |||||
| * kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul | |||||
| * time. | |||||
| */ | |||||
| #define DECAF_USE_MONTGOMERY_LADDER 1 | |||||
| /** The number of comb tables for fixed base scalarmul. */ | |||||
| #define DECAF_COMBS_N 3 | |||||
| /** The number of teeth per comb for fixed base scalarmul. */ | |||||
| #define DECAF_COMBS_T 5 | |||||
| /** The comb spacing for fixed base scalarmul. NOTE(review): DECAF_COMBS_N * DECAF_COMBS_T * DECAF_COMBS_S = 3*5*17 = 255; presumably this product must cover the scalar bit length -- confirm against decaf_fast.c. */ | |||||
| #define DECAF_COMBS_S 17 | |||||
| /** Performance tuning: the width of the fixed window for scalar mul. */ | |||||
| #define DECAF_WINDOW_BITS 4 | |||||
| /** | |||||
| * The number of bits used for the precomputed table in variable-time | |||||
| * double scalarmul. | |||||
| */ | |||||
| #define DECAF_WNAF_FIXED_TABLE_BITS 5 | |||||
| /** | |||||
| * Performance tuning: bits used for the variable table in variable-time | |||||
| * double scalarmul. | |||||
| */ | |||||
| #define DECAF_WNAF_VAR_TABLE_BITS 3 | |||||
| #endif /* __DECAF_255_CONFIG_H__ */ | |||||
| @@ -22,164 +22,33 @@ p255_mul ( | |||||
| const p255_t *as, | const p255_t *as, | ||||
| const p255_t *bs | const p255_t *bs | ||||
| ) { | ) { | ||||
| const uint64_t *a = as->limb, *b = bs->limb; | |||||
| const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1); | |||||
| uint64_t bh[4]; | |||||
| int i,j; | |||||
| for (i=0; i<4; i++) bh[i] = b[i+1] * 19; | |||||
| uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
| __uint128_t accum0 = 0, accum1 = 0, accum2; | |||||
| uint64_t mask = (1ull<<51) - 1; | |||||
| uint64_t aa[4], bb[4], bbb[4]; | |||||
| unsigned int i; | |||||
| for (i=0; i<4; i++) { | |||||
| aa[i] = a[i] + a[i+4]; | |||||
| bb[i] = b[i] + b[i+4]; | |||||
| bbb[i] = bb[i] + b[i+4]; | |||||
| } | |||||
| int I_HATE_UNROLLED_LOOPS = 0; | |||||
| if (I_HATE_UNROLLED_LOOPS) { | |||||
| /* The compiler probably won't unroll this, | |||||
| * so it's like 80% slower. | |||||
| */ | |||||
| for (i=0; i<4; i++) { | |||||
| accum2 = 0; | |||||
| unsigned int j; | |||||
| for (j=0; j<=i; j++) { | |||||
| accum2 += widemul(a[j], b[i-j]); | |||||
| accum1 += widemul(aa[j], bb[i-j]); | |||||
| accum0 += widemul(a[j+4], b[i-j+4]); | |||||
| } | |||||
| for (; j<4; j++) { | |||||
| accum2 += widemul(a[j], b[i-j+8]); | |||||
| accum1 += widemul(aa[j], bbb[i-j+4]); | |||||
| accum0 += widemul(a[j+4], bb[i-j+4]); | |||||
| } | |||||
| accum1 -= accum2; | |||||
| accum0 += accum2; | |||||
| c[i] = ((uint64_t)(accum0)) & mask; | |||||
| c[i+4] = ((uint64_t)(accum1)) & mask; | |||||
| accum0 >>= 56; | |||||
| accum1 >>= 56; | |||||
| __uint128_t accum = 0; | |||||
| for (i=0; i<5; i++) { | |||||
| for (j=0; j<=i; j++) { | |||||
| accum += widemul(b[i-j], a[j]); | |||||
| } | } | ||||
| } else { | |||||
| accum2 = widemul(a[0], b[0]); | |||||
| accum1 += widemul(aa[0], bb[0]); | |||||
| accum0 += widemul(a[4], b[4]); | |||||
| accum2 += widemul(a[1], b[7]); | |||||
| accum1 += widemul(aa[1], bbb[3]); | |||||
| accum0 += widemul(a[5], bb[3]); | |||||
| accum2 += widemul(a[2], b[6]); | |||||
| accum1 += widemul(aa[2], bbb[2]); | |||||
| accum0 += widemul(a[6], bb[2]); | |||||
| accum2 += widemul(a[3], b[5]); | |||||
| accum1 += widemul(aa[3], bbb[1]); | |||||
| accum0 += widemul(a[7], bb[1]); | |||||
| accum1 -= accum2; | |||||
| accum0 += accum2; | |||||
| c[0] = ((uint64_t)(accum0)) & mask; | |||||
| c[4] = ((uint64_t)(accum1)) & mask; | |||||
| accum0 >>= 56; | |||||
| accum1 >>= 56; | |||||
| accum2 = widemul(a[0], b[1]); | |||||
| accum1 += widemul(aa[0], bb[1]); | |||||
| accum0 += widemul(a[4], b[5]); | |||||
| accum2 += widemul(a[1], b[0]); | |||||
| accum1 += widemul(aa[1], bb[0]); | |||||
| accum0 += widemul(a[5], b[4]); | |||||
| accum2 += widemul(a[2], b[7]); | |||||
| accum1 += widemul(aa[2], bbb[3]); | |||||
| accum0 += widemul(a[6], bb[3]); | |||||
| accum2 += widemul(a[3], b[6]); | |||||
| accum1 += widemul(aa[3], bbb[2]); | |||||
| accum0 += widemul(a[7], bb[2]); | |||||
| accum1 -= accum2; | |||||
| accum0 += accum2; | |||||
| c[1] = ((uint64_t)(accum0)) & mask; | |||||
| c[5] = ((uint64_t)(accum1)) & mask; | |||||
| accum0 >>= 56; | |||||
| accum1 >>= 56; | |||||
| accum2 = widemul(a[0], b[2]); | |||||
| accum1 += widemul(aa[0], bb[2]); | |||||
| accum0 += widemul(a[4], b[6]); | |||||
| accum2 += widemul(a[1], b[1]); | |||||
| accum1 += widemul(aa[1], bb[1]); | |||||
| accum0 += widemul(a[5], b[5]); | |||||
| accum2 += widemul(a[2], b[0]); | |||||
| accum1 += widemul(aa[2], bb[0]); | |||||
| accum0 += widemul(a[6], b[4]); | |||||
| accum2 += widemul(a[3], b[7]); | |||||
| accum1 += widemul(aa[3], bbb[3]); | |||||
| accum0 += widemul(a[7], bb[3]); | |||||
| accum1 -= accum2; | |||||
| accum0 += accum2; | |||||
| c[2] = ((uint64_t)(accum0)) & mask; | |||||
| c[6] = ((uint64_t)(accum1)) & mask; | |||||
| accum0 >>= 56; | |||||
| accum1 >>= 56; | |||||
| accum2 = widemul(a[0], b[3]); | |||||
| accum1 += widemul(aa[0], bb[3]); | |||||
| accum0 += widemul(a[4], b[7]); | |||||
| accum2 += widemul(a[1], b[2]); | |||||
| accum1 += widemul(aa[1], bb[2]); | |||||
| accum0 += widemul(a[5], b[6]); | |||||
| accum2 += widemul(a[2], b[1]); | |||||
| accum1 += widemul(aa[2], bb[1]); | |||||
| accum0 += widemul(a[6], b[5]); | |||||
| accum2 += widemul(a[3], b[0]); | |||||
| accum1 += widemul(aa[3], bb[0]); | |||||
| accum0 += widemul(a[7], b[4]); | |||||
| accum1 -= accum2; | |||||
| accum0 += accum2; | |||||
| c[3] = ((uint64_t)(accum0)) & mask; | |||||
| c[7] = ((uint64_t)(accum1)) & mask; | |||||
| accum0 >>= 56; | |||||
| accum1 >>= 56; | |||||
| } /* !I_HATE_UNROLLED_LOOPS */ | |||||
| accum0 += accum1; | |||||
| accum0 += c[4]; | |||||
| accum1 += c[0]; | |||||
| c[4] = ((uint64_t)(accum0)) & mask; | |||||
| c[0] = ((uint64_t)(accum1)) & mask; | |||||
| accum0 >>= 56; | |||||
| accum1 >>= 56; | |||||
| c[5] += ((uint64_t)(accum0)); | |||||
| c[1] += ((uint64_t)(accum1)); | |||||
| for (; j<5; j++) { | |||||
| accum += widemul(bh[i-j+4], a[j]); | |||||
| } | |||||
| c[i] = accum & mask; | |||||
| accum >>= 51; | |||||
| } | |||||
| /* PERF: parallelize? eh well this is reference */ | |||||
| accum *= 19; | |||||
| accum += c[0]; | |||||
| c[0] = accum & mask; | |||||
| accum >>= 51; | |||||
| assert(accum < mask); | |||||
| c[1] += accum; | |||||
| } | } | ||||
| void | void | ||||
| @@ -188,27 +57,25 @@ p255_mulw ( | |||||
| const p255_t *as, | const p255_t *as, | ||||
| uint64_t b | uint64_t b | ||||
| ) { | ) { | ||||
| const uint64_t *a = as->limb; | |||||
| const uint64_t *a = as->limb, mask = ((1ull<<51)-1); | |||||
| int i; | |||||
| uint64_t *c = cs->limb; | uint64_t *c = cs->limb; | ||||
| __uint128_t accum0 = 0, accum4 = 0; | |||||
| uint64_t mask = (1ull<<56) - 1; | |||||
| int i; | |||||
| for (i=0; i<4; i++) { | |||||
| accum0 += widemul(b, a[i]); | |||||
| accum4 += widemul(b, a[i+4]); | |||||
| c[i] = accum0 & mask; accum0 >>= 56; | |||||
| c[i+4] = accum4 & mask; accum4 >>= 56; | |||||
| __uint128_t accum = 0; | |||||
| for (i=0; i<5; i++) { | |||||
| accum += widemul(b, a[i]); | |||||
| c[i] = accum & mask; | |||||
| accum >>= 51; | |||||
| } | } | ||||
| /* PERF: parallelize? eh well this is reference */ | |||||
| accum *= 19; | |||||
| accum += c[0]; | |||||
| c[0] = accum & mask; | |||||
| accum >>= 51; | |||||
| accum0 += accum4 + c[4]; | |||||
| c[4] = accum0 & mask; | |||||
| c[5] += accum0 >> 56; | |||||
| accum4 += c[0]; | |||||
| c[0] = accum4 & mask; | |||||
| c[1] += accum4 >> 56; | |||||
| assert(accum < mask); | |||||
| c[1] += accum; | |||||
| } | } | ||||
| void | void | ||||
| @@ -223,23 +90,21 @@ void | |||||
| p255_strong_reduce ( | p255_strong_reduce ( | ||||
| p255_t *a | p255_t *a | ||||
| ) { | ) { | ||||
| uint64_t mask = (1ull<<56)-1; | |||||
| uint64_t mask = (1ull<<51)-1; | |||||
| /* first, clear high */ | /* first, clear high */ | ||||
| a->limb[4] += a->limb[7]>>56; | |||||
| a->limb[0] += a->limb[7]>>56; | |||||
| a->limb[7] &= mask; | |||||
| a->limb[0] += (a->limb[4]>>51)*19; | |||||
| a->limb[4] &= mask; | |||||
| /* now the total is less than 2^255 - 2^(255-56) + 2^(255-56+8) < 2p */ | |||||
| /* now the total is less than 2p */ | |||||
| /* compute total_value - p. No need to reduce mod p. */ | /* compute total_value - p. No need to reduce mod p. */ | ||||
| __int128_t scarry = 0; | __int128_t scarry = 0; | ||||
| int i; | int i; | ||||
| for (i=0; i<8; i++) { | |||||
| scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask); | |||||
| for (i=0; i<5; i++) { | |||||
| scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask); | |||||
| a->limb[i] = scarry & mask; | a->limb[i] = scarry & mask; | ||||
| scarry >>= 56; | |||||
| scarry >>= 51; | |||||
| } | } | ||||
| /* uncommon case: it was >= p, so now scarry = 0 and this = x | /* uncommon case: it was >= p, so now scarry = 0 and this = x | ||||
| @@ -253,10 +118,10 @@ p255_strong_reduce ( | |||||
| __uint128_t carry = 0; | __uint128_t carry = 0; | ||||
| /* add it back */ | /* add it back */ | ||||
| for (i=0; i<8; i++) { | |||||
| carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask); | |||||
| for (i=0; i<5; i++) { | |||||
| carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask); | |||||
| a->limb[i] = carry & mask; | a->limb[i] = carry & mask; | ||||
| carry >>= 56; | |||||
| carry >>= 51; | |||||
| } | } | ||||
| assert(is_zero(carry + scarry)); | assert(is_zero(carry + scarry)); | ||||
| @@ -271,12 +136,13 @@ p255_serialize ( | |||||
| p255_t red; | p255_t red; | ||||
| p255_copy(&red, x); | p255_copy(&red, x); | ||||
| p255_strong_reduce(&red); | p255_strong_reduce(&red); | ||||
| for (i=0; i<8; i++) { | |||||
| for (j=0; j<7; j++) { | |||||
| serial[7*i+j] = red.limb[i]; | |||||
| red.limb[i] >>= 8; | |||||
| uint64_t *r = red.limb; | |||||
| uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; | |||||
| for (i=0; i<4; i++) { | |||||
| for (j=0; j<8; j++) { | |||||
| serial[8*i+j] = ser64[i]; | |||||
| ser64[i] >>= 8; | |||||
| } | } | ||||
| assert(red.limb[i] == 0); | |||||
| } | } | ||||
| } | } | ||||
| @@ -286,33 +152,27 @@ p255_deserialize ( | |||||
| const uint8_t serial[32] | const uint8_t serial[32] | ||||
| ) { | ) { | ||||
| int i,j; | int i,j; | ||||
| for (i=0; i<8; i++) { | |||||
| uint64_t ser64[4], mask = ((1ull<<51)-1); | |||||
| for (i=0; i<4; i++) { | |||||
| uint64_t out = 0; | uint64_t out = 0; | ||||
| for (j=0; j<7; j++) { | |||||
| out |= ((uint64_t)serial[7*i+j])<<(8*j); | |||||
| for (j=0; j<8; j++) { | |||||
| out |= ((uint64_t)serial[8*i+j])<<(8*j); | |||||
| } | } | ||||
| x->limb[i] = out; | |||||
| ser64[i] = out; | |||||
| } | } | ||||
| /* Check for reduction. | |||||
| * | |||||
| * The idea is to create a variable ge which is all ones (rather, 56 ones) | |||||
| * if and only if the low $i$ words of $x$ are >= those of p. | |||||
| * | |||||
| * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) | |||||
| */ | |||||
| uint64_t ge = -1, mask = (1ull<<56)-1; | |||||
| for (i=0; i<4; i++) { | |||||
| ge &= x->limb[i]; | |||||
| } | |||||
| /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ | |||||
| ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); | |||||
| /* Test for >= 2^255-19 */ | |||||
| uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64); | |||||
| ge &= ser64[1]; | |||||
| ge &= ser64[2]; | |||||
| ge &= (ser64[3]<<1) + 1; | |||||
| ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64); | |||||
| /* Propagate the rest */ | |||||
| for (i=5; i<8; i++) { | |||||
| ge &= x->limb[i]; | |||||
| } | |||||
| x->limb[0] = ser64[0] & mask; | |||||
| x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask; | |||||
| x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask; | |||||
| x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask; | |||||
| x->limb[4] = ser64[3]>>12; | |||||
| return ~is_zero(ge ^ mask); | |||||
| return ~is_zero(~ge); | |||||
| } | } | ||||
| @@ -15,7 +15,17 @@ typedef struct p255_t { | |||||
| } p255_t; | } p255_t; | ||||
| #define LBITS 51 | #define LBITS 51 | ||||
| #define FIELD_LITERAL(a,b,c,d,e) {{a,b,c,d,e}} | |||||
| #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }} | |||||
| /* | |||||
| #define FIELD_LITERAL(a,b,c,d) {{ \ | |||||
| (a##ull) & LMASK, \ | |||||
| ((a##ull)>>51 | (b##ull)<<13) & LMASK, \ | |||||
| ((b##ull)>>38 | (c##ull)<<26) & LMASK, \ | |||||
| ((c##ull)>>25 | (d##ull)<<39) & LMASK, \ | |||||
| (d##ull)>>12 \ | |||||
| }} | |||||
| */ | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| extern "C" { | extern "C" { | ||||
| @@ -140,9 +150,9 @@ p255_weak_reduce ( | |||||
| p255_t *a | p255_t *a | ||||
| ) { | ) { | ||||
| uint64_t mask = (1ull<<51) - 1; | uint64_t mask = (1ull<<51) - 1; | ||||
| uint64_t tmp = a->limb[5] >> 51; | |||||
| uint64_t tmp = a->limb[4] >> 51; | |||||
| int i; | int i; | ||||
| for (i=7; i>0; i--) { | |||||
| for (i=4; i>0; i--) { | |||||
| a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51); | a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51); | ||||
| } | } | ||||
| a->limb[0] = (a->limb[0] & mask) + tmp*19; | a->limb[0] = (a->limb[0] & mask) + tmp*19; | ||||
| @@ -10,58 +10,51 @@ | |||||
| #include "field.h" | #include "field.h" | ||||
| extern field_a_t ONE; // TODO | |||||
| static const field_a_t SQRT_MINUS_ONE = FIELD_LITERAL( // FIXME goes elsewhere? | |||||
| static const field_a_t SQRT_MINUS_ONE = {FIELD_LITERAL( // FIXME goes elsewhere? | |||||
| 0x61b274a0ea0b0, | 0x61b274a0ea0b0, | ||||
| 0x0d5a5fc8f189d, | 0x0d5a5fc8f189d, | ||||
| 0x7ef5e9cbd0c60, | 0x7ef5e9cbd0c60, | ||||
| 0x78595a6804c9e, | 0x78595a6804c9e, | ||||
| 0x2b8324804fc1d | 0x2b8324804fc1d | ||||
| ); | |||||
| )}; | |||||
| static const field_a_t ONE = {FIELD_LITERAL( // FIXME copy-pasted | |||||
| 1,0,0,0,0 | |||||
| )}; | |||||
| void | |||||
| field_isr ( | |||||
| field_a_t a, | |||||
| const field_a_t x | |||||
| ) { | |||||
| field_a_t st[3], tmp1, tmp2; | |||||
| const struct { unsigned char sh, idx } ops[] = { | |||||
| {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} | |||||
| }; | |||||
| field_cpy(st[0],x); | |||||
| field_cpy(st[1],x); | |||||
| field_cpy(st[2],x); | |||||
| // ARCH MAGIC FIXME copy-pasted from decaf_fast.c | |||||
| static mask_t gf_eq(const field_a_t a, const field_a_t b) { | |||||
| field_a_t c; | |||||
| field_sub(c,a,b); | |||||
| field_strong_reduce(c); | |||||
| mask_t ret=0; | |||||
| int i; | int i; | ||||
| for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) { | |||||
| field_sqrn(tmp1, st[1^i&1], ops[i].sh); | |||||
| field_mul(tmp2, tmp1, st[ops[i].idx]); | |||||
| field_cpy(st[i&1], tmp2); | |||||
| } | |||||
| mask_t m = field_eq(st[1], ONE); | |||||
| cond_sel(tmp1,SQRT_MINUS_ONE,ONE,m); | |||||
| field_mul(a,tmp1,st[0]); | |||||
| }; | |||||
| for (i=0; i<5; i++) { ret |= c->limb[i]; } | |||||
| return ((__uint128_t)ret - 1) >> 64; | |||||
| } | |||||
| /* Guarantee: a^2 x = 0 if x = 0; else a^2 x = 1 or SQRT_MINUS_ONE; */ | |||||
| void | void | ||||
| field_isr ( | field_isr ( | ||||
| field_a_t a, | field_a_t a, | ||||
| const field_a_t x | const field_a_t x | ||||
| ) { | ) { | ||||
| field_a_t st[3], tmp1, tmp2; | field_a_t st[3], tmp1, tmp2; | ||||
| const struct { unsigned char sh, idx } ops[] = { | |||||
| const struct { unsigned char sh, idx; } ops[] = { | |||||
| {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} | {1,2},{1,2},{3,1},{6,0},{1,2},{12,1},{25,1},{25,1},{50,0},{125,0},{2,2},{1,2} | ||||
| }; | }; | ||||
| field_cpy(st[0],x); | |||||
| field_cpy(st[1],x); | |||||
| field_cpy(st[2],x); | |||||
| int i; | |||||
| st[0][0] = st[1][0] = st[2][0] = x[0]; | |||||
| unsigned int i; | |||||
| for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) { | for (i=0; i<sizeof(ops)/sizeof(ops[0]); i++) { | ||||
| field_sqrn(tmp1, st[1^i&1], ops[i].sh); | field_sqrn(tmp1, st[1^i&1], ops[i].sh); | ||||
| field_mul(tmp2, tmp1, st[ops[i].idx]); | field_mul(tmp2, tmp1, st[ops[i].idx]); | ||||
| field_cpy(st[i&1], tmp2); | |||||
| st[i&1][0] = tmp2[0]; | |||||
| } | } | ||||
| mask_t m = field_eq(st[1], ONE); | |||||
| mask_t mask = gf_eq(st[1],ONE) | gf_eq(st[1],SQRT_MINUS_ONE); | |||||
| // ARCH MAGIC FIXME: should be cond_sel | |||||
| for (i=0; i<5; i++) tmp1->limb[i] = (ONE->limb[i] & mask) | |||||
| | (SQRT_MINUS_ONE->limb[i] & ~mask); | |||||
| field_mul(a,tmp1,st[0]); | |||||
| } | } | ||||