Faster scalarmul is ported from Goldilocks, modulo a bit of magic. Of course, it's the one that doesn't matter as much because we have the monty ladder. Next up, port wNAF and recomputation?

11 years ago · 7c8a2a72c5
--- a/src/decaf_fast.c
+++ b/src/decaf_fast.c
@@ -548,7 +548,11 @@ void decaf_448_point_add (
    gf_mul ( p->t, b, c );
 }

 void decaf_448_point_double(decaf_448_point_t p, const decaf_448_point_t q) {
 static void decaf_448_point_double_internal (
    decaf_448_point_t p,
    const decaf_448_point_t q,
    decaf_bool_t before_double
 ) {
    gf a, b, c, d;
    gf_sqr ( c, q->x );
    gf_sqr ( a, q->y );
@@ -563,8 +567,11 @@ void decaf_448_point_double(decaf_448_point_t p, const decaf_448_point_t q) {
    gf_mul ( p->x, a, b );
    gf_mul ( p->z, p->t, a );
    gf_mul ( p->y, p->t, d );
    /* TODO: conditional? */
    gf_mul ( p->t, b, d );
    if (!before_double) gf_mul ( p->t, b, d );
 }

 void decaf_448_point_double(decaf_448_point_t p, const decaf_448_point_t q) {
    decaf_448_point_double_internal(p,q,0);
 }

 void decaf_448_point_copy (
@@ -734,7 +741,8 @@ static void pniels_to_pt (

 static void add_niels_to_pt (
    decaf_448_point_t d,
    const niels_t e
    const niels_t e,
    decaf_bool_t before_double
 ) {
    gf a, b, c;
    gf_sub_nr ( b, d->y, d->x );
@@ -749,18 +757,18 @@ static void add_niels_to_pt (
    gf_mul ( d->z, a, d->y );
    gf_mul ( d->x, d->y, b );
    gf_mul ( d->y, a, c );
    /* TODO: if... */
    gf_mul ( d->t, b, c );
    if (!before_double) gf_mul ( d->t, b, c );
 }

 static void add_pniels_to_pt (
    decaf_448_point_t p,
    const pniels_t pn
    const pniels_t pn,
    decaf_bool_t before_double
 ) {
    gf L0;
    gf_mul ( L0, p->z, pn->z );
    gf_cpy ( p->z, L0 );
    add_niels_to_pt( p, pn->n );
    add_niels_to_pt( p, pn->n, before_double );
 }

 void decaf_448_point_scalarmul (
@@ -768,64 +776,73 @@ void decaf_448_point_scalarmul (
    const decaf_448_point_t b,
    const decaf_448_scalar_t scalar
 ) {
    const int WINDOW = 5,
    const int WINDOW = 5, /* PERF: Make 4 on non hugevector platforms? */
        WINDOW_MASK = (1<<WINDOW)-1,
        WINDOW_U_MASK = (1<<((448%WINDOW)-1))-1,
        WINDOW_T_MASK = WINDOW_MASK >> 1,
        NTABLE = 1<<(WINDOW-1);
    
    decaf_448_scalar_t scalar2, onehalf = {{{0}}}, arrr;
    /* Adjust the scalar to SABS window.  TODO: optimize, subroutinize */
    decaf_448_scalar_t scalar2, onehalf = {{{0}}}, two = {{{2}}}, arrr;
    onehalf->limb[SCALAR_WORDS-1] = 1ull<<(WBITS-1);
    
    /* FIXME PERF MAGIC precompute 2^449-1/2 mod q.  Could instead use 2^446-1/2 mod q though. */
    decaf_448_montmul(arrr,two,decaf_448_scalar_r2,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);

    /* PERF dedicated halve */
    decaf_448_scalar_sub(scalar2, scalar, decaf_448_scalar_one);
    decaf_448_montmul(scalar2,scalar2,onehalf,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);

    /* FIXME PERF precompute 2^447-1/2 mod q.  Could instead use 2^446-1/2 mod q though. */
    decaf_448_montmul(arrr,decaf_448_scalar_one,decaf_448_scalar_r2,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);
    decaf_448_montmul(arrr,arrr,onehalf,decaf_448_scalar_p,DECAF_MONTGOMERY_FACTOR);

    decaf_448_scalar_add(scalar2, scalar2, arrr);

    
    /* Set up a precomputed table with odd multiples of b. */
    pniels_t pn, multiples[NTABLE];
    decaf_448_point_t tmp;
    decaf_448_point_double(tmp, b);
    pt_to_pniels(pn, tmp);
    pt_to_pniels(multiples[0], b);
    decaf_448_point_double(a, b);
    pt_to_pniels(pn, a);
    decaf_448_point_copy(tmp, b);

    int i,j;
    for (i=1; i<NTABLE; i++) {
        add_pniels_to_pt(a, pn);
        pt_to_pniels(multiples[i], a);
        add_pniels_to_pt(tmp, pn, 0);
        pt_to_pniels(multiples[i], tmp);
    }

    i = 448 - (448 % WINDOW);
    int bits = scalar2->limb[i/WBITS] >> (i%WBITS),
        inv = (bits>>((448 % WINDOW)-1))-1;
    /* Initialize. */
    i = DECAF_448_SCALAR_BITS - ((DECAF_448_SCALAR_BITS-1) % WINDOW) - 1;
    int bits = scalar2->limb[i/WBITS] >> (i%WBITS) & WINDOW_MASK,
        inv = (bits>>(WINDOW-1))-1;
    bits ^= inv;
    
    constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_U_MASK);
    constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK);
    cond_neg_pniels(pn, inv);
    pniels_to_pt(a, pn);
    pniels_to_pt(tmp, pn);

    for (i-=WINDOW; i>=0; i-=WINDOW) {
        for (j=0; j<WINDOW; j++) {
            decaf_448_point_double(a, a);
        }

        /* Using Hisil et al's lookahead method instead of extensible here
         * for no particular reason.  Double WINDOW times, but only compute t on
         * the last one.
         */
        for (j=0; j<WINDOW-1; j++)
            decaf_448_point_double_internal(tmp, tmp, -1);
        decaf_448_point_double(tmp, tmp);

        /* Fetch another block of bits */
        bits = scalar2->limb[i/WBITS] >> (i%WBITS);
        
        if (i%WBITS >= WBITS-WINDOW) {
            bits ^= scalar2->limb[i/WBITS+1] << (WBITS - (i%WBITS));
        }
                
        bits &= WINDOW_MASK;
        inv = (bits>>(WINDOW-1))-1;
        bits ^= inv;
    
        /* Add in from table.  Compute t only on last iteration. */
        constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK);
        cond_neg_pniels(pn, inv);
        add_pniels_to_pt(a, pn);
        add_pniels_to_pt(tmp, pn, i ? -1 : 0);
    }
    
    /* Write out the answer */
    decaf_448_point_copy(a,tmp);
 }

 void decaf_448_point_double_scalarmul (