You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

1698 lines
47 KiB

  1. /* Copyright (c) 2015 Cryptography Research, Inc.
  2. * Released under the MIT License. See LICENSE.txt for license information.
  3. */
  4. /**
  5. * @file decaf.c
  6. * @author Mike Hamburg
  7. * @brief Decaf high-level functions.
  8. */
  9. #define _XOPEN_SOURCE 600 /* for posix_memalign */
  10. #define __STDC_WANT_LIB_EXT1__ 1 /* for memset_s */
  11. #include "decaf.h"
  12. #include <string.h>
  13. #include "field.h"
  14. #include "decaf_448_config.h"
  15. #define WBITS DECAF_WORD_BITS
  16. /* Rename table for eventual factoring into .c.inc, MSR ECC style */
  17. #define SCALAR_LIMBS DECAF_448_SCALAR_LIMBS
  18. #define SCALAR_BITS DECAF_448_SCALAR_BITS
  19. #define NLIMBS DECAF_448_LIMBS
  20. #define API_NS(_id) decaf_448_##_id
  21. #define API_NS2(_pref,_id) _pref##_decaf_448_##_id
  22. #define scalar_t decaf_448_scalar_t
  23. #define point_t decaf_448_point_t
  24. #define precomputed_s decaf_448_precomputed_s
  25. #define SER_BYTES DECAF_448_SER_BYTES
  26. #if WBITS == 64
  27. typedef __int128_t decaf_sdword_t;
  28. #define SC_LIMB(x) (x##ull)
  29. #elif WBITS == 32
  30. typedef int64_t decaf_sdword_t;
  31. #define SC_LIMB(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
  32. #else
  33. #error "Only supporting 32- and 64-bit platforms right now"
  34. #endif
  35. //static const int QUADRATIC_NONRESIDUE = -1;
  36. #define sv static void
  37. #define snv static void __attribute__((noinline))
  38. #define siv static inline void __attribute__((always_inline))
  39. static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}};
  40. static const int EDWARDS_D = -39081;
  41. static const scalar_t sc_p = {{{
  42. SC_LIMB(0x2378c292ab5844f3),
  43. SC_LIMB(0x216cc2728dc58f55),
  44. SC_LIMB(0xc44edb49aed63690),
  45. SC_LIMB(0xffffffff7cca23e9),
  46. SC_LIMB(0xffffffffffffffff),
  47. SC_LIMB(0xffffffffffffffff),
  48. SC_LIMB(0x3fffffffffffffff)
  49. }}};
  50. const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
  51. extern const scalar_t sc_r2;
  52. extern const decaf_word_t MONTGOMERY_FACTOR;
  53. /* sqrt(5) = 2phi-1 from the curve spec. Not exported, but used by pregen tool. */
  54. const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
  55. -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
  56. };
  57. extern const point_t API_NS(point_base);
  58. /* Projective Niels coordinates */
  59. typedef struct { gf a, b, c; } niels_s, niels_t[1];
  60. typedef struct { niels_t n; gf z; } pniels_s, pniels_t[1];
  61. /* Precomputed base */
  62. struct precomputed_s { niels_t table [DECAF_COMBS_N<<(DECAF_COMBS_T-1)]; };
  63. extern const field_t API_NS(precomputed_base_as_fe)[];
  64. const precomputed_s *API_NS(precomputed_base) =
  65. (const precomputed_s *) &API_NS(precomputed_base_as_fe);
  66. const size_t API_NS2(sizeof,precomputed_s) = sizeof(precomputed_s);
  67. const size_t API_NS2(alignof,precomputed_s) = 32;
  68. #ifdef __clang__
  69. #if 100*__clang_major__ + __clang_minor__ > 305
  70. #define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)")
  71. #endif
  72. #endif
  73. #ifndef VECTORIZE
  74. #define VECTORIZE
  75. #endif
  76. #define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++) { op; }}
  77. #define FOR_LIMB_V(i,op) { unsigned int i=0; VECTORIZE for (i=0; i<NLIMBS; i++) { op; }}
/** Copy x = y.  gf is a one-element array type, so this copies the whole struct. */
siv gf_cpy(gf x, const gf y) { x[0] = y[0]; }

/** Field multiply c = a*b: thin wrapper over the arch-specific field_mul.
 * Mostly-unoptimized multiply, but at least it's unrolled. */
siv gf_mul (gf c, const gf a, const gf b) {
    field_mul((field_t *)c, (const field_t *)a, (const field_t *)b);
}

/** Dedicated square c = a^2. */
siv gf_sqr (gf c, const gf a) {
    field_sqr((field_t *)c, (const field_t *)a);
}

/** Inverse square root y = 1/sqrt(x), using the field's addition chain. */
siv gf_isqrt(gf y, const gf x) {
    field_isr((field_t *)y, (const field_t *)x);
}
/** Inverse: y = 1/x, computed as x * (1/sqrt(x^2))^2.
 * TODO: adapt to 5-mod-8 fields?
 * NOTE(review): no zero check — result for x == 0 is whatever field_isr
 * produces; presumably callers avoid zero input — confirm. */
sv gf_invert(gf y, const gf x) {
    gf t1, t2;
    gf_sqr(t1, x); // o^2
    gf_isqrt(t2, t1); // +-1/sqrt(o^2) = +-1/o
    gf_sqr(t1, t2); // squaring removes the sign ambiguity: 1/o^2
    gf_mul(t2, t1, x); // not direct to y in case of alias.
    gf_cpy(y, t2);
}
/** Add mod p. Conservatively always weak-reduce. */
snv gf_add ( gf_s *__restrict__ c, const gf a, const gf b ) {
    field_add((field_t *)c, (const field_t *)a, (const field_t *)b);
}

/** Subtract mod p. Conservatively always weak-reduce. */
snv gf_sub ( gf c, const gf a, const gf b ) {
    field_sub((field_t *)c, (const field_t *)a, (const field_t *)b);
}

/** Bias c by amt multiples of p (keeps limbs nonnegative after subtraction). */
siv gf_bias ( gf c, int amt) {
    field_bias((field_t *)c, amt);
}

/** Subtract mod p. Bias by 2 and don't reduce. */
siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) {
    // FOR_LIMB_V(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
    ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
    field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
    gf_bias(c, 2);
    if (WBITS==32) field_weak_reduce((field_t*) c); // HACK FIXME: narrower limbs need the extra reduce
}

/** Subtract mod p. Bias by amt but don't reduce. */
siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) {
    ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
    field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
    gf_bias(c, amt);
    if (WBITS==32) field_weak_reduce((field_t*) c); // HACK FIXME
}

/** Add mod p. Don't reduce. */
siv gf_add_nr ( gf c, const gf a, const gf b ) {
    // FOR_LIMB_V(i, c->limb[i] = a->limb[i] + b->limb[i]);
    ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
    field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
}
/** Constant time, x = is_z ? z : y.
 * is_z must be an all-zeros or all-ones word mask. */
siv cond_sel(gf x, const gf y, const gf z, decaf_bool_t is_z) {
    big_register_t br_mask = br_set_to_mask(is_z);
    big_register_t *out = (big_register_t *)x;
    const big_register_t *y_ = (const big_register_t *)y, *z_ = (const big_register_t *)z;
    word_t k;
    /* Branch-free select, one big register at a time. */
    for (k=0; k<sizeof(gf)/sizeof(big_register_t); k++) {
        out[k] = (~br_mask & y_[k]) | (br_mask & z_[k]);
    }
    /*
    constant_time_select(x,z,y,sizeof(gf),is_z);
    */
}

/** Constant time, if (neg) x=-x;  neg is an all-zeros/all-ones mask. */
sv cond_neg(gf x, decaf_bool_t neg) {
    gf y;
    gf_sub(y,ZERO,x); /* always compute the negation... */
    cond_sel(x,x,y,neg); /* ...then select it without branching */
}

/** Constant time, if (swap) (x,y) = (y,x);  swap is an all-zeros/all-ones mask. */
siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
    FOR_LIMB_V(i, {
        decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
        x->limb[i] ^= s;
        y->limb[i] ^= s;
    });
}
/**
 * Mul by signed int: c = a*w. Not constant-time WRT the sign of that int.
 * Just uses a full mul (PERF)
 */
siv gf_mlw(gf c, const gf a, int w) {
    if (w>0) {
        field_mulw((field_t *)c, (const field_t *)a, w);
    } else {
        /* Multiply by |w|, then negate the product. */
        field_mulw((field_t *)c, (const field_t *)a, -w);
        gf_sub(c,ZERO,c);
    }
}

/** Canonicalize: strong-reduce a to its unique representative mod p. */
siv gf_canon ( gf a ) {
    field_strong_reduce((field_t *)a);
}
/** Compare a==b.  Returns an all-ones word mask if equal, 0 otherwise. */
static decaf_word_t __attribute__((noinline)) gf_eq(const gf a, const gf b) {
    gf c;
    gf_sub(c,a,b);
    gf_canon(c); /* canonical form is unique, so a==b iff every limb of c is 0 */
    decaf_word_t ret=0;
    FOR_LIMB(i, ret |= c->limb[i] );
    /* Hope the compiler is too dumb to optimize this, thus noinline */
    return ((decaf_dword_t)ret - 1) >> WBITS; /* all-ones iff ret==0 */
}
/** Inverse square root with validity check: y = 1/sqrt(x).
 * Returns an all-ones mask iff x*y^2 == 1 (i.e. x was a nonzero square),
 * or iff x == 0 when allow_zero is set. */
static decaf_bool_t gf_isqrt_chk(gf y, const gf x, decaf_bool_t allow_zero) {
    gf tmp0, tmp1;
    field_isr((field_t *)y, (const field_t *)x);
    gf_sqr(tmp0,y);
    gf_mul(tmp1,tmp0,x); /* x*y^2: equals 1 exactly when y is a true 1/sqrt(x) */
    return gf_eq(tmp1,ONE) | (allow_zero & gf_eq(tmp1,ZERO));
}

/** Return "high bit" of x = low bit of 2x mod p, as an all-ones/zero mask. */
static decaf_word_t hibit(const gf x) {
    gf y;
    gf_add(y,x,x); /* doubling maps the sign/high bit of x to the parity bit */
    gf_canon(y);
    return -(y->limb[0]&1);
}
/** {extra,accum} - sub +? p
 * Must have extra <= 1
 *
 * Constant-time: subtracts sub from the extended accumulator, then adds p
 * back in iff the subtraction (net of extra) borrowed.
 */
snv sc_subx(
    scalar_t out,
    const decaf_word_t accum[SCALAR_LIMBS],
    const scalar_t sub,
    const scalar_t p,
    decaf_word_t extra
) {
    decaf_sdword_t chain = 0;
    unsigned int i;
    /* out = accum - sub, propagating the signed borrow. */
    for (i=0; i<SCALAR_LIMBS; i++) {
        chain = (chain + accum[i]) - sub->limb[i];
        out->limb[i] = chain;
        chain >>= WBITS;
    }
    decaf_bool_t borrow = chain+extra; /* = 0 or -1 */
    chain = 0;
    /* Conditionally add p back (mask is all-ones on borrow). */
    for (i=0; i<SCALAR_LIMBS; i++) {
        chain = (chain + out->limb[i]) + (p->limb[i] & borrow);
        out->limb[i] = chain;
        chain >>= WBITS;
    }
}
/** Montgomery multiply: out = a*b/R mod sc_p, where R = 2^(WBITS*SCALAR_LIMBS).
 * Standard word-serial CIOS-style loop: accumulate a[i]*b, then cancel the
 * low limb with a multiple of sc_p and shift down. */
snv sc_montmul (
    scalar_t out,
    const scalar_t a,
    const scalar_t b
) {
    unsigned int i,j;
    decaf_word_t accum[SCALAR_LIMBS+1] = {0};
    decaf_word_t hi_carry = 0;
    for (i=0; i<SCALAR_LIMBS; i++) {
        decaf_word_t mand = a->limb[i];
        const decaf_word_t *mier = b->limb;
        /* accum += a[i] * b */
        decaf_dword_t chain = 0;
        for (j=0; j<SCALAR_LIMBS; j++) {
            chain += ((decaf_dword_t)mand)*mier[j] + accum[j];
            accum[j] = chain;
            chain >>= WBITS;
        }
        accum[j] = chain;
        /* Montgomery reduction step: choose mand so accum[0] becomes 0,
         * then add mand*p and shift the accumulator down one limb. */
        mand = accum[0] * MONTGOMERY_FACTOR;
        chain = 0;
        mier = sc_p->limb;
        for (j=0; j<SCALAR_LIMBS; j++) {
            chain += (decaf_dword_t)mand*mier[j] + accum[j];
            if (j) accum[j-1] = chain; /* shift down as we go; accum[0] is dropped */
            chain >>= WBITS;
        }
        chain += accum[j];
        chain += hi_carry;
        accum[j-1] = chain;
        hi_carry = chain >> WBITS;
    }
    /* Final conditional subtraction brings the result below sc_p. */
    sc_subx(out, accum, sc_p, sc_p, hi_carry);
}
/** out = a*b mod p.  Two Montgomery multiplies: the second (by R^2) undoes
 * the 1/R factor introduced by the first, leaving a plain product. */
void API_NS(scalar_mul) (
    scalar_t out,
    const scalar_t a,
    const scalar_t b
) {
    sc_montmul(out,a,b);
    sc_montmul(out,out,sc_r2);
}

/* Montgomery square. PERF: could implement this with a dedicated squaring. */
siv sc_montsqr (
    scalar_t out,
    const scalar_t a
) {
    sc_montmul(out,a,a);
}
/** Invert a scalar: out = 1/a mod p via a Fermat-style addition chain.
 * Returns an all-ones mask on success, 0 iff the result is zero (a == 0).
 * Runs in constant time: the chain is fixed, independent of a's value. */
decaf_bool_t API_NS(scalar_invert) (
    scalar_t out,
    const scalar_t a
) {
    /* FIELD MAGIC */
    scalar_t chain[7], tmp;
    sc_montmul(chain[0],a,sc_r2); /* chain[0] = a in Montgomery form */
    unsigned int i,j;
    /* Addition chain generated by a not-too-clever SAGE script. First part: compute a^(2^222-1).
     * Each entry: chain[widx] = (chain[sidx] squared sct times) * chain[midx]. */
    const struct { uint8_t widx, sidx, sct, midx; } muls [] = {
        {2,0,1,0}, {3,2,1,0}, {4,3,1,0}, {5,4,1,0}, /* 0x3,7,f,1f */
        {1,5,1,0}, {1,1,3,3}, {6,1,9,1}, {1,6,1,0}, {6,1,18,6}, /* a^(2^37-1) */
        {1,6,37,6}, {1,1,37,6}, {1,1,111,1} /* a^(2^222-1) */
    };
    /* Second part: sliding window. Each entry squares chain[1] sct times
     * and multiplies in chain[midx]. */
    const struct { uint8_t sct, midx; } muls1 [] = {
        {6, 5}, {4, 2}, {3, 0}, {2, 0}, {4, 0}, {8, 5},
        {2, 0}, {5, 3}, {4, 0}, {4, 0}, {5, 3}, {3, 2},
        {3, 2}, {3, 2}, {2, 0}, {3, 0}, {4, 2}, {2, 0},
        {4, 3}, {3, 2}, {2, 0}, {3, 2}, {5, 2}, {3, 2},
        {2, 0}, {3, 0}, {7, 0}, {5, 0}, {3, 2}, {3, 2},
        {4, 2}, {5, 0}, {5, 3}, {3, 0}, {2, 0}, {5, 2},
        {4, 3}, {4, 0}, {3, 2}, {7, 4}, {2, 0}, {2, 0},
        {2, 0}, {2, 0}, {3, 0}, {5, 2}, {5, 4}, {5, 2},
        {5, 0}, {2, 0}, {3, 0}, {3, 0}, {2, 0}, {2, 0},
        {2, 0}, {3, 2}, {2, 0}, {3, 2}, {5, 0}, {4, 0},
        {6, 4}, {4, 0}
    };
    for (i=0; i<sizeof(muls)/sizeof(muls[0]); i++) {
        sc_montsqr(tmp, chain[muls[i].sidx]);
        for (j=1; j<muls[i].sct; j++) {
            sc_montsqr(tmp, tmp);
        }
        sc_montmul(chain[muls[i].widx], tmp, chain[muls[i].midx]);
    }
    for (i=0; i<sizeof(muls1)/sizeof(muls1[0]); i++) {
        sc_montsqr(tmp, chain[1]);
        for (j=1; j<muls1[i].sct; j++) {
            sc_montsqr(tmp, tmp);
        }
        sc_montmul(chain[1], tmp, chain[muls1[i].midx]);
    }
    /* Multiplying by one de-Montgomerizes (divides out the R factor). */
    sc_montmul(out,chain[1],API_NS(scalar_one));
    /* Zeroize intermediate secrets. */
    for (i=0; i<sizeof(chain)/sizeof(chain[0]); i++) {
        API_NS(scalar_destroy)(chain[i]);
    }
    return ~API_NS(scalar_eq)(out,API_NS(scalar_zero));
}
/** out = a - b mod p (constant time). */
void API_NS(scalar_sub) (
    scalar_t out,
    const scalar_t a,
    const scalar_t b
) {
    sc_subx(out, a->limb, b, sc_p, 0);
}

/** out = a + b mod p (constant time). */
void API_NS(scalar_add) (
    scalar_t out,
    const scalar_t a,
    const scalar_t b
) {
    decaf_dword_t chain = 0;
    unsigned int i;
    /* Plain limbwise add with carry... */
    for (i=0; i<SCALAR_LIMBS; i++) {
        chain = (chain + a->limb[i]) + b->limb[i];
        out->limb[i] = chain;
        chain >>= WBITS;
    }
    /* ...then a conditional subtraction of p to reduce. */
    sc_subx(out, out->limb, sc_p, sc_p, chain);
}
/** out = a/2 mod p: if a is odd, add p first (making the sum even),
 * then shift right by one.  Constant time. */
snv sc_halve (
    scalar_t out,
    const scalar_t a,
    const scalar_t p
) {
    decaf_word_t mask = -(a->limb[0] & 1); /* all-ones iff a is odd */
    decaf_dword_t chain = 0;
    unsigned int i;
    for (i=0; i<SCALAR_LIMBS; i++) {
        chain = (chain + a->limb[i]) + (p->limb[i] & mask);
        out->limb[i] = chain;
        chain >>= WBITS;
    }
    /* Right-shift the whole number by one bit, top bit from the carry. */
    for (i=0; i<SCALAR_LIMBS-1; i++) {
        out->limb[i] = out->limb[i]>>1 | out->limb[i+1]<<(WBITS-1);
    }
    out->limb[i] = out->limb[i]>>1 | chain<<(WBITS-1);
}
/** Set a scalar to the small integer w. */
void API_NS(scalar_set) (
    scalar_t out,
    decaf_word_t w
) {
    memset(out,0,sizeof(scalar_t));
    out->limb[0] = w;
}

/** Constant-time equality: returns an all-ones mask iff a == b. */
decaf_bool_t API_NS(scalar_eq) (
    const scalar_t a,
    const scalar_t b
) {
    decaf_word_t diff = 0;
    unsigned int i;
    for (i=0; i<SCALAR_LIMBS; i++) {
        diff |= a->limb[i] ^ b->limb[i];
    }
    return (((decaf_dword_t)diff)-1)>>WBITS; /* all-ones iff diff==0 */
}
/* *** API begins here *** */

/** identity = (0,1), stored as extended homogeneous (X,Y,Z,T). */
const point_t API_NS(point_identity) = {{{{{0}}},{{{1}}},{{{1}}},{{{0}}}}};

/** Serialize a field element (strongly reduced) to bytes. */
static void gf_encode ( unsigned char ser[SER_BYTES], gf a ) {
    field_serialize(ser, (field_t *)a);
}

/** Encode a point to its canonical SER_BYTES Decaf representation.
 * Computes the coset representative's s-coordinate with the canonical
 * (low) sign, using a single inverse square root. */
void API_NS(point_encode)( unsigned char ser[SER_BYTES], const point_t p ) {
    /* Can shave off one mul here; not important but makes consistent with paper */
    gf a, b, c, d;
    gf_mlw ( a, p->y, 1-EDWARDS_D );
    gf_mul ( c, a, p->t );
    gf_mul ( a, p->x, p->z );
    gf_sub ( d, c, a );
    gf_add ( a, p->z, p->y );
    gf_sub ( b, p->z, p->y );
    gf_mul ( c, b, a );
    gf_mlw ( b, c, -EDWARDS_D );
    gf_isqrt ( a, b );
    gf_mlw ( b, a, -EDWARDS_D );
    gf_mul ( c, b, a );
    gf_mul ( a, c, d );
    gf_add ( d, b, b );
    gf_mul ( c, d, p->z );
    cond_neg ( b, ~hibit(c) ); /* pick the sign that makes c "positive" */
    gf_mul ( c, b, p->y );
    gf_add ( a, a, c );
    cond_neg ( a, hibit(a) ); /* canonicalize: encode the non-negative root */
    gf_encode(ser, a);
}
/**
 * Deserialize a field element; return TRUE iff the input was < p (canonical).
 */
static decaf_bool_t gf_deser(gf s, const unsigned char ser[SER_BYTES]) {
    return field_deserialize((field_t *)s, ser);
}

/** Decode a point from its canonical serialization.
 * Returns an all-ones mask on success.  Fails on non-canonical input
 * (>= p or "negative" s), non-square intermediates, or — unless
 * allow_identity — the encoding of the identity (s == 0).
 * NOTE: on failure p's fields still get written with garbage (see TODO). */
decaf_bool_t API_NS(point_decode) (
    point_t p,
    const unsigned char ser[SER_BYTES],
    decaf_bool_t allow_identity
) {
    gf s, a, b, c, d;
    decaf_bool_t succ = gf_deser(s, ser), zero = gf_eq(s, ZERO);
    succ &= allow_identity | ~zero;
    succ &= ~hibit(s); /* require the canonical, non-negative representative */
    gf_sqr ( a, s );
    gf_sub ( p->z, ONE, a );
    gf_sqr ( b, p->z );
    gf_mlw ( c, a, 4-4*EDWARDS_D );
    gf_add ( c, c, b );
    gf_mul ( b, c, a );
    succ &= gf_isqrt_chk ( d, b, DECAF_TRUE ); /* validity check lives here */
    gf_mul ( b, c, d );
    cond_neg ( d, hibit(b) );
    gf_add ( p->x, s, s );
    gf_mul ( c, d, s );
    gf_sub ( b, TWO, p->z );
    gf_mul ( a, b, c );
    gf_mul ( p->y,a,p->z );
    gf_mul ( p->t,p->x,a );
    p->y->limb[0] -= zero; /* s==0: nudge y so the result is the identity (0,1) */
    /* TODO: do something safe if ~succ? */
    return succ;
}
/** p = q - r.  Extended twisted-Edwards addition with r's niels-style terms
 * swapped to effect negation.  Statement order and the _nr (no-reduce)
 * variants are tuned to the field's headroom — do not reorder casually. */
void API_NS(point_sub) (
    point_t p,
    const point_t q,
    const point_t r
) {
    gf a, b, c, d;
    gf_sub_nr ( b, q->y, q->x );
    gf_sub_nr ( d, r->y, r->x );
    gf_add_nr ( c, r->y, r->x );
    gf_mul ( a, c, b );
    gf_add_nr ( b, q->y, q->x );
    gf_mul ( p->y, d, b );
    gf_mul ( b, r->t, q->t );
    gf_mlw ( p->x, b, 2-2*EDWARDS_D );
    gf_add_nr ( b, a, p->y );
    gf_sub_nr ( c, p->y, a );
    gf_mul ( a, q->z, r->z );
    gf_add_nr ( a, a, a );
    gf_sub_nr ( p->y, a, p->x );
    gf_add_nr ( a, a, p->x );
    gf_mul ( p->z, a, p->y );
    gf_mul ( p->x, p->y, c );
    gf_mul ( p->y, a, b );
    gf_mul ( p->t, b, c );
}

/** p = q + r.  Same schedule as point_sub with the roles of the
 * (y-x)/(y+x) terms and the two inner add/sub pairs exchanged. */
void API_NS(point_add) (
    point_t p,
    const point_t q,
    const point_t r
) {
    gf a, b, c, d;
    gf_sub_nr ( b, q->y, q->x );
    gf_sub_nr ( c, r->y, r->x );
    gf_add_nr ( d, r->y, r->x );
    gf_mul ( a, c, b );
    gf_add_nr ( b, q->y, q->x );
    gf_mul ( p->y, d, b );
    gf_mul ( b, r->t, q->t );
    gf_mlw ( p->x, b, 2-2*EDWARDS_D );
    gf_add_nr ( b, a, p->y );
    gf_sub_nr ( c, p->y, a );
    gf_mul ( a, q->z, r->z );
    gf_add_nr ( a, a, a );
    gf_add_nr ( p->y, a, p->x );
    gf_sub_nr ( a, a, p->x );
    gf_mul ( p->z, a, p->y );
    gf_mul ( p->x, p->y, c );
    gf_mul ( p->y, a, b );
    gf_mul ( p->t, b, c );
}
/** p = 2q.  Extended-coordinate doubling.  When before_double is set, the
 * t-coordinate is skipped (valid only if the next operation is another
 * double, which does not read t). */
snv point_double_internal (
    point_t p,
    const point_t q,
    decaf_bool_t before_double
) {
    gf a, b, c, d;
    gf_sqr ( c, q->x );
    gf_sqr ( a, q->y );
    gf_add_nr ( d, c, a );
    gf_add_nr ( p->t, q->y, q->x );
    gf_sqr ( b, p->t );
    gf_sub_nr_x ( b, b, d, 3 ); /* extra bias: b may be up to 3p below zero */
    gf_sub_nr ( p->t, a, c );
    gf_sqr ( p->x, q->z );
    gf_add_nr ( p->z, p->x, p->x );
    gf_sub_nr_x ( a, p->z, p->t, 4 );
    gf_mul ( p->x, a, b );
    gf_mul ( p->z, p->t, a );
    gf_mul ( p->y, p->t, d );
    if (!before_double) gf_mul ( p->t, b, d );
}

/** Public doubling: always computes the full (x,y,z,t) result. */
void API_NS(point_double)(point_t p, const point_t q) {
    point_double_internal(p,q,0);
}
  517. void API_NS(point_negate) (
  518. point_t nega,
  519. const point_t a
  520. ) {
  521. gf_sub(nega->x, ZERO, a->x);
  522. gf_cpy(nega->y, a->y);
  523. gf_cpy(nega->z, a->z);
  524. gf_sub(nega->t, ZERO, a->t);
  525. }
/** Little-endian decode of up to nbytes bytes into a scalar's limbs;
 * remaining high limbs are zeroed.  No reduction is performed. */
siv scalar_decode_short (
    scalar_t s,
    const unsigned char ser[SER_BYTES],
    unsigned int nbytes
) {
    unsigned int i,j,k=0;
    for (i=0; i<SCALAR_LIMBS; i++) {
        decaf_word_t out = 0;
        for (j=0; j<sizeof(decaf_word_t) && k<nbytes; j++,k++) {
            out |= ((decaf_word_t)ser[k])<<(8*j);
        }
        s->limb[i] = out;
    }
}

/** Decode a serialized scalar and reduce mod p.
 * Returns an all-ones mask iff the input was already canonical (< p). */
decaf_bool_t API_NS(scalar_decode)(
    scalar_t s,
    const unsigned char ser[SER_BYTES]
) {
    unsigned int i;
    scalar_decode_short(s, ser, SER_BYTES);
    decaf_sdword_t accum = 0;
    /* Constant-time trial subtraction: accum ends -1 iff s < p. */
    for (i=0; i<SCALAR_LIMBS; i++) {
        accum = (accum + s->limb[i] - sc_p->limb[i]) >> WBITS;
    }
    API_NS(scalar_mul)(s,s,API_NS(scalar_one)); /* ham-handed reduce */
    return accum;
}
/** Securely zeroize a buffer.  Uses memset_s where available (C11 Annex K);
 * otherwise writes through a volatile pointer so the stores cannot be
 * optimized away, word-at-a-time once aligned. */
void decaf_bzero (
    void *s,
    size_t size
) {
#ifdef __STDC_LIB_EXT1__
    memset_s(s, size, 0, size);
#else
    const size_t sw = sizeof(decaf_word_t);
    volatile uint8_t *destroy = (volatile uint8_t *)s;
    /* Byte-wise until word-aligned... */
    for (; size && ((uintptr_t)destroy)%sw; size--, destroy++)
        *destroy = 0;
    /* ...word-wise through the bulk... */
    for (; size >= sw; size -= sw, destroy += sw)
        *(volatile decaf_word_t *)destroy = 0;
    /* ...byte-wise for the tail. */
    for (; size; size--, destroy++)
        *destroy = 0;
#endif
}

/** Zeroize a scalar (e.g. after use as secret key material). */
void API_NS(scalar_destroy) (
    scalar_t scalar
) {
    decaf_bzero(scalar, sizeof(scalar_t));
}

/** Explicitly discard a return value (silences warn-unused-result). */
static inline void ignore_result ( decaf_bool_t boo ) {
    (void)boo;
}
/** Decode an arbitrary-length little-endian byte string and reduce mod p.
 * Processes SER_BYTES-sized chunks from the most-significant (partial) chunk
 * downward, multiplying by R (via sc_montmul with sc_r2) between chunks. */
void API_NS(scalar_decode_long)(
    scalar_t s,
    const unsigned char *ser,
    size_t ser_len
) {
    if (ser_len == 0) {
        API_NS(scalar_copy)(s, API_NS(scalar_zero));
        return;
    }
    size_t i;
    scalar_t t1, t2;
    /* i = offset of the highest (possibly partial) chunk. */
    i = ser_len - (ser_len%SER_BYTES);
    if (i==ser_len) i -= SER_BYTES;
    scalar_decode_short(t1, &ser[i], ser_len-i);
    if (ser_len == sizeof(scalar_t)) {
        assert(i==0);
        /* ham-handed reduce */
        API_NS(scalar_mul)(s,t1,API_NS(scalar_one));
        API_NS(scalar_destroy)(t1);
        return;
    }
    while (i) {
        i -= SER_BYTES;
        sc_montmul(t1,t1,sc_r2); /* t1 *= 2^(8*SER_BYTES) mod p (shift up a chunk) */
        ignore_result( API_NS(scalar_decode)(t2, ser+i) ); /* canonicality irrelevant here */
        API_NS(scalar_add)(t1, t1, t2);
    }
    API_NS(scalar_copy)(s, t1);
    API_NS(scalar_destroy)(t1);
    API_NS(scalar_destroy)(t2);
}

/** Serialize a scalar to little-endian bytes. */
void API_NS(scalar_encode)(
    unsigned char ser[SER_BYTES],
    const scalar_t s
) {
    unsigned int i,j,k=0;
    for (i=0; i<SCALAR_LIMBS; i++) {
        for (j=0; j<sizeof(decaf_word_t); j++,k++) {
            ser[k] = s->limb[i] >> (8*j);
        }
    }
}
/* Operations on [p]niels */

/** Constant-time conditional negation of a niels point:
 * swap the (y-x)/(y+x) terms and negate the t-term. */
siv cond_neg_niels (
    niels_t n,
    decaf_bool_t neg
) {
    cond_swap(n->a, n->b, neg);
    cond_neg(n->c, neg);
}

/** Convert a point to projective niels form:
 * (a,b,c;z) = (y-x, y+x, (2d-2)*t; 2z). */
static void pt_to_pniels (
    pniels_t b,
    const point_t a
) {
    gf_sub ( b->n->a, a->y, a->x );
    gf_add ( b->n->b, a->x, a->y );
    gf_mlw ( b->n->c, a->t, 2*EDWARDS_D-2 );
    gf_add ( b->z, a->z, a->z );
}

/** Convert a projective niels point back to extended coordinates. */
static void pniels_to_pt (
    point_t e,
    const pniels_t d
) {
    gf eu;
    gf_add ( eu, d->n->b, d->n->a ); /* = 2y (up to the z scaling) */
    gf_sub ( e->y, d->n->b, d->n->a ); /* = 2x */
    gf_mul ( e->t, e->y, eu);
    gf_mul ( e->x, d->z, e->y );
    gf_mul ( e->y, d->z, eu );
    gf_sqr ( e->z, d->z );
}

/** Convert an affine niels point (implicit z = 1) to extended coordinates. */
snv niels_to_pt (
    point_t e,
    const niels_t n
) {
    gf_add ( e->y, n->b, n->a );
    gf_sub ( e->x, n->b, n->a );
    gf_mul ( e->t, e->y, e->x );
    gf_cpy ( e->z, ONE );
}
/** d += e (mixed addition with an affine niels point).
 * If before_double is set, skip computing d->t (the next op must be a
 * doubling, which ignores t). */
snv add_niels_to_pt (
    point_t d,
    const niels_t e,
    decaf_bool_t before_double
) {
    gf a, b, c;
    gf_sub_nr ( b, d->y, d->x );
    gf_mul ( a, e->a, b );
    gf_add_nr ( b, d->x, d->y );
    gf_mul ( d->y, e->b, b );
    gf_mul ( d->x, e->c, d->t );
    gf_add_nr ( c, a, d->y );
    gf_sub_nr ( b, d->y, a );
    gf_sub_nr ( d->y, d->z, d->x );
    gf_add_nr ( a, d->x, d->z );
    gf_mul ( d->z, a, d->y );
    gf_mul ( d->x, d->y, b );
    gf_mul ( d->y, a, c );
    if (!before_double) gf_mul ( d->t, b, c );
}

/** d -= e.  Same schedule as add_niels_to_pt with e->a/e->b exchanged
 * and the inner add/sub pair swapped (i.e. adds the negated point). */
snv sub_niels_from_pt (
    point_t d,
    const niels_t e,
    decaf_bool_t before_double
) {
    gf a, b, c;
    gf_sub_nr ( b, d->y, d->x );
    gf_mul ( a, e->b, b );
    gf_add_nr ( b, d->x, d->y );
    gf_mul ( d->y, e->a, b );
    gf_mul ( d->x, e->c, d->t );
    gf_add_nr ( c, a, d->y );
    gf_sub_nr ( b, d->y, a );
    gf_add_nr ( d->y, d->z, d->x );
    gf_sub_nr ( a, d->z, d->x );
    gf_mul ( d->z, a, d->y );
    gf_mul ( d->x, d->y, b );
    gf_mul ( d->y, a, c );
    if (!before_double) gf_mul ( d->t, b, c );
}

/** p += pn (projective niels): fold pn's z into p, then do mixed add. */
sv add_pniels_to_pt (
    point_t p,
    const pniels_t pn,
    decaf_bool_t before_double
) {
    gf L0;
    gf_mul ( L0, p->z, pn->z );
    gf_cpy ( p->z, L0 );
    add_niels_to_pt( p, pn->n, before_double );
}

/** p -= pn (projective niels): fold pn's z into p, then do mixed sub. */
sv sub_pniels_from_pt (
    point_t p,
    const pniels_t pn,
    decaf_bool_t before_double
) {
    gf L0;
    gf_mul ( L0, p->z, pn->z );
    gf_cpy ( p->z, L0 );
    sub_niels_from_pt( p, pn->n, before_double );
}
extern const scalar_t API_NS(point_scalarmul_adjustment);

/* TODO: get rid of big_register_t dependencies? */
/** Constant-time table lookup: copy entry idx of n_table elem_bytes-sized
 * entries from table_ into out_, scanning every entry and masking so the
 * access pattern is independent of idx.  elem_bytes must be a multiple of
 * sizeof(big_register_t). */
siv constant_time_lookup_xx (
    void *__restrict__ out_,
    const void *table_,
    decaf_word_t elem_bytes,
    decaf_word_t n_table,
    decaf_word_t idx
) {
    big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
    big_register_t *out = (big_register_t *)out_;
    const unsigned char *table = (const unsigned char *)table_;
    word_t j,k;
    /* First entry initializes out (mask is all-ones iff idx==0)... */
    big_register_t br_mask = br_is_zero(big_i);
    for (k=0; k<elem_bytes/sizeof(big_register_t); k++)
        out[k] = br_mask & *(const big_register_t*)(&table[k*sizeof(big_register_t)]);
    big_i-=big_one;
    /* ...remaining entries OR in under their masks. */
    for (j=1; j<n_table; j++, big_i-=big_one) {
        br_mask = br_is_zero(big_i);
        for (k=0; k<elem_bytes/sizeof(big_register_t); k++) {
            out[k] |= br_mask & *(const big_register_t*)(&table[k*sizeof(big_register_t)+j*elem_bytes]);
        }
    }
}

/** Fill multiples[i] with (2i+1)*b in projective niels form, i = 0..ntable-1
 * (the odd multiples b, 3b, 5b, ... used by the fixed-window ladders). */
snv prepare_fixed_window(
    pniels_t *multiples,
    const point_t b,
    int ntable
) {
    point_t tmp;
    pniels_t pn;
    int i;
    point_double_internal(tmp, b, 0); /* pn = 2b, the stride between entries */
    pt_to_pniels(pn, tmp);
    pt_to_pniels(multiples[0], b);
    API_NS(point_copy)(tmp, b);
    for (i=1; i<ntable; i++) {
        add_pniels_to_pt(tmp, pn, 0);
        pt_to_pniels(multiples[i], tmp);
    }
}
/** a = scalar * b, constant time.  Signed fixed-window method: the scalar is
 * pre-adjusted and halved so every window digit is odd, letting the table
 * hold only odd multiples; digits are sign-flipped via mask rather than
 * branch. */
void API_NS(point_scalarmul) (
    point_t a,
    const point_t b,
    const scalar_t scalar
) {
    const int WINDOW = DECAF_WINDOW_BITS,
        WINDOW_MASK = (1<<WINDOW)-1,
        WINDOW_T_MASK = WINDOW_MASK >> 1,
        NTABLE = 1<<(WINDOW-1);
    scalar_t scalar1x;
    /* Adjust so the signed-digit recoding below is exact. */
    API_NS(scalar_add)(scalar1x, scalar, API_NS(point_scalarmul_adjustment));
    sc_halve(scalar1x,scalar1x,sc_p);
    /* Set up a precomputed table with odd multiples of b. */
    pniels_t pn, multiples[NTABLE];
    point_t tmp;
    prepare_fixed_window(multiples, b, NTABLE);
    /* Initialize. */
    int i,j,first=1;
    i = SCALAR_BITS - ((SCALAR_BITS-1) % WINDOW) - 1;
    for (; i>=0; i-=WINDOW) {
        /* Fetch another block of bits */
        decaf_word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS);
        if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
            /* Window straddles a limb boundary: pull in the high part. */
            bits ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
        }
        bits &= WINDOW_MASK;
        decaf_word_t inv = (bits>>(WINDOW-1))-1; /* all-ones iff digit is "negative" */
        bits ^= inv; /* take absolute value of the signed digit */
        /* Add in from table. Compute t only on last iteration. */
        constant_time_lookup_xx(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK);
        cond_neg_niels(pn->n, inv);
        if (first) {
            pniels_to_pt(tmp, pn);
            first = 0;
        } else {
            /* Using Hisil et al's lookahead method instead of extensible here
             * for no particular reason. Double WINDOW times, but only compute t on
             * the last one.
             */
            for (j=0; j<WINDOW-1; j++)
                point_double_internal(tmp, tmp, -1);
            point_double_internal(tmp, tmp, 0);
            add_pniels_to_pt(tmp, pn, i ? -1 : 0);
        }
    }
    /* Write out the answer */
    API_NS(point_copy)(a,tmp);
}
/** a = scalarb*b + scalarc*c, constant time (Shamir's trick / joint form):
 * one pass of signed fixed windows over both scalars, sharing the doublings. */
void API_NS(point_double_scalarmul) (
    point_t a,
    const point_t b,
    const scalar_t scalarb,
    const point_t c,
    const scalar_t scalarc
) {
    const int WINDOW = DECAF_WINDOW_BITS,
        WINDOW_MASK = (1<<WINDOW)-1,
        WINDOW_T_MASK = WINDOW_MASK >> 1,
        NTABLE = 1<<(WINDOW-1);
    scalar_t scalar1x, scalar2x;
    /* Recode both scalars for odd signed digits (see point_scalarmul). */
    API_NS(scalar_add)(scalar1x, scalarb, API_NS(point_scalarmul_adjustment));
    sc_halve(scalar1x,scalar1x,sc_p);
    API_NS(scalar_add)(scalar2x, scalarc, API_NS(point_scalarmul_adjustment));
    sc_halve(scalar2x,scalar2x,sc_p);
    /* Set up a precomputed table with odd multiples of b. */
    pniels_t pn, multiples1[NTABLE], multiples2[NTABLE];
    point_t tmp;
    prepare_fixed_window(multiples1, b, NTABLE);
    prepare_fixed_window(multiples2, c, NTABLE);
    /* Initialize. */
    int i,j,first=1;
    i = SCALAR_BITS - ((SCALAR_BITS-1) % WINDOW) - 1;
    for (; i>=0; i-=WINDOW) {
        /* Fetch another block of bits */
        decaf_word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
            bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
        if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
            bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
            bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
        }
        bits1 &= WINDOW_MASK;
        bits2 &= WINDOW_MASK;
        decaf_word_t inv1 = (bits1>>(WINDOW-1))-1;
        decaf_word_t inv2 = (bits2>>(WINDOW-1))-1;
        bits1 ^= inv1;
        bits2 ^= inv2;
        /* Add in from table. Compute t only on last iteration. */
        constant_time_lookup_xx(pn, multiples1, sizeof(pn), NTABLE, bits1 & WINDOW_T_MASK);
        cond_neg_niels(pn->n, inv1);
        if (first) {
            pniels_to_pt(tmp, pn);
            first = 0;
        } else {
            /* Using Hisil et al's lookahead method instead of extensible here
             * for no particular reason. Double WINDOW times, but only compute t on
             * the last one.
             */
            for (j=0; j<WINDOW-1; j++)
                point_double_internal(tmp, tmp, -1);
            point_double_internal(tmp, tmp, 0);
            add_pniels_to_pt(tmp, pn, 0);
        }
        constant_time_lookup_xx(pn, multiples2, sizeof(pn), NTABLE, bits2 & WINDOW_T_MASK);
        cond_neg_niels(pn->n, inv2);
        add_pniels_to_pt(tmp, pn, i?-1:0);
    }
    /* Write out the answer */
    API_NS(point_copy)(a,tmp);
}
  868. decaf_bool_t API_NS(point_eq) ( const point_t p, const point_t q ) {
  869. /* equality mod 2-torsion compares x/y */
  870. gf a, b;
  871. gf_mul ( a, p->y, q->x );
  872. gf_mul ( b, q->y, p->x );
  873. return gf_eq(a,b);
  874. }
/** Elligator-style map from a (hash-derived) field element to a point.
 * Returns a hint byte recording the branch choices needed to invert the map:
 * bit0 = ~square, bit1 = sign of t/s, bit2 = sign of r0, bit3 = input was >= p. */
unsigned char API_NS(point_from_hash_nonuniform) (
    point_t p,
    const unsigned char ser[SER_BYTES]
) {
    gf r0,r,a,b,c,dee,D,N,rN,e;
    decaf_bool_t over = ~gf_deser(r0,ser); /* input not canonical (>= p)? */
    decaf_bool_t sgn_r0 = hibit(r0);
    gf_canon(r0);
    gf_sqr(a,r0);
    gf_sub(r,ZERO,a); /*gf_mlw(r,a,QUADRATIC_NONRESIDUE);*/
    gf_mlw(dee,ONE,EDWARDS_D);
    gf_mlw(c,r,EDWARDS_D);
    /* Compute D := (dr+a-d)(dr-ar-d) with a=1 */
    gf_sub(a,c,dee);
    gf_add(a,a,ONE);
    decaf_bool_t special_identity_case = gf_eq(a,ZERO);
    gf_sub(b,c,r);
    gf_sub(b,b,dee);
    gf_mul(D,a,b);
    /* compute N := (r+1)(a-2d) */
    gf_add(a,r,ONE);
    gf_mlw(N,a,1-2*EDWARDS_D);
    /* e = +-1/sqrt(+-ND) */
    gf_mul(rN,r,N);
    gf_mul(a,rN,D);
    decaf_bool_t square = gf_isqrt_chk(e,a,DECAF_FALSE);
    decaf_bool_t r_is_zero = gf_eq(r,ZERO);
    square |= r_is_zero;
    square |= special_identity_case; /* degenerate cases count as "square" */
    /* b <- t/s */
    cond_sel(c,r0,r,square); /* r? = sqr ? r : 1 */
    /* In two steps to avoid overflow on 32-bit arch */
    gf_mlw(a,c,1-2*EDWARDS_D);
    gf_mlw(b,a,1-2*EDWARDS_D);
    gf_sub(c,r,ONE);
    gf_mul(a,b,c); /* = r? * (r-1) * (a-2d)^2 with a=1 */
    gf_mul(b,a,e);
    cond_neg(b,~square);
    cond_sel(c,r0,ONE,square);
    gf_mul(a,e,c);
    gf_mul(c,a,D); /* 1/s except for sign. FUTURE: simplify using this. */
    gf_sub(b,b,c);
    /* a <- s = e * N * (sqr ? r : r0)
     * e^2 r N D = 1
     * 1/s = 1/(e * N * (sqr ? r : r0)) = e * D * (sqr ? 1 : r0)
     */
    gf_mul(a,N,r0);
    cond_sel(rN,a,rN,square);
    gf_mul(a,rN,e);
    gf_mul(c,a,b);
    /* Normalize/negate */
    decaf_bool_t neg_s = hibit(a)^~square;
    cond_neg(a,neg_s); /* ends up negative if ~square */
    decaf_bool_t sgn_t_over_s = hibit(b)^neg_s;
    sgn_t_over_s &= ~gf_eq(N,ZERO); /* sign is meaningless when N==0... */
    sgn_t_over_s |= gf_eq(D,ZERO); /* ...and forced when D==0 */
    /* b <- t */
    cond_sel(b,c,ONE,gf_eq(c,ZERO)); /* 0,0 -> 1,0 */
    /* isogenize: map (s,t) on the Jacobi quartic to the Edwards curve */
    gf_sqr(c,a); /* s^2 */
    gf_add(a,a,a); /* 2s */
    gf_add(e,c,ONE);
    gf_mul(p->t,a,e); /* 2s(1+s^2) */
    gf_mul(p->x,a,b); /* 2st */
    gf_sub(a,ONE,c);
    gf_mul(p->y,e,a); /* (1+s^2)(1-s^2) */
    gf_mul(p->z,a,b); /* (1-s^2)t */
    return (~square & 1) | (sgn_t_over_s & 2) | (sgn_r0 & 4) | (over & 8);
}
/**
 * Invert point_from_hash_nonuniform: recover a field-element preimage of p.
 * The hint's low three bits select among the possible preimages
 * (bit 0: sign of s; bit 1: sign of t/s; bit 2: sign of the recovered
 * element), matching the bits the forward map returns.
 *
 * Returns DECAF_TRUE iff a preimage with the hinted signs exists;
 * recovered_hash is written either way.  Constant time.
 */
decaf_bool_t
API_NS(invert_elligator_nonuniform) (
    unsigned char recovered_hash[DECAF_448_SER_BYTES],
    const point_t p,
    unsigned char hint
) {
    /* Stretch each hint bit into an all-ones / all-zeros mask */
    decaf_bool_t sgn_s = -(hint & 1),
        sgn_t_over_s = -(hint>>1 & 1),
        sgn_r0 = -(hint>>2 & 1);
    gf a, b, c, d;

    /* Un-isogenize: recover the Jacobi-quartic coordinates from (X:Y:Z:T) */
    gf_mlw ( a, p->y, 1-EDWARDS_D );
    gf_mul ( c, a, p->t );
    gf_mul ( a, p->x, p->z );
    gf_sub ( d, c, a );

    gf_add ( a, p->z, p->y );
    gf_sub ( b, p->z, p->y );
    gf_mul ( c, b, a );
    gf_mlw ( b, c, -EDWARDS_D );
    gf_isqrt ( a, b );
    gf_mlw ( b, a, -EDWARDS_D );
    gf_mul ( c, b, a );
    gf_mul ( a, c, d );

    gf_add ( d, b, b );
    gf_mul ( c, d, p->z );
    /* Fix the signs of b and c together, keyed off c's current high bit
     * (c is unchanged between the two cond_neg calls, so the mask agrees) */
    cond_neg ( b, sgn_t_over_s^~hibit(c) );
    cond_neg ( c, sgn_t_over_s^~hibit(c) );
    gf_mul ( d, b, p->y );
    gf_add ( a, a, d );
    cond_neg( a, hibit(a)^sgn_s);

    /* ok, s = a; c = -t/s */
    gf_mul(b,c,a);
    gf_sub(b,ONE,b); /* t+1 */
    gf_sqr(c,a);     /* s^2 */

    { /* identity adjustments */
        /* in case of identity, currently c=0, t=0, b=1, will encode to 1 */
        /* if hint is 0, -> 0 */
        /* if hint is to neg t/s, then go to infinity, effectively set s to 1 */
        decaf_bool_t is_identity = gf_eq(p->x,ZERO);
        cond_sel(c,c,ONE,is_identity & sgn_t_over_s);
        cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */
    }

    gf_mlw(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */
    gf_add(a,b,d); /* num? */
    gf_sub(d,b,d); /* den? */
    gf_mul(b,a,d); /* n*d */
    cond_sel(a,d,a,sgn_s);
    /* The candidate must be square for a preimage to exist */
    decaf_bool_t succ = gf_isqrt_chk(c,b,DECAF_TRUE);
    gf_mul(b,a,c);
    cond_neg(b, sgn_r0^hibit(b));

    /* A zero preimage cannot carry a negated-sign hint */
    succ &= ~(gf_eq(b,ZERO) & sgn_r0);
    gf_encode(recovered_hash, b);
    /* TODO: deal with overflow flag */
    return succ;
}
  998. unsigned char API_NS(point_from_hash_uniform) (
  999. point_t pt,
  1000. const unsigned char hashed_data[2*SER_BYTES]
  1001. ) {
  1002. point_t pt2;
  1003. unsigned char ret1 =
  1004. API_NS(point_from_hash_nonuniform)(pt,hashed_data);
  1005. unsigned char ret2 =
  1006. API_NS(point_from_hash_nonuniform)(pt2,&hashed_data[SER_BYTES]);
  1007. API_NS(point_add)(pt,pt,pt2);
  1008. return ret1 | (ret2<<4);
  1009. }
  1010. decaf_bool_t
  1011. API_NS(invert_elligator_uniform) (
  1012. unsigned char partial_hash[2*SER_BYTES],
  1013. const point_t p,
  1014. unsigned char hint
  1015. ) {
  1016. point_t pt2;
  1017. API_NS(point_from_hash_nonuniform)(pt2,&partial_hash[SER_BYTES]);
  1018. API_NS(point_sub)(pt2,p,pt2);
  1019. return API_NS(invert_elligator_nonuniform)(partial_hash,pt2,hint);
  1020. }
/**
 * Validity check for an extended-coordinate point.  Requires the
 * cross-coordinate consistency X*Y == Z*T, the (projective) equation
 * Y^2 - X^2 == Z^2 - (1-d)*T^2, and Z != 0.
 * Returns an all-ones mask if valid, zero otherwise.  Constant time.
 */
decaf_bool_t API_NS(point_valid) (
    const point_t p
) {
    gf a,b,c;
    gf_mul(a,p->x,p->y);
    gf_mul(b,p->z,p->t);
    decaf_bool_t out = gf_eq(a,b);   /* X*Y == Z*T */
    gf_sqr(a,p->x);
    gf_sqr(b,p->y);
    gf_sub(a,b,a);                   /* a = Y^2 - X^2 */
    gf_sqr(b,p->t);
    gf_mlw(c,b,1-EDWARDS_D);         /* c = (1-d)*T^2 */
    gf_sqr(b,p->z);
    gf_sub(b,b,c);                   /* b = Z^2 - (1-d)*T^2 */
    out &= gf_eq(a,b);
    out &= ~gf_eq(p->z,ZERO);        /* reject Z == 0 */
    return out;
}
  1039. void API_NS(point_debugging_2torque) (
  1040. point_t q,
  1041. const point_t p
  1042. ) {
  1043. gf_sub(q->x,ZERO,p->x);
  1044. gf_sub(q->y,ZERO,p->y);
  1045. gf_cpy(q->z,p->z);
  1046. gf_cpy(q->t,p->t);
  1047. }
/**
 * Montgomery's trick: invert n field elements with one field inversion
 * plus 3(n-1) multiplications.  On return out[i] = 1/in[i].
 * Requires n > 1.  NOTE(review): a zero among in[] would zero the shared
 * product and poison every result — callers are assumed to pass nonzero
 * denominators.
 */
static void gf_batch_invert (
    gf *__restrict__ out,
    /* const */ gf *in,
    unsigned int n
) {
    gf t1;
    assert(n>1);

    /* Prefix products: out[i] = in[0]*...*in[i-1] for i >= 1 */
    gf_cpy(out[1], in[0]);
    int i;
    for (i=1; i<(int) (n-1); i++) {
        gf_mul(out[i+1], out[i], in[i]);
    }
    /* out[0] = 1 / (in[0]*...*in[n-1]) */
    gf_mul(out[0], out[n-1], in[n-1]);
    gf_invert(out[0], out[0]);

    /* Walk back, peeling one factor per step.
     * Invariant at the top of iteration i: out[0] = 1/(in[0]*...*in[i]). */
    for (i=n-1; i>0; i--) {
        gf_mul(t1, out[i], out[0]);  /* prefix * inverse-of-prefix-incl-i = 1/in[i] */
        gf_cpy(out[i], t1);
        gf_mul(t1, out[0], in[i]);   /* drop in[i] from the running inverse */
        gf_cpy(out[0], t1);
    }
}
  1069. static void batch_normalize_niels (
  1070. niels_t *table,
  1071. gf *zs,
  1072. gf *zis,
  1073. int n
  1074. ) {
  1075. int i;
  1076. gf product;
  1077. gf_batch_invert(zis, zs, n);
  1078. for (i=0; i<n; i++) {
  1079. gf_mul(product, table[i]->a, zis[i]);
  1080. gf_canon(product);
  1081. gf_cpy(table[i]->a, product);
  1082. gf_mul(product, table[i]->b, zis[i]);
  1083. gf_canon(product);
  1084. gf_cpy(table[i]->b, product);
  1085. gf_mul(product, table[i]->c, zis[i]);
  1086. gf_canon(product);
  1087. gf_cpy(table[i]->c, product);
  1088. }
  1089. }
/**
 * Build the signed-comb table for fixed-base scalar multiplication:
 * n combs of t teeth spaced s bits apart.  Each comb stores 2^(t-1)
 * entries indexed by Gray code (the top tooth's sign is folded away by
 * negation at lookup time).  All entries are batch-normalized to affine
 * niels form at the end.
 */
void API_NS(precompute) (
    precomputed_s *table,
    const point_t base
) {
    const unsigned int n = DECAF_COMBS_N, t = DECAF_COMBS_T, s = DECAF_COMBS_S;
    assert(n*t*s >= SCALAR_BITS);

    point_t working, start, doubles[t-1];
    API_NS(point_copy)(working, base);
    pniels_t pn_tmp;
    gf zs[n<<(t-1)], zis[n<<(t-1)];

    unsigned int i,j,k;

    /* Compute n tables */
    for (i=0; i<n; i++) {

        /* Doubling phase: sum this comb's teeth into `start`, caching
         * each tooth value in doubles[] for the Gray-code phase */
        for (j=0; j<t; j++) {
            if (j) API_NS(point_add)(start, start, working);
            else API_NS(point_copy)(start, working);

            /* last tooth of the last comb needs no further doubling */
            if (j==t-1 && i==n-1) break;

            point_double_internal(working, working,0);
            if (j<t-1) API_NS(point_copy)(doubles[j], working);

            for (k=0; k<s-1; k++)
                point_double_internal(working, working, k<s-2);
        }

        /* Gray-code phase: enumerate all 2^(t-1) sign patterns, flipping
         * one tooth per step by adding or subtracting its cached value */
        for (j=0;; j++) {
            int gray = j ^ (j>>1);
            int idx = (((i+1)<<(t-1))-1) ^ gray;

            pt_to_pniels(pn_tmp, start);
            memcpy(table->table[idx], pn_tmp->n, sizeof(pn_tmp->n));
            gf_cpy(zs[idx], pn_tmp->z);

            if (j >= (1u<<(t-1)) - 1) break;

            /* k = index of the bit that differs between gray(j) and gray(j+1) */
            int delta = (j+1) ^ ((j+1)>>1) ^ gray;

            for (k=0; delta>1; k++)
                delta >>=1;

            if (gray & (1<<k)) {
                API_NS(point_add)(start, start, doubles[k]);
            } else {
                API_NS(point_sub)(start, start, doubles[k]);
            }
        }
    }

    batch_normalize_niels(table->table,zs,zis,n<<(t-1));
}
  1133. extern const scalar_t API_NS(precomputed_scalarmul_adjustment);
/** Constant-time table lookup: copy table[idx] into ni while touching
 * all nelts entries, so the access pattern is independent of idx. */
siv constant_time_lookup_xx_niels (
    niels_s *__restrict__ ni,
    const niels_t *table,
    int nelts,
    int idx
) {
    constant_time_lookup_xx(ni, table, sizeof(niels_s), nelts, idx);
}
/**
 * Constant-time fixed-base scalar multiply using the precomputed comb
 * table: out = scalar * (table's base).  The scalar is first adjusted
 * and halved mod the group order to fit the signed-digit comb
 * representation.
 */
void API_NS(precomputed_scalarmul) (
    point_t out,
    const precomputed_s *table,
    const scalar_t scalar
) {
    int i;
    unsigned j,k;
    const unsigned int n = DECAF_COMBS_N, t = DECAF_COMBS_T, s = DECAF_COMBS_S;

    scalar_t scalar1x;
    API_NS(scalar_add)(scalar1x, scalar, API_NS(precomputed_scalarmul_adjustment));
    sc_halve(scalar1x,scalar1x,sc_p);

    niels_t ni;

    for (i=s-1; i>=0; i--) {
        if (i != (int)s-1) point_double_internal(out,out,0);

        for (j=0; j<n; j++) {
            /* Gather this comb's t teeth into a t-bit window */
            int tab = 0;
            for (k=0; k<t; k++) {
                unsigned int bit = i + s*(k + j*t);
                if (bit < SCALAR_BITS) {
                    tab |= (scalar1x->limb[bit/WBITS] >> (bit%WBITS) & 1) << k;
                }
            }

            /* Sign-fold: invert becomes an all-ones mask when the top
             * tooth is CLEAR, selecting the complemented window and a
             * negated table entry below */
            decaf_bool_t invert = (tab>>(t-1))-1;
            tab ^= invert;
            tab &= (1<<(t-1)) - 1;

            constant_time_lookup_xx_niels(ni, &table->table[j<<(t-1)], 1<<(t-1), tab);

            cond_neg_niels(ni, invert);
            /* first term initializes out; all later terms accumulate */
            if ((i!=s-1)||j) {
                add_niels_to_pt(out, ni, j==n-1 && i);
            } else {
                niels_to_pt(out, ni);
            }
        }
    }
}
  1177. #if DECAF_USE_MONTGOMERY_LADDER
/** Return high bit of x/2 = low bit of x mod p.
 * NOTE: gf_canon writes to x, so this canonicalizes its argument in place.
 * Returns an all-ones word if the low bit is set, else zero. */
static inline decaf_word_t lobit(gf x) {
    gf_canon(x);
    return -(x->limb[0]&1);
}
/**
 * Multiply a serialized point by a scalar without fully decoding it,
 * using an x-only Montgomery ladder, then reserialize the result.
 * Constant time.  Invalid encodings are detected during the final
 * recompression (succ comes back false) rather than short-circuited,
 * so `scaled` is always written.
 */
decaf_bool_t API_NS(direct_scalarmul) (
    uint8_t scaled[SER_BYTES],
    const uint8_t base[SER_BYTES],
    const scalar_t scalar,
    decaf_bool_t allow_identity,
    decaf_bool_t short_circuit
) {
    /* The Montgomery ladder does not short-circuit return on invalid points,
     * since it detects them during recompress.
     */
    (void)short_circuit;

    gf s0, x0, xa, za, xd, zd, xs, zs, L0, L1;
    decaf_bool_t succ = gf_deser ( s0, base );
    succ &= allow_identity |~ gf_eq( s0, ZERO);

    /* Prepare the Montgomery ladder: Q = 1:0, P+Q = P */
    gf_sqr ( xa, s0 );
    gf_cpy ( x0, xa );
    gf_cpy ( za, ONE );
    gf_cpy ( xd, ONE );
    gf_cpy ( zd, ZERO );

    int j;
    decaf_bool_t pflip = 0;
    for (j=SCALAR_BITS-1; j>=0; j--) {
        /* Augmented Montgomery ladder */
        decaf_bool_t flip = -((scalar->limb[j/WBITS]>>(j%WBITS))&1);

        /* Differential add first... */
        gf_add_nr ( xs, xa, za );
        gf_sub_nr ( zs, xa, za );
        gf_add_nr ( xa, xd, zd );
        gf_sub_nr ( za, xd, zd );

        /* The conditional swap is realized by selecting operands, keyed
         * on whether the key bit changed since the previous iteration */
        cond_sel(L0,xa,xs,flip^pflip);
        cond_sel(L1,za,zs,flip^pflip);

        gf_mul ( xd, xa, zs );
        gf_mul ( zd, xs, za );
        gf_add_nr ( xs, xd, zd );
        gf_sub_nr ( zd, xd, zd );
        gf_mul ( zs, zd, s0 );
        gf_sqr ( xa, xs );
        gf_sqr ( za, zs );

        /* ... and then double */
        gf_sqr ( zd, L0 );
        gf_sqr ( L0, L1 );
        gf_sub_nr ( L1, zd, L0 );
        gf_mul ( xd, L0, zd );
        gf_mlw ( zd, L1, 1-EDWARDS_D );
        gf_add_nr ( L0, L0, zd );
        gf_mul ( zd, L0, L1 );

        pflip = flip;
    }
    /* Undo the deferred swap left over from the final iteration */
    cond_swap(xa,xd,pflip);
    cond_swap(za,zd,pflip);

    /* OK, time to reserialize! Should be easy (heh, but seriously, TODO: simplify) */
    gf xz_d, xz_a, xz_s, den, L2, L3;
    mask_t zcase, output_zero, sflip, za_zero;

    gf_mul(xz_s, xs, zs);
    gf_mul(xz_d, xd, zd);
    gf_mul(xz_a, xa, za);
    output_zero = gf_eq(xz_d, ZERO);
    xz_d->limb[0] -= output_zero; /* make xz_d always nonzero */
    zcase = output_zero | gf_eq(xz_a, ZERO);
    za_zero = gf_eq(za, ZERO);

    /* Curve test in zcase, compute x0^2 + (2d-4)x0 + 1
     * (we know that x0 = s0^2 is square).
     */
    gf_add(L0,x0,ONE);
    gf_sqr(L1,L0);
    gf_mlw(L0,x0,-4*EDWARDS_D);
    gf_add(L1,L1,L0);
    cond_sel(xz_a,xz_a,L1,zcase);

    /* Compute denominator = x0 xa za xd zd */
    gf_mul(L0, x0, xz_a);
    gf_mul(L1, L0, xz_d);
    gf_isqrt(den, L1);

    /* Check that the square root came out OK. */
    gf_sqr(L2, den);
    gf_mul(L3, L0, L2); /* x0 xa za den^2 = 1/xz_d, for later */
    gf_mul(L0, L1, L2);
    gf_add(L0, L0, ONE);
    succ &= ~hibit(s0) & ~gf_eq(L0, ZERO);

    /* Compute y/x for input and output point. */
    gf_mul(L1, x0, xd);
    gf_sub(L1, zd, L1);
    gf_mul(L0, za, L1); /* L0 = "opq" */
    gf_mul(L1, x0, zd);
    gf_sub(L1, L1, xd);
    gf_mul(L2, xa, L1); /* L2 = "pqr" */
    gf_sub(L1, L0, L2);
    gf_add(L0, L0, L2);
    gf_mul(L2, L1, den); /* L2 = y0 / x0 */
    gf_mul(L1, L0, den); /* L1 = yO / xO */
    sflip = (lobit(L1) ^ lobit(L2)) | za_zero;
    /* OK, done with y-coordinates */

    /* If xa==0 or za ==0: return 0
     * Else if za == 0: return s0 * (sflip ? zd : xd)^2 * L3
     * Else if zd == 0: return s0 * (sflip ? zd : xd)^2 * L3
     * Else if pflip: return xs * zs * (sflip ? zd : xd) * L3
     * Else: return s0 * xs * zs * (sflip ? zd : xd) * den
     */
    cond_sel(xd, xd, zd, sflip); /* xd = actual xd we care about */
    cond_sel(den,den,L3,pflip|zcase);
    cond_sel(xz_s,xz_s,xd,zcase);
    cond_sel(s0,s0,ONE,pflip&~zcase);
    cond_sel(s0,s0,ZERO,output_zero);

    gf_mul(L0,xd,den);
    gf_mul(L1,L0,s0);
    gf_mul(L0,L1,xz_s);
    cond_neg(L0,hibit(L0));
    gf_encode(scaled, L0);

    return succ;
}
  1293. #else /* DECAF_USE_MONTGOMERY_LADDER */
  1294. decaf_bool_t API_NS(direct_scalarmul) (
  1295. uint8_t scaled[SER_BYTES],
  1296. const uint8_t base[SER_BYTES],
  1297. const scalar_t scalar,
  1298. decaf_bool_t allow_identity,
  1299. decaf_bool_t short_circuit
  1300. ) {
  1301. point_t basep;
  1302. decaf_bool_t succ = API_NS(point_decode)(basep, base, allow_identity);
  1303. if (short_circuit & ~succ) return succ;
  1304. API_NS(point_scalarmul)(basep, basep, scalar);
  1305. API_NS(point_encode)(scaled, basep);
  1306. return succ;
  1307. }
  1308. #endif /* DECAF_USE_MONTGOMERY_LADDER */
/**
 * @cond internal
 * Control for variable-time scalar multiply algorithms.
 */
struct smvt_control {
    /* Schedule entry: add the signed odd digit `addend` at doubling
     * position `power`; power == -1 terminates the schedule. */
    int power, addend;
};
/**
 * Recode a scalar into a signed sliding-window (WNAF-style) schedule.
 * Writes (power, addend) pairs into control[] in strictly decreasing
 * power order, terminated by an entry with power == -1, and returns the
 * number of real entries.  Each addend is odd.
 * VARIABLE TIME: only use with non-secret scalars.
 */
static int recode_wnaf (
    struct smvt_control *control, /* [nbits/(tableBits+1) + 3] */
    const scalar_t scalar,
    unsigned int tableBits
) {
    int current = 0, i, j;
    unsigned int position = 0;

    /* PERF: negate scalar if it's large
     * PERF: this is a pretty simplistic algorithm.  I'm sure there's a faster one...
     * PERF MINOR: not technically WNAF, since last digits can be adjacent.  Could be rtl.
     */
    for (i=SCALAR_BITS-1; i >= 0; i--) {
        int bit = (scalar->limb[i/WORD_BITS] >> (i%WORD_BITS)) & 1;
        current = 2*current + bit;

        /*
         * Sizing: |current| >= 2^(tableBits+1) -> |current| = 2^0
         * So current loses (tableBits+1) bits every time.  It otherwise gains
         * 1 bit per iteration.  The number of iterations is
         * (nbits + 2 + tableBits), and an additional control word is added at
         * the end.  So the total number of control words is at most
         * ceil((nbits+1) / (tableBits+1)) + 2 = floor((nbits)/(tableBits+1)) + 2.
         * There's also the stopper with power -1, for a total of +3.
         */
        if (current >= (2<<tableBits) || current <= -1 - (2<<tableBits)) {
            int delta = (current + 1) >> 1; /* |delta| < 2^tablebits */
            current = -(current & 1);

            /* Shift out delta's trailing zeros; j tracks where the odd
             * part lands relative to bit position i */
            for (j=i; (delta & 1) == 0; j++) {
                delta >>= 1;
            }
            control[position].power = j+1;
            control[position].addend = delta;
            position++;
            assert(position <= SCALAR_BITS/(tableBits+1) + 2);
        }
    }

    /* Flush the digit still pending at the bottom of the scalar */
    if (current) {
        for (j=0; (current & 1) == 0; j++) {
            current >>= 1;
        }
        control[position].power = j;
        control[position].addend = current;
        position++;
        assert(position <= SCALAR_BITS/(tableBits+1) + 2);
    }

    /* Stopper entry */
    control[position].power = -1;
    control[position].addend = 0;
    return position;
}
/**
 * Fill output[i] with the odd multiple (2i+1)*working, for i in
 * [0, 2^tbits), in projective niels form.
 */
sv prepare_wnaf_table(
    pniels_t *output,
    const point_t working,
    unsigned int tbits
) {
    point_t tmp;
    int i;
    pt_to_pniels(output[0], working);

    if (tbits == 0) return;

    /* twop caches 2*working; tmp walks the odd multiples */
    API_NS(point_double)(tmp,working);
    pniels_t twop;
    pt_to_pniels(twop, tmp);

    /* tmp = 2W + W = 3W */
    add_pniels_to_pt(tmp, output[0],0);
    pt_to_pniels(output[1], tmp);

    for (i=2; i < 1<<tbits; i++) {
        add_pniels_to_pt(tmp, twop,0);
        pt_to_pniels(output[i], tmp);
    }
}
/* The fixed-base WNAF table is provided (by the build) as raw field
 * elements and reinterpreted here as a table of affine niels points. */
extern const field_t API_NS(precomputed_wnaf_as_fe)[];
static const niels_t *API_NS(wnaf_base) = (const niels_t *)API_NS(precomputed_wnaf_as_fe);

/* Byte size of the precomputed WNAF table; exported (hidden visibility,
 * library-internal) so other translation units can size allocations. */
const size_t API_NS2(sizeof,precomputed_wnafs) __attribute((visibility("hidden")))
    = sizeof(niels_t)<<DECAF_WNAF_FIXED_TABLE_BITS;
/** Precompute the fixed-base WNAF table: the odd multiples of base,
 * batch-normalized to affine niels form.  Library-internal. */
void API_NS(precompute_wnafs) (
    niels_t out[1<<DECAF_WNAF_FIXED_TABLE_BITS],
    const point_t base
) __attribute__ ((visibility ("hidden")));

void API_NS(precompute_wnafs) (
    niels_t out[1<<DECAF_WNAF_FIXED_TABLE_BITS],
    const point_t base
) {
    pniels_t tmp[1<<DECAF_WNAF_FIXED_TABLE_BITS];
    gf zs[1<<DECAF_WNAF_FIXED_TABLE_BITS], zis[1<<DECAF_WNAF_FIXED_TABLE_BITS];
    int i;
    prepare_wnaf_table(tmp,base,DECAF_WNAF_FIXED_TABLE_BITS);
    /* Split each projective entry into its niels part and denominator */
    for (i=0; i<1<<DECAF_WNAF_FIXED_TABLE_BITS; i++) {
        memcpy(out[i], tmp[i]->n, sizeof(niels_t));
        gf_cpy(zs[i], tmp[i]->z);
    }
    batch_normalize_niels(out, zs, zis, 1<<DECAF_WNAF_FIXED_TABLE_BITS);
}
  1405. void API_NS(base_double_scalarmul_non_secret) (
  1406. point_t combo,
  1407. const scalar_t scalar1,
  1408. const point_t base2,
  1409. const scalar_t scalar2
  1410. ) {
  1411. const int table_bits_var = DECAF_WNAF_VAR_TABLE_BITS,
  1412. table_bits_pre = DECAF_WNAF_FIXED_TABLE_BITS;
  1413. struct smvt_control control_var[SCALAR_BITS/(table_bits_var+1)+3];
  1414. struct smvt_control control_pre[SCALAR_BITS/(table_bits_pre+1)+3];
  1415. int ncb_pre = recode_wnaf(control_pre, scalar1, table_bits_pre);
  1416. int ncb_var = recode_wnaf(control_var, scalar2, table_bits_var);
  1417. pniels_t precmp_var[1<<table_bits_var];
  1418. prepare_wnaf_table(precmp_var, base2, table_bits_var);
  1419. int contp=0, contv=0, i = control_var[0].power;
  1420. if (i < 0) {
  1421. API_NS(point_copy)(combo, API_NS(point_identity));
  1422. return;
  1423. } else if (i > control_pre[0].power) {
  1424. pniels_to_pt(combo, precmp_var[control_var[0].addend >> 1]);
  1425. contv++;
  1426. } else if (i == control_pre[0].power && i >=0 ) {
  1427. pniels_to_pt(combo, precmp_var[control_var[0].addend >> 1]);
  1428. add_niels_to_pt(combo, API_NS(wnaf_base)[control_pre[0].addend >> 1], i);
  1429. contv++; contp++;
  1430. } else {
  1431. i = control_pre[0].power;
  1432. niels_to_pt(combo, API_NS(wnaf_base)[control_pre[0].addend >> 1]);
  1433. contp++;
  1434. }
  1435. for (i--; i >= 0; i--) {
  1436. int cv = (i==control_var[contv].power), cp = (i==control_pre[contp].power);
  1437. point_double_internal(combo,combo,i && !(cv||cp));
  1438. if (cv) {
  1439. assert(control_var[contv].addend);
  1440. if (control_var[contv].addend > 0) {
  1441. add_pniels_to_pt(combo, precmp_var[control_var[contv].addend >> 1], i&&!cp);
  1442. } else {
  1443. sub_pniels_from_pt(combo, precmp_var[(-control_var[contv].addend) >> 1], i&&!cp);
  1444. }
  1445. contv++;
  1446. }
  1447. if (cp) {
  1448. assert(control_pre[contp].addend);
  1449. if (control_pre[contp].addend > 0) {
  1450. add_niels_to_pt(combo, API_NS(wnaf_base)[control_pre[contp].addend >> 1], i);
  1451. } else {
  1452. sub_niels_from_pt(combo, API_NS(wnaf_base)[(-control_pre[contp].addend) >> 1], i);
  1453. }
  1454. contp++;
  1455. }
  1456. }
  1457. assert(contv == ncb_var); (void)ncb_var;
  1458. assert(contp == ncb_pre); (void)ncb_pre;
  1459. }
/** Securely erase a point (zeroize all coordinates). */
void API_NS(point_destroy) (
    point_t point
) {
    decaf_bzero(point, sizeof(point_t));
}
  1465. decaf_bool_t decaf_memeq (
  1466. const void *data1_,
  1467. const void *data2_,
  1468. size_t size
  1469. ) {
  1470. const unsigned char *data1 = (const unsigned char *)data1_;
  1471. const unsigned char *data2 = (const unsigned char *)data2_;
  1472. unsigned char ret = 0;
  1473. for (; size; size--, data1++, data2++) {
  1474. ret |= *data1 ^ *data2;
  1475. }
  1476. return (((decaf_dword_t)ret) - 1) >> 8;
  1477. }
/** Securely erase a precomputed comb table. */
void API_NS(precomputed_destroy) (
    precomputed_s *pre
) {
    decaf_bzero(pre, API_NS2(sizeof,precomputed_s));
}