/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License. See LICENSE.txt for license information.
 */
#include "word.h"

#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <assert.h>   /* assert() is used below; it may also be pulled in by a project header */

#include "intrinsics.h"
#include "scalarmul.h"
#include "barrett_field.h"

mask_t
montgomery_ladder (
    struct field_t *out,
    const struct field_t *in,
    const word_t *scalar,
    unsigned int nbits,
    unsigned int n_extra_doubles
) {
    struct montgomery_t mont;
    deserialize_montgomery(&mont, in);

    int i, j, n = (nbits-1) % WORD_BITS;
    mask_t pflip = 0;
    for (j = (nbits+WORD_BITS-1)/WORD_BITS - 1; j >= 0; j--) {
        word_t w = scalar[j];
        for (i = n; i >= 0; i--) {
            mask_t flip = -((w>>i)&1);
            field_cond_swap(&mont.xa, &mont.xd, flip ^ pflip);
            field_cond_swap(&mont.za, &mont.zd, flip ^ pflip);
            montgomery_step(&mont);
            pflip = flip;
        }
        n = WORD_BITS-1;
    }
    field_cond_swap(&mont.xa, &mont.xd, pflip);
    field_cond_swap(&mont.za, &mont.zd, pflip);

    assert(n_extra_doubles < INT_MAX);
    for (j = 0; j < (int)n_extra_doubles; j++) {
        montgomery_step(&mont);
    }

    return serialize_montgomery(out, &mont, in);
}

static __inline__ void
cond_negate_tw_niels (
    struct tw_niels_t *n,
    mask_t doNegate
) {
    field_cond_swap(&n->a, &n->b, doNegate);
    field_cond_neg(&n->c, doNegate);
}

static __inline__ void
cond_negate_tw_pniels (
    struct tw_pniels_t *n,
    mask_t doNegate
) {
    cond_negate_tw_niels(&n->n, doNegate);
}
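
/*
 * Constant-time table lookup: every entry of the table is read regardless of
 * idx. br_is_zero(big_i) produces an all-ones mask only on the iteration
 * where j == idx, so exactly one entry is OR-ed into the zeroed output while
 * the memory access pattern stays independent of the (secret) index.
 */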

#if (defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__) && !defined(__AVX2__))
/* This works around an apparent compiler bug in GCC, thanks Samuel Neves */
static void __attribute__((optimize("O1")))
#ifdef __OPTIMIZE_SIZE__
#warning "There's a bug in here somewhere with GCC -Os on non-AVX2 platforms"
#endif
#else
static __inline__ void
#endif
constant_time_lookup_tw_pniels (
    struct tw_pniels_t *out,
    const struct tw_pniels_t *in,
    int nin,
    int idx
) {
    big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
    big_register_t *o = (big_register_t *)out;
    const big_register_t *i = (const big_register_t *)in;
    int j;
    unsigned int k;

    really_memset(out, 0, sizeof(*out));
    for (j=0; j<nin; j++, big_i-=big_one) {
        big_register_t mask = br_is_zero(big_i);
        for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
            o[k] |= mask & i[k+j*sizeof(*out)/sizeof(*o)];
        }
    }
}

static __inline__ void
constant_time_lookup_tw_niels (
    struct tw_niels_t *out,
    const struct tw_niels_t *in,
    int nin,
    int idx
) {
    big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
    big_register_t *o = (big_register_t *)out;
    const big_register_t *i = (const big_register_t *)in;
    int j;
    unsigned int k;

    really_memset(out, 0, sizeof(*out));
    for (j=0; j<nin; j++, big_i-=big_one) {
        big_register_t mask = br_is_zero(big_i);
        for (k=0; k<sizeof(*out)/sizeof(*o); k++) {
            o[k] |= mask & i[k+j*sizeof(*out)/sizeof(*o)];
        }
    }
}

static void
convert_to_signed_window_form (
    word_t *out,
    const word_t *scalar,
    int nwords_scalar,
    const word_t *prepared_data,
    int nwords_pd
) {
    assert(nwords_pd <= nwords_scalar);
    mask_t mask = -(scalar[0]&1);

    word_t carry = add_nr_ext_packed(out, scalar, nwords_scalar, prepared_data, nwords_pd, ~mask);
    carry += add_nr_ext_packed(out, out, nwords_scalar, prepared_data+nwords_pd, nwords_pd, mask);

    assert(!(out[0]&1));

    int i;
    for (i=0; i<nwords_scalar; i++) {
        out[i] >>= 1;
        if (i<nwords_scalar-1) {
            out[i] |= out[i+1]<<(WORD_BITS-1);
        } else {
            out[i] |= carry<<(WORD_BITS-1);
        }
    }
}
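
/*
 * What the conversion above provides, assuming prepared_data holds the two
 * representatives A and A+q of (2^nbits - 1) mod q with opposite parities
 * (precompute_fixed_base fills scalar_adjustments this way): the routine
 * adds the representative whose parity matches the scalar's, so the sum is
 * even, and returns out = (scalar + A) / 2. Reading out in w-bit windows b_i
 * and treating each window as the odd signed digit d_i = 2*b_i + 1 - 2^w
 * gives
 *
 *     sum_i d_i * 2^(w*i) = 2*out - (2^nbits - 1) = scalar + A - (2^nbits - 1),
 *
 * which is congruent to scalar mod q. Because every digit is odd and
 * nonzero, the window loops below need no special case for zero digits.
 */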

void
scalarmul (
    struct tw_extensible_t *working,
    const word_t scalar[SCALAR_WORDS]
) {
    const int WINDOW = SCALARMUL_FIXED_WINDOW_SIZE,
        WINDOW_MASK = (1<<WINDOW)-1, WINDOW_T_MASK = WINDOW_MASK >> 1,
        NTABLE = 1<<(WINDOW-1),
        nbits = ROUND_UP(SCALAR_BITS,WINDOW);

    word_t scalar2[SCALAR_WORDS];
    convert_to_signed_window_form (
        scalar2, scalar, SCALAR_WORDS,
        SCALARMUL_FIXED_WINDOW_ADJUSTMENT, SCALAR_WORDS
    );

    struct tw_extensible_t tabulator;
    copy_tw_extensible(&tabulator, working);
    double_tw_extensible(&tabulator);

    struct tw_pniels_t pn, multiples[NTABLE];
    convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
    convert_tw_extensible_to_tw_pniels(&multiples[0], working);

    int i,j;
    for (i=1; i<NTABLE; i++) {
        add_tw_pniels_to_tw_extensible(working, &pn);
        convert_tw_extensible_to_tw_pniels(&multiples[i], working);
    }

    i = nbits - WINDOW;
    int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & WINDOW_MASK,
        inv = (bits>>(WINDOW-1))-1;
    bits ^= inv;

    constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK);
    cond_negate_tw_pniels(&pn, inv);
    convert_tw_pniels_to_tw_extensible(working, &pn);

    for (i-=WINDOW; i>=0; i-=WINDOW) {
        for (j=0; j<WINDOW; j++) {
            double_tw_extensible(working);
        }

        bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS);
        if (i/WORD_BITS < SCALAR_WORDS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
            bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS));
        }
        bits &= WINDOW_MASK;
        inv = (bits>>(WINDOW-1))-1;
        bits ^= inv;

        constant_time_lookup_tw_pniels(&pn, multiples, NTABLE, bits & WINDOW_T_MASK);
        cond_negate_tw_pniels(&pn, inv);
        add_tw_pniels_to_tw_extensible(working, &pn);
    }
}
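
/*
 * Worked example of the digit handling in scalarmul, for illustration with
 * WINDOW = 4 (so WINDOW_MASK = 0xf, WINDOW_T_MASK = 0x7, and multiples[m]
 * holds (2m+1) times the input point):
 *
 *   bits = 0b0110 (6):  top bit clear, so inv = ~0; bits ^= inv and masking
 *     with WINDOW_T_MASK give index 1, negated: digit = -3 = 2*6 + 1 - 16.
 *
 *   bits = 0b1010 (10): top bit set, so inv = 0; index 2, not negated:
 *     digit = +5 = 2*10 + 1 - 16.
 */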

void
scalarmul_vlook (
    struct tw_extensible_t *working,
    const word_t scalar[SCALAR_WORDS]
) {
    const int WINDOW = SCALARMUL_FIXED_WINDOW_SIZE,
        WINDOW_MASK = (1<<WINDOW)-1, WINDOW_T_MASK = WINDOW_MASK >> 1,
        NTABLE = 1<<(WINDOW-1),
        nbits = ROUND_UP(SCALAR_BITS,WINDOW);

    word_t scalar2[SCALAR_WORDS];
    convert_to_signed_window_form(
        scalar2, scalar, SCALAR_WORDS,
        SCALARMUL_FIXED_WINDOW_ADJUSTMENT, SCALAR_WORDS
    );

    struct tw_extensible_t tabulator;
    copy_tw_extensible(&tabulator, working);
    double_tw_extensible(&tabulator);

    struct tw_pniels_t pn, multiples[NTABLE];
    convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
    convert_tw_extensible_to_tw_pniels(&multiples[0], working);

    int i,j;
    for (i=1; i<NTABLE; i++) {
        add_tw_pniels_to_tw_extensible(working, &pn);
        convert_tw_extensible_to_tw_pniels(&multiples[i], working);
    }

    i = nbits - WINDOW;
    int bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS) & WINDOW_MASK,
        inv = (bits>>(WINDOW-1))-1;
    bits ^= inv;

    copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]);
    cond_negate_tw_pniels(&pn, inv);
    convert_tw_pniels_to_tw_extensible(working, &pn);

    for (i-=WINDOW; i>=0; i-=WINDOW) {
        for (j=0; j<WINDOW; j++) {
            double_tw_extensible(working);
        }

        bits = scalar2[i/WORD_BITS] >> (i%WORD_BITS);
        if (i/WORD_BITS < SCALAR_WORDS-1 && i%WORD_BITS >= WORD_BITS-WINDOW) {
            bits ^= scalar2[i/WORD_BITS+1] << (WORD_BITS - (i%WORD_BITS));
        }
        bits &= WINDOW_MASK;
        inv = (bits>>(WINDOW-1))-1;
        bits ^= inv;

        copy_tw_pniels(&pn, &multiples[bits & WINDOW_T_MASK]);
        cond_negate_tw_pniels(&pn, inv);
        add_tw_pniels_to_tw_extensible(working, &pn);
    }
}

static mask_t
schedule_scalar_for_combs (
    word_t *scalar2,
    const word_t *scalar,
    unsigned int nbits,
    const struct fixed_base_table_t *table
) {
    unsigned int i;
    unsigned int n = table->n, t = table->t, s = table->s;

    if (n*t*s < nbits || n < 1 || t < 1 || s < 1) {
        return MASK_FAILURE;
    }

    unsigned int scalar_words = (nbits + WORD_BITS - 1)/WORD_BITS,
        scalar2_words = scalar_words;
    if (scalar2_words < SCALAR_WORDS)
        scalar2_words = SCALAR_WORDS;
    word_t scalar3[scalar2_words];

    /* Copy scalar to scalar3, but clear its high bits (if there are any) */
    for (i=0; i<scalar_words; i++) {
        scalar3[i] = scalar[i];
    }
    if (likely(i) && (nbits % WORD_BITS)) {
        scalar3[i-1] &= (((word_t)1) << (nbits%WORD_BITS)) - 1;
    }
    for (; i<scalar2_words; i++) {
        scalar3[i] = 0;
    }

    convert_to_signed_window_form (
        scalar2,
        scalar3, scalar2_words,
        table->scalar_adjustments, SCALAR_WORDS
    );

    return MASK_SUCCESS;
}

mask_t
scalarmul_fixed_base (
    struct tw_extensible_t *out,
    const word_t scalar[SCALAR_WORDS],
    unsigned int nbits,
    const struct fixed_base_table_t *table
) {
    unsigned int i,j,k;
    unsigned int n = table->n, t = table->t, s = table->s;

    unsigned int scalar2_words = (nbits + WORD_BITS - 1)/WORD_BITS;
    if (scalar2_words < SCALAR_WORDS) scalar2_words = SCALAR_WORDS;
    word_t scalar2[scalar2_words];

    mask_t succ = schedule_scalar_for_combs(scalar2, scalar, nbits, table);
    if (!succ) return MASK_FAILURE;

#ifdef __clang_analyzer__
    assert(t >= 1);
#endif

    struct tw_niels_t ni;

    for (i=0; i<s; i++) {
        if (i) double_tw_extensible(out);

        for (j=0; j<n; j++) {
            int tab = 0;

            /*
             * PERF: This computation takes about 1.5µs on SBR, i.e. 2-3% of the
             * time of a keygen or sign op. Surely it is possible to speed it up.
             */
            for (k=0; k<t; k++) {
                unsigned int bit = (s-1-i) + k*s + j*(s*t);
                if (bit < scalar2_words * WORD_BITS) {
                    tab |= (scalar2[bit/WORD_BITS] >> (bit%WORD_BITS) & 1) << k;
                }
            }

            mask_t invert = (tab>>(t-1))-1;
            tab ^= invert;
            tab &= (1<<(t-1)) - 1;

            constant_time_lookup_tw_niels(&ni, table->table + (j<<(t-1)), 1<<(t-1), tab);
            cond_negate_tw_niels(&ni, invert);

            if (i||j) {
                add_tw_niels_to_tw_extensible(out, &ni);
            } else {
                convert_tw_niels_to_tw_extensible(out, &ni);
            }
        }
    }

    return MASK_SUCCESS;
}
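
/*
 * Comb indexing in scalarmul_fixed_base, illustrated with small made-up
 * parameters n = 1, t = 3, s = 4 (covering 12 scalar bits): tooth k of comb
 * block j at outer step i reads bit (s-1-i) + k*s + j*(s*t), so
 *
 *   i = 0 reads bits {3, 7, 11},  i = 1 reads {2, 6, 10},
 *   i = 2 reads bits {1, 5, 9},   i = 3 reads {0, 4, 8}.
 *
 * Bits read at step i still face s-1-i doublings of the accumulator, which
 * supplies the 2^(s-1-i) part of their weight; the 2^(k*s) and 2^(j*s*t)
 * parts are baked into the precomputed table entries. The top tooth
 * (k = t-1) acts as a sign: invert flips the digit and cond_negate_tw_niels
 * applies the sign after the lookup.
 */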

mask_t
linear_combo_combs_vt (
    struct tw_extensible_t *out,
    const word_t scalar1[SCALAR_WORDS],
    unsigned int nbits1,
    const struct fixed_base_table_t *table1,
    const word_t scalar2[SCALAR_WORDS],
    unsigned int nbits2,
    const struct fixed_base_table_t *table2
) {
    unsigned int i,j,k,sc;
    unsigned int s1 = table1->s, s2 = table2->s, smax = (s1 > s2) ? s1 : s2;

    unsigned int scalar1b_words = (nbits1 + WORD_BITS - 1)/WORD_BITS;
    if (scalar1b_words < SCALAR_WORDS) scalar1b_words = SCALAR_WORDS;

    unsigned int scalar2b_words = (nbits2 + WORD_BITS - 1)/WORD_BITS;
    if (scalar2b_words < SCALAR_WORDS) scalar2b_words = SCALAR_WORDS;

    word_t scalar1b[scalar1b_words], scalar2b[scalar2b_words];

    /* Schedule the scalars */
    mask_t succ;
    succ = schedule_scalar_for_combs(scalar1b, scalar1, nbits1, table1);
    if (!succ) return MASK_FAILURE;

    succ = schedule_scalar_for_combs(scalar2b, scalar2, nbits2, table2);
    if (!succ) return MASK_FAILURE;

#ifdef __clang_analyzer__
    assert(table1->t >= 1);
    assert(table2->t >= 1);
#endif

    struct tw_niels_t ni;

    unsigned int swords[2] = {scalar1b_words, scalar2b_words}, started = 0;
    word_t *scalars[2] = {scalar1b, scalar2b};

    for (i=0; i<smax; i++) {
        if (i) double_tw_extensible(out);

        for (sc=0; sc<2; sc++) {
            const struct fixed_base_table_t *table = sc ? table2 : table1;

            int ii = i-smax+table->s;
            if (ii < 0) continue;
            assert(ii < (int)table->s);

            for (j=0; j<table->n; j++) {
                int tab = 0;

                for (k=0; k<table->t; k++) {
                    unsigned int bit = (table->s-1-ii) + k*table->s + j*(table->s*table->t);
                    if (bit < swords[sc] * WORD_BITS) {
                        tab |= (scalars[sc][bit/WORD_BITS] >> (bit%WORD_BITS) & 1) << k;
                    }
                }

                mask_t invert = (tab>>(table->t-1))-1;
                tab ^= invert;
                tab &= (1<<(table->t-1)) - 1;

                copy_tw_niels(&ni, &table->table[tab + (j<<(table->t-1))]);
                cond_negate_tw_niels(&ni, invert);

                if (started) {
                    add_tw_niels_to_tw_extensible(out, &ni);
                } else {
                    convert_tw_niels_to_tw_extensible(out, &ni);
                    started = 1;
                }
            }
        }

        assert(started);
    }

    return MASK_SUCCESS;
}

mask_t
precompute_fixed_base (
    struct fixed_base_table_t *out,
    const struct tw_extensible_t *base,
    unsigned int n,
    unsigned int t,
    unsigned int s,
    struct tw_niels_t *prealloc
) {
    if (s < 1 || t < 1 || n < 1 || n*t*s < SCALAR_BITS) {
        really_memset(out, 0, sizeof(*out));
        return 0;
    }

    out->n = n;
    out->t = t;
    out->s = s;

    struct tw_extensible_t working, start;
    copy_tw_extensible(&working, base);

    struct tw_pniels_t pn_tmp;

    struct tw_pniels_t *doubles = (struct tw_pniels_t *) malloc_vector(sizeof(*doubles) * (t-1));
    struct field_t *zs  = (struct field_t *) malloc_vector(sizeof(*zs)  * (n<<(t-1)));
    struct field_t *zis = (struct field_t *) malloc_vector(sizeof(*zis) * (n<<(t-1)));

    struct tw_niels_t *table = prealloc;
    if (prealloc) {
        out->own_table = 0;
    } else {
        table = (struct tw_niels_t *) malloc_vector(sizeof(*table) * (n<<(t-1)));
        out->own_table = 1;
    }
    out->table = table;

    if (!doubles || !zs || !zis || !table) {
        free(doubles);
        free(zs);
        free(zis);
        really_memset(out, 0, sizeof(*out));
        /* table may be NULL here if its own allocation was the one that failed */
        if (table) really_memset(table, 0, sizeof(*table) * (n<<(t-1)));
        if (!prealloc) free(table);
        return 0;
    }

    unsigned int i,j,k;

    /* Compute the scalar adjustments, equal to 2^nbits-1 mod q */
    unsigned int adjustment_size = (n*t*s)/WORD_BITS + 1;
    assert(adjustment_size >= SCALAR_WORDS);
    word_t adjustment[adjustment_size];
    for (i=0; i<adjustment_size; i++) {
        adjustment[i] = -1;
    }
    adjustment[(n*t*s) / WORD_BITS] += ((word_t)1) << ((n*t*s) % WORD_BITS);

    /* The low adjustment is 2^nbits - 1 mod q */
    barrett_reduce(adjustment, adjustment_size, 0, &curve_prime_order);
    word_t *low_adjustment  = &out->scalar_adjustments[(SCALAR_WORDS)*(adjustment[0] & 1)],
           *high_adjustment = &out->scalar_adjustments[(SCALAR_WORDS)*((~adjustment[0]) & 1)];
    for (i=0; i<SCALAR_WORDS; i++) {
        low_adjustment[i] = adjustment[i];
    }

    /* The high adjustment is low + q = low - q_lo + 2^big */
    (void)
    sub_nr_ext_packed(
        high_adjustment,
        adjustment, SCALAR_WORDS,
        curve_prime_order.p_lo, curve_prime_order.nwords_lo,
        -1
    );
    if (curve_prime_order.p_shift) {
        high_adjustment[curve_prime_order.nwords_p - 1] += ((word_t)1)<<curve_prime_order.p_shift;
    }
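
    /*
     * Layout note: q is odd, so the representatives low and low+q have
     * opposite parities. Indexing by (adjustment[0] & 1) above places the
     * even representative at scalar_adjustments[0..SCALAR_WORDS) and the odd
     * one at scalar_adjustments[SCALAR_WORDS..2*SCALAR_WORDS), which is what
     * convert_to_signed_window_form() relies on to make scalar + adjustment
     * even for either parity of the scalar.
     */
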
    /* OK, now compute the tables */
    for (i=0; i<n; i++) {

        /* doubling phase */
        for (j=0; j<t; j++) {
            if (j) {
                convert_tw_extensible_to_tw_pniels(&pn_tmp, &working);
                add_tw_pniels_to_tw_extensible(&start, &pn_tmp);
            } else {
                copy_tw_extensible(&start, &working);
            }

            if (j==t-1 && i==n-1) {
                break;
            }

            double_tw_extensible(&working);
            if (j<t-1) {
                convert_tw_extensible_to_tw_pniels(&doubles[j], &working);
            }

            for (k=0; k<s-1; k++) {
                double_tw_extensible(&working);
            }
        }
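
        /*
         * The loop below walks the 2^(t-1) comb patterns of block i in
         * Gray-code order; e.g. for t = 3, j = 0,1,2,3 gives gray = 0,1,3,2.
         * Consecutive codes differ in exactly one bit k (here k = 0,1,0), so
         * each new table entry is reached from the previous one with a single
         * add or subtract of doubles[k] instead of a fresh multi-addition.
         */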
        /* Gray-code phase */
        for (j=0;; j++) {
            int gray = j ^ (j>>1);
            int idx = (((i+1)<<(t-1))-1) ^ gray;

            convert_tw_extensible_to_tw_pniels(&pn_tmp, &start);
            copy_tw_niels(&table[idx], &pn_tmp.n);
            field_copy(&zs[idx], &pn_tmp.z);

            if (j >= (1u<<(t-1)) - 1) break;
            int delta = (j+1) ^ ((j+1)>>1) ^ gray;

            for (k=0; delta>1; k++)
                delta >>= 1;

            if (gray & (1<<k)) {
                /* start += doubles[k] */
                add_tw_pniels_to_tw_extensible(&start, &doubles[k]);
            } else {
                /* start -= doubles[k] */
                sub_tw_pniels_from_tw_extensible(&start, &doubles[k]);
            }
        }
    }

    simultaneous_invert(zis, zs, n<<(t-1));

    field_t product;
    for (i=0; i<n<<(t-1); i++) {
        field_mul(&product, &table[i].a, &zis[i]);
        field_strong_reduce(&product);
        field_copy(&table[i].a, &product);

        field_mul(&product, &table[i].b, &zis[i]);
        field_strong_reduce(&product);
        field_copy(&table[i].b, &product);

        field_mul(&product, &table[i].c, &zis[i]);
        field_strong_reduce(&product);
        field_copy(&table[i].c, &product);
    }

    mask_t ret = ~field_is_zero(&zis[0]);

    free(doubles);
    free(zs);
    free(zis);

    if (unlikely(!ret)) {
        really_memset(table, 0, sizeof(*table) * (n<<(t-1)));
        if (!prealloc) free(table);
        really_memset(out, 0, sizeof(*out));
        return 0;
    }

    return ret;
}

void
destroy_fixed_base (
    struct fixed_base_table_t *table
) {
    if (table->table) {
        really_memset(table->table, 0, sizeof(*table->table)*(table->n<<(table->t-1)));
    }
    if (table->own_table) {
        free(table->table);
    }
    really_memset(table, 0, sizeof(*table));
}
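
/*
 * precompute_fixed_base_wnaf fills out[0 .. (1<<tbits)-1] with the odd
 * multiples P, 3P, 5P, ..., (2^(tbits+1)-1)P of const_base in niels form;
 * the simultaneous inversion at the end divides each entry's a, b, c by its
 * projective Z denominator.
 */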

mask_t
precompute_fixed_base_wnaf (
    struct tw_niels_t *out,
    const struct tw_extensible_t *const_base,
    unsigned int tbits
) {
    int i;
    struct field_t *zs  = (struct field_t *) malloc_vector(sizeof(*zs)<<tbits);
    struct field_t *zis = (struct field_t *) malloc_vector(sizeof(*zis)<<tbits);

    if (!zs || !zis) {
        free(zs);
        free(zis);
        return 0;
    }

    struct tw_extensible_t base;
    copy_tw_extensible(&base, const_base);

    struct tw_pniels_t twop, tmp;

    convert_tw_extensible_to_tw_pniels(&tmp, &base);
    field_copy(&zs[0], &tmp.z);
    copy_tw_niels(&out[0], &tmp.n);

    if (tbits > 0) {
        double_tw_extensible(&base);
        convert_tw_extensible_to_tw_pniels(&twop, &base);
        add_tw_pniels_to_tw_extensible(&base, &tmp);

        convert_tw_extensible_to_tw_pniels(&tmp, &base);
        field_copy(&zs[1], &tmp.z);
        copy_tw_niels(&out[1], &tmp.n);

        for (i=2; i < 1<<tbits; i++) {
            add_tw_pniels_to_tw_extensible(&base, &twop);
            convert_tw_extensible_to_tw_pniels(&tmp, &base);
            field_copy(&zs[i], &tmp.z);
            copy_tw_niels(&out[i], &tmp.n);
        }
    }

    simultaneous_invert(zis, zs, 1<<tbits);

    field_t product;
    for (i=0; i<1<<tbits; i++) {
        field_mul(&product, &out[i].a, &zis[i]);
        field_strong_reduce(&product);
        field_copy(&out[i].a, &product);

        field_mul(&product, &out[i].b, &zis[i]);
        field_strong_reduce(&product);
        field_copy(&out[i].b, &product);

        field_mul(&product, &out[i].c, &zis[i]);
        field_strong_reduce(&product);
        field_copy(&out[i].c, &product);
    }

    free(zs);
    free(zis);
    return -1;
}

/**
 * @cond internal
 * Control for variable-time scalar multiply algorithms.
 */
struct smvt_control {
    int power, addend;
};

static int
recode_wnaf(
    struct smvt_control *control, /* [nbits/(tableBits+1) + 3] */
    const word_t *scalar,
    unsigned int nbits,
    unsigned int tableBits)
{
    int current = 0, i, j;
    unsigned int position = 0;

    /* PERF: negate scalar if it's large
     * PERF: this is a pretty simplistic algorithm. I'm sure there's a faster one...
     */
    for (i=nbits-1; i >= 0; i--) {
        int bit = (scalar[i/WORD_BITS] >> (i%WORD_BITS)) & 1;
        current = 2*current + bit;

        /*
         * Sizing: |current| >= 2^(tableBits+1) -> |current| = 2^0
         * So current loses (tableBits+1) bits every time. It otherwise gains
         * 1 bit per iteration. The number of iterations is
         * (nbits + 2 + tableBits), and an additional control word is added at
         * the end. So the total number of control words is at most
         * ceil((nbits+1) / (tableBits+1)) + 2 = floor((nbits)/(tableBits+1)) + 2.
         * There's also the stopper with power -1, for a total of +3.
         */
        if (current >= (2<<tableBits) || current <= -1 - (2<<tableBits)) {
            int delta = (current + 1) >> 1; /* |delta| < 2^tablebits */
            current = -(current & 1);

            for (j=i; (delta & 1) == 0; j++) {
                delta >>= 1;
            }
            control[position].power = j+1;
            control[position].addend = delta;
            position++;
            assert(position <= nbits/(tableBits+1) + 2);
        }
    }

    if (current) {
        for (j=0; (current & 1) == 0; j++) {
            current >>= 1;
        }
        control[position].power = j;
        control[position].addend = current;
        position++;
        assert(position <= nbits/(tableBits+1) + 2);
    }

    control[position].power = -1;
    control[position].addend = 0;
    return position;
}
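
/*
 * Example of the recoding, with illustrative tableBits = 2 and
 * scalar = 45 = 0b101101 (nbits = 6): scanning from the top bit, current
 * grows 1, 2, 5, 11; at 11 >= 8 the recoder emits
 * delta = (11+1)>>1 = 6 = 3<<1, i.e. {power 4, addend +3}, and resets
 * current to -1. The remaining bits leave current = -3, giving the final
 * entry {power 0, addend -3}, followed by the {power -1} stopper.
 * Indeed 3*2^4 - 3*2^0 = 45, and every addend is odd with
 * |addend| >> 1 < 2^tableBits, as the table lookup requires.
 */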

static void
prepare_wnaf_table(
    struct tw_pniels_t *output,
    struct tw_extensible_t *working,
    unsigned int tbits
) {
    convert_tw_extensible_to_tw_pniels(&output[0], working);

    if (tbits == 0) return;

    double_tw_extensible(working);
    struct tw_pniels_t twop;
    convert_tw_extensible_to_tw_pniels(&twop, working);

    add_tw_pniels_to_tw_extensible(working, &output[0]);
    convert_tw_extensible_to_tw_pniels(&output[1], working);

    for (int i=2; i < 1<<tbits; i++) {
        add_tw_pniels_to_tw_extensible(working, &twop);
        convert_tw_extensible_to_tw_pniels(&output[i], working);
    }
}

void
scalarmul_vt (
    struct tw_extensible_t *working,
    const word_t scalar[SCALAR_WORDS],
    unsigned int nbits
) {
    const int table_bits = SCALARMUL_WNAF_TABLE_BITS;
    struct smvt_control control[nbits/(table_bits+1)+3];

    int control_bits = recode_wnaf(control, scalar, nbits, table_bits);

    struct tw_pniels_t precmp[1<<table_bits];
    prepare_wnaf_table(precmp, working, table_bits);

    if (control_bits > 0) {
        assert(control[0].addend > 0);
        assert(control[0].power >= 0);
        convert_tw_pniels_to_tw_extensible(working, &precmp[control[0].addend >> 1]);
    } else {
        set_identity_tw_extensible(working);
        return;
    }

    int conti = 1, i;
    for (i = control[0].power - 1; i >= 0; i--) {
        double_tw_extensible(working);

        if (i == control[conti].power) {
            assert(control[conti].addend);

            if (control[conti].addend > 0) {
                add_tw_pniels_to_tw_extensible(working, &precmp[control[conti].addend >> 1]);
            } else {
                sub_tw_pniels_from_tw_extensible(working, &precmp[(-control[conti].addend) >> 1]);
            }
            conti++;
            assert(conti <= control_bits);
        }
    }
}
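
/*
 * Continuing the recode_wnaf example above (tableBits = 2, scalar = 45,
 * precmp[m] = (2m+1)*P): the leading control word {power 4, addend +3}
 * loads precmp[1] = 3P; four doublings give 48P; at power 0 the entry
 * {power 0, addend -3} subtracts precmp[1] = 3P, leaving 45P.
 */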

void
scalarmul_fixed_base_wnaf_vt (
    struct tw_extensible_t *working,
    const word_t scalar[SCALAR_WORDS],
    unsigned int nbits,
    const struct tw_niels_t *precmp,
    unsigned int table_bits
) {
    struct smvt_control control[nbits/(table_bits+1)+3];

    int control_bits = recode_wnaf(control, scalar, nbits, table_bits);

    if (control_bits > 0) {
        assert(control[0].addend > 0);
        assert(control[0].power >= 0);
        convert_tw_niels_to_tw_extensible(working, &precmp[control[0].addend >> 1]);
    } else {
        set_identity_tw_extensible(working);
        return;
    }

    int conti = 1, i;
    for (; control[conti].power >= 0; conti++) {
        assert(conti <= control_bits);

        for (i = control[conti-1].power - control[conti].power; i; i--) {
            double_tw_extensible(working);
        }

        assert(control[conti].addend);
        if (control[conti].addend > 0) {
            add_tw_niels_to_tw_extensible(working, &precmp[control[conti].addend >> 1]);
        } else {
            sub_tw_niels_from_tw_extensible(working, &precmp[(-control[conti].addend) >> 1]);
        }
    }

    for (i = control[conti-1].power; i; i--) {
        double_tw_extensible(working);
    }
}

void
linear_combo_var_fixed_vt(
    struct tw_extensible_t *working,
    const word_t scalar_var[SCALAR_WORDS],
    unsigned int nbits_var,
    const word_t scalar_pre[SCALAR_WORDS],
    unsigned int nbits_pre,
    const struct tw_niels_t *precmp,
    unsigned int table_bits_pre
) {
    const int table_bits_var = SCALARMUL_WNAF_COMBO_TABLE_BITS;
    struct smvt_control control_var[nbits_var/(table_bits_var+1)+3];
    struct smvt_control control_pre[nbits_pre/(table_bits_pre+1)+3];

    int ncb_var = recode_wnaf(control_var, scalar_var, nbits_var, table_bits_var);
    int ncb_pre = recode_wnaf(control_pre, scalar_pre, nbits_pre, table_bits_pre);
    (void)ncb_var;
    (void)ncb_pre;

    struct tw_pniels_t precmp_var[1<<table_bits_var];
    prepare_wnaf_table(precmp_var, working, table_bits_var);

    int contp = 0, contv = 0, i;

    i = control_var[0].power;
    if (i > control_pre[0].power) {
        convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]);
        contv++;
    } else if (i == control_pre[0].power && i >= 0) {
        convert_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[0].addend >> 1]);
        add_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]);
        contv++; contp++;
    } else {
        i = control_pre[0].power;
        convert_tw_niels_to_tw_extensible(working, &precmp[control_pre[0].addend >> 1]);
        contp++;
    }

    if (i < 0) {
        set_identity_tw_extensible(working);
        return;
    }

    for (i--; i >= 0; i--) {
        double_tw_extensible(working);

        if (i == control_var[contv].power) {
            assert(control_var[contv].addend);
            if (control_var[contv].addend > 0) {
                add_tw_pniels_to_tw_extensible(working, &precmp_var[control_var[contv].addend >> 1]);
            } else {
                sub_tw_pniels_from_tw_extensible(working, &precmp_var[(-control_var[contv].addend) >> 1]);
            }
            contv++;
        }

        if (i == control_pre[contp].power) {
            assert(control_pre[contp].addend);
            if (control_pre[contp].addend > 0) {
                add_tw_niels_to_tw_extensible(working, &precmp[control_pre[contp].addend >> 1]);
            } else {
                sub_tw_niels_from_tw_extensible(working, &precmp[(-control_pre[contp].addend) >> 1]);
            }
            contp++;
        }
    }

    assert(contv == ncb_var);
    assert(contp == ncb_pre);
}