diff --git a/Makefile b/Makefile
index 9271b29..32ba262 100644
--- a/Makefile
+++ b/Makefile
@@ -165,7 +165,8 @@ endef
 ################################################################
 define define_curve
 
-LIBCOMPONENTS += $$(BUILD_OBJ)/$(1)/decaf.o $$(BUILD_OBJ)/$(1)/crypto.o $$(BUILD_OBJ)/$(1)/decaf_tables.o
+LIBCOMPONENTS += $$(BUILD_OBJ)/$(1)/decaf.o $$(BUILD_OBJ)/$(1)/elligator.o $$(BUILD_OBJ)/$(1)/scalar.o \
+	 $$(BUILD_OBJ)/$(1)/crypto.o $$(BUILD_OBJ)/$(1)/decaf_tables.o
 PER_OBJ_DIRS += $$(BUILD_OBJ)/$(1)
 GLOBAL_HEADERS_OF_$(1) = $(BUILD_INC)/decaf/decaf_$(3).h $(BUILD_INC)/decaf/decaf_$(3).hxx \
 		$(BUILD_INC)/decaf/crypto_$(3).h $(BUILD_INC)/decaf/crypto_$(3).hxx
@@ -181,11 +182,17 @@ $$(BUILD_H)/$(1)/%.h: src/per_curve/%.tmpl.h src/gen_headers/* $$(HEADERS_OF_$(2
 $$(BUILD_INC)/decaf/decaf_$(3).%: src/per_curve/decaf.tmpl.% src/gen_headers/* $$(HEADERS_OF_$(2))
 	python -B src/gen_headers/template.py --per=curve --item=$(1) --guard=$$(@:$(BUILD_INC)/%=%) -o $$@ $$<
 	
+$$(BUILD_INC)/decaf/elligator_$(3).%: src/per_curve/elligator.tmpl.% src/gen_headers/* $$(HEADERS_OF_$(2))
+	python -B src/gen_headers/template.py --per=curve --item=$(1) --guard=$$(@:$(BUILD_INC)/%=%) -o $$@ $$<
+	
+$$(BUILD_INC)/decaf/scalar_$(3).%: src/per_curve/scalar.tmpl.% src/gen_headers/* $$(HEADERS_OF_$(2))
+	python -B src/gen_headers/template.py --per=curve --item=$(1) --guard=$$(@:$(BUILD_INC)/%=%) -o $$@ $$<
+	
 $$(BUILD_INC)/decaf/crypto_$(3).%: src/per_curve/crypto.tmpl.% src/gen_headers/* $$(HEADERS_OF_$(2))
 	python -B src/gen_headers/template.py --per=curve --item=$(1) --guard=$$(@:$(BUILD_INC)/%=%) -o $$@ $$<
 
 $$(BUILD_IBIN)/decaf_gen_tables_$(1): $$(BUILD_OBJ)/$(1)/decaf_gen_tables.o \
-		$$(BUILD_OBJ)/$(1)/decaf.o $$(BUILD_OBJ)/utils.o \
+		$$(BUILD_OBJ)/$(1)/decaf.o $$(BUILD_OBJ)/$(1)/scalar.o $$(BUILD_OBJ)/utils.o \
 		$$(COMPONENTS_OF_$(2))
 	$$(LD) $$(LDFLAGS) -o $$@ $$^
 
diff --git a/src/include/field.h b/src/include/field.h
index 065b15e..989e1e1 100644
--- a/src/include/field.h
+++ b/src/include/field.h
@@ -15,8 +15,7 @@
 #include <string.h>
     
 /** Square x, n times. */
-static INLINE UNUSED void
-gf_sqrn (
+static INLINE UNUSED void gf_sqrn (
     gf_s *__restrict__ y,
     const gf x,
     int n
@@ -58,5 +57,44 @@ static inline void gf_subx_nr ( gf c, const gf a, const gf b, int amt ) {
     if (sizeof(word_t)==4) gf_weak_reduce(c); // HACK PERF MAGIC
 }
 
+/** Mul by signed int.  Not constant-time WRT the sign of that int. */
+static inline void gf_mulw(gf c, const gf a, int32_t w) {
+    if (w>0) {
+        gf_mulw_unsigned(c, a, w);
+    } else {
+        gf_mulw_unsigned(c, a, -w);
+        gf_sub(c,ZERO,c);
+    }
+}
+
+/** Constant time, x = is_z ? z : y */
+static inline void gf_cond_sel(gf x, const gf y, const gf z, mask_t is_z) {
+    constant_time_select(x,y,z,sizeof(gf),is_z,0);
+}
+
+/** Constant time, if (neg) x=-x; */
+static inline void gf_cond_neg(gf x, mask_t neg) {
+    gf y;
+    gf_sub(y,ZERO,x);
+    gf_cond_sel(x,x,y,neg);
+}
+
+/** Constant time, if (swap) (x,y) = (y,x); */
+static inline void
+gf_cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) {
+    constant_time_cond_swap(x,y,sizeof(gf_s),swap);
+}
+
+static INLINE void gf_mul_qnr(gf_s *__restrict__ out, gf x) {
+#if P_MOD_8 == 5
+    /* r = QNR * r0^2 */
+    gf_mul(out,x,SQRT_MINUS_ONE);
+#elif P_MOD_8 == 3 || P_MOD_8 == 7
+    gf_sub(out,ZERO,x);
+#else
+    #error "Only supporting p=3,5,7 mod 8"
+#endif
+}
+
 
 #endif // __GF_H__
diff --git a/src/include/word.h b/src/include/word.h
index 0415f6f..5c74af6 100644
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -7,7 +7,10 @@
 
 /* for posix_memalign */
 #define _XOPEN_SOURCE 600
+#define __STDC_WANT_LIB_EXT1__ 1 /* for memset_s */
+#include <string.h>
 
+#include <assert.h>
 #include <stdint.h>
 #include "arch_intrinsics.h"
 
@@ -240,4 +243,36 @@ malloc_vector(size_t size) {
 #define UNROLL
 #endif
 
+/* The plan on booleans:
+ *
+ * The external interface uses decaf_bool_t, but this might be a different
+ * size than our particular arch's word_t (and thus mask_t).  Also, the caller
+ * isn't guaranteed to pass it as nonzero.  So bool_to_mask converts word sizes
+ * and checks nonzero.
+ *
+ * On the flip side, mask_t is always -1 or 0, but it might be a different size
+ * than decaf_bool_t.
+ *
+ * On the third hand, we have success vs boolean types, but that's handled in
+ * common.h: it converts between decaf_bool_t and decaf_error_t.
+ */
+static INLINE decaf_bool_t mask_to_bool (mask_t m) {
+    return (decaf_sword_t)(sword_t)m;
+}
+
+static INLINE mask_t bool_to_mask (decaf_bool_t m) {
+    /* On most arches this will be optimized to a simple cast. */
+    mask_t ret = 0;
+    unsigned int limit = sizeof(decaf_bool_t)/sizeof(mask_t);
+    if (limit < 1) limit = 1;
+    for (unsigned int i=0; i<limit; i++) {
+        ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t)));
+    }
+    return ret;
+}
+
+static INLINE void ignore_result ( decaf_bool_t boo ) {
+    (void)boo;
+}
+
 #endif /* __WORD_H__ */
diff --git a/src/p25519/arch_32/f_impl.c b/src/p25519/arch_32/f_impl.c
index 656d9f7..93d03bc 100644
--- a/src/p25519/arch_32/f_impl.c
+++ b/src/p25519/arch_32/f_impl.c
@@ -51,7 +51,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += accum;
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint32_t *a = as->limb, maske = ((1<<26)-1), masko = ((1<<25)-1);
     uint32_t blo = b & maske, bhi = b>>26, bhi2 = 2*bhi;
     uint32_t *c = cs->limb;
diff --git a/src/p25519/arch_ref64/f_impl.c b/src/p25519/arch_ref64/f_impl.c
index 1f0e22d..484dadf 100644
--- a/src/p25519/arch_ref64/f_impl.c
+++ b/src/p25519/arch_ref64/f_impl.c
@@ -34,7 +34,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += accum;
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
     int i;
     
diff --git a/src/p25519/arch_x86_64/f_impl.c b/src/p25519/arch_x86_64/f_impl.c
index 78ad40f..ecc27a3 100644
--- a/src/p25519/arch_x86_64/f_impl.c
+++ b/src/p25519/arch_x86_64/f_impl.c
@@ -140,7 +140,7 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
     c[1] = c1 + (accum1>>51);
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
     uint64_t *c = cs->limb;
 
diff --git a/src/p448/arch_32/f_impl.c b/src/p448/arch_32/f_impl.c
index a07aae5..1abffe7 100644
--- a/src/p448/arch_32/f_impl.c
+++ b/src/p448/arch_32/f_impl.c
@@ -60,7 +60,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += ((uint32_t)(accum1));
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     assert(b<1<<28);
     
     const uint32_t *a = as->limb;
diff --git a/src/p448/arch_arm_32/f_impl.c b/src/p448/arch_arm_32/f_impl.c
index 887c083..0454bd6 100644
--- a/src/p448/arch_arm_32/f_impl.c
+++ b/src/p448/arch_arm_32/f_impl.c
@@ -721,7 +721,7 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
     c[1] += ((uint32_t)(accum1));
 }
 
-void gf_mulw (
+void gf_mulw_unsigned (
     gf_s *__restrict__ cs,
     const gf as,
     uint32_t b
diff --git a/src/p448/arch_neon/f_impl.c b/src/p448/arch_neon/f_impl.c
index ba0e303..d53e5ee 100644
--- a/src/p448/arch_neon/f_impl.c
+++ b/src/p448/arch_neon/f_impl.c
@@ -549,7 +549,7 @@ void gf_sqr (gf_s *__restrict__ cs, const gf bs) {
     );
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) { 
+void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { 
     uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1};
     assert(b<(1<<28));
     
diff --git a/src/p448/arch_ref64/f_impl.c b/src/p448/arch_ref64/f_impl.c
index 4273d3d..5268100 100644
--- a/src/p448/arch_ref64/f_impl.c
+++ b/src/p448/arch_ref64/f_impl.c
@@ -165,7 +165,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[1] += ((uint64_t)(accum1));
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
 
diff --git a/src/p448/arch_x86_64/f_impl.c b/src/p448/arch_x86_64/f_impl.c
index 4989cb5..7bdf561 100644
--- a/src/p448/arch_x86_64/f_impl.c
+++ b/src/p448/arch_x86_64/f_impl.c
@@ -139,7 +139,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     c[0] += ((uint64_t)(accum1));
 }
 
-void gf_mulw (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
 
diff --git a/src/per_curve/decaf.tmpl.c b/src/per_curve/decaf.tmpl.c
index 26458fc..edcbdfd 100644
--- a/src/per_curve/decaf.tmpl.c
+++ b/src/per_curve/decaf.tmpl.c
@@ -1,9 +1,6 @@
 /** @brief Decaf high-level functions. */
 
 #define _XOPEN_SOURCE 600 /* for posix_memalign */
-#define __STDC_WANT_LIB_EXT1__ 1 /* for memset_s */
-#include <string.h>
-
 #include "word.h"
 #include "field.h"
 
@@ -29,16 +26,11 @@
 #define DECAF_WNAF_VAR_TABLE_BITS $(wnaf.var)
 
 static const int EDWARDS_D = $(d);
-static const scalar_t sc_p = {{{
-    $(ser(q,64,"SC_LIMB"))
-}}}, sc_r2 = {{{
-    $(ser(((2**128)**((scalar_bits+63)/64))%q,64,"SC_LIMB"))
-}}}, point_scalarmul_adjustment = {{{
+static const scalar_t point_scalarmul_adjustment = {{{
     $(ser((2**(scalar_bits-1+window_bits - ((scalar_bits-1)%window_bits)) - 1) % q,64,"SC_LIMB"))
 }}}, precomputed_scalarmul_adjustment = {{{
     $(ser((2**(combs.n*combs.t*combs.s) - 1) % q,64,"SC_LIMB"))
 }}};
-static const decaf_word_t MONTGOMERY_FACTOR = (decaf_word_t)0x$("%x" % pow(-q,2**64-1,2**64))ull;
 
 const uint8_t API_NS(x_base_point)[X_SER_BYTES] = { $(ser(mont_base,8)) };
 
@@ -50,27 +42,26 @@ const uint8_t API_NS(x_base_point)[X_SER_BYTES] = { $(ser(mont_base,8)) };
 
 /* End of template stuff */
 
-
+/* Sanity */
 #if (COFACTOR == 8) && !IMAGINE_TWIST
 /* FUTURE: Curve41417 doesn't have these properties. */
-#error "Currently require IMAGINE_TWIST (and thus p=5 mod 8) for cofactor 8"
+    #error "Currently require IMAGINE_TWIST (and thus p=5 mod 8) for cofactor 8"
 #endif
 
 #if IMAGINE_TWIST && (P_MOD_8 != 5)
-#error "Cannot use IMAGINE_TWIST except for p == 5 mod 8"
+    #error "Cannot use IMAGINE_TWIST except for p == 5 mod 8"
 #endif
 
 #if (COFACTOR != 8) && (COFACTOR != 4)
-#error "COFACTOR must be 4 or 8"
+    #error "COFACTOR must be 4 or 8"
 #endif
  
 #if IMAGINE_TWIST
-extern const gf SQRT_MINUS_ONE;
+    extern const gf SQRT_MINUS_ONE;
 #endif
 
 #define WBITS DECAF_WORD_BITS /* NB this may be different from ARCH_WORD_BITS */
 
-const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
 extern const point_t API_NS(point_base);
 
 /* Projective Niels coordinates */
@@ -88,57 +79,6 @@ const precomputed_s *API_NS(precomputed_base) =
 const size_t API_NS(sizeof_precomputed_s) = sizeof(precomputed_s);
 const size_t API_NS(alignof_precomputed_s) = sizeof(big_register_t);
 
-#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++)  { op; }}
-#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++)  { op; }}
-
-/* The plan on booleans:
- *
- * The external interface uses decaf_bool_t, but this might be a different
- * size than our particular arch's word_t (and thus mask_t).  Also, the caller
- * isn't guaranteed to pass it as nonzero.  So bool_to_mask converts word sizes
- * and checks nonzero.
- *
- * On the flip side, mask_t is always -1 or 0, but it might be a different size
- * than decaf_bool_t.
- *
- * On the third hand, we have success vs boolean types, but that's handled in
- * common.h: it converts between decaf_bool_t and decaf_error_t.
- */
-static INLINE decaf_bool_t mask_to_bool (mask_t m) {
-    return (decaf_sword_t)(sword_t)m;
-}
-
-static INLINE mask_t bool_to_mask (decaf_bool_t m) {
-    /* On most arches this will be optimized to a simple cast. */
-    mask_t ret = 0;
-    unsigned int limit = sizeof(decaf_bool_t)/sizeof(mask_t);
-    if (limit < 1) limit = 1;
-    for (unsigned int i=0; i<limit; i++) {
-        ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t)));
-    }
-    return ret;
-}
-
-/** Constant time, x = is_z ? z : y */
-static INLINE void
-cond_sel(gf x, const gf y, const gf z, mask_t is_z) {
-    constant_time_select(x,y,z,sizeof(gf),is_z,0);
-}
-
-/** Constant time, if (neg) x=-x; */
-static void
-cond_neg(gf x, mask_t neg) {
-    gf y;
-    gf_sub(y,ZERO,x);
-    cond_sel(x,x,y,neg);
-}
-
-/** Constant time, if (swap) (x,y) = (y,x); */
-static INLINE void
-cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) {
-    constant_time_cond_swap(x,y,sizeof(gf_s),swap);
-}
-
 /** Inverse. */
 static void
 gf_invert(gf y, const gf x) {
@@ -151,17 +91,6 @@ gf_invert(gf y, const gf x) {
     gf_copy(y, t2);
 }
 
-/** Mul by signed int.  Not constant-time WRT the sign of that int. */
-static INLINE void
-gf_mulw_sgn(gf c, const gf a, int32_t w) {
-    if (w>0) {
-        gf_mulw(c, a, w);
-    } else {
-        gf_mulw(c, a, -w);
-        gf_sub(c,ZERO,c);
-    }
-}
-
 #if COFACTOR==8
 /** Return high bit of x = low bit of 2x mod p */
 static mask_t gf_lobit(const gf x) {
@@ -172,221 +101,10 @@ static mask_t gf_lobit(const gf x) {
 }
 #endif
 
-/** {extra,accum} - sub +? p
- * Must have extra <= 1
- */
-static NOINLINE void
-sc_subx(
-    scalar_t out,
-    const decaf_word_t accum[SCALAR_LIMBS],
-    const scalar_t sub,
-    const scalar_t p,
-    decaf_word_t extra
-) {
-    decaf_dsword_t chain = 0;
-    unsigned int i;
-    for (i=0; i<SCALAR_LIMBS; i++) {
-        chain = (chain + accum[i]) - sub->limb[i];
-        out->limb[i] = chain;
-        chain >>= WBITS;
-    }
-    decaf_word_t borrow = chain+extra; /* = 0 or -1 */
-    
-    chain = 0;
-    for (i=0; i<SCALAR_LIMBS; i++) {
-        chain = (chain + out->limb[i]) + (p->limb[i] & borrow);
-        out->limb[i] = chain;
-        chain >>= WBITS;
-    }
-}
-
-static NOINLINE void
-sc_montmul (
-    scalar_t out,
-    const scalar_t a,
-    const scalar_t b
-) {
-    unsigned int i,j;
-    decaf_word_t accum[SCALAR_LIMBS+1] = {0};
-    decaf_word_t hi_carry = 0;
-    
-    for (i=0; i<SCALAR_LIMBS; i++) {
-        decaf_word_t mand = a->limb[i];
-        const decaf_word_t *mier = b->limb;
-        
-        decaf_dword_t chain = 0;
-        for (j=0; j<SCALAR_LIMBS; j++) {
-            chain += ((decaf_dword_t)mand)*mier[j] + accum[j];
-            accum[j] = chain;
-            chain >>= WBITS;
-        }
-        accum[j] = chain;
-        
-        mand = accum[0] * MONTGOMERY_FACTOR;
-        chain = 0;
-        mier = sc_p->limb;
-        for (j=0; j<SCALAR_LIMBS; j++) {
-            chain += (decaf_dword_t)mand*mier[j] + accum[j];
-            if (j) accum[j-1] = chain;
-            chain >>= WBITS;
-        }
-        chain += accum[j];
-        chain += hi_carry;
-        accum[j-1] = chain;
-        hi_carry = chain >> WBITS;
-    }
-    
-    sc_subx(out, accum, sc_p, sc_p, hi_carry);
-}
-
-void API_NS(scalar_mul) (
-    scalar_t out,
-    const scalar_t a,
-    const scalar_t b
-) {
-    sc_montmul(out,a,b);
-    sc_montmul(out,out,sc_r2);
-}
-
-/* PERF: could implement this */
-static INLINE void sc_montsqr (scalar_t out, const scalar_t a) {
-    sc_montmul(out,a,a);
-}
-
-decaf_error_t API_NS(scalar_invert) (
-    scalar_t out,
-    const scalar_t a
-) {
-    /* Fermat's little theorem, sliding window.
-     * Sliding window is fine here because the modulus isn't secret.
-     */
-    const int SCALAR_WINDOW_BITS = 3;
-    scalar_t precmp[1<<SCALAR_WINDOW_BITS];
-    const int LAST = (1<<SCALAR_WINDOW_BITS)-1;
-
-    /* Precompute precmp = [a^1,a^3,...] */
-    sc_montmul(precmp[0],a,sc_r2);
-    if (LAST > 0) sc_montmul(precmp[LAST],precmp[0],precmp[0]);
-
-    int i;
-    for (i=1; i<=LAST; i++) {
-        sc_montmul(precmp[i],precmp[i-1],precmp[LAST]);
-    }
-    
-    /* Sliding window */
-    unsigned residue = 0, trailing = 0, started = 0;
-    for (i=SCALAR_BITS-1; i>=-SCALAR_WINDOW_BITS; i--) {
-        
-        if (started) sc_montsqr(out,out);
-        
-        decaf_word_t w = (i>=0) ? sc_p->limb[i/WBITS] : 0;
-        if (i >= 0 && i<WBITS) {
-            assert(w >= 2);
-            w-=2;
-        }
-        
-        residue = (residue<<1) | ((w>>(i%WBITS))&1);
-        if (residue>>SCALAR_WINDOW_BITS != 0) {
-            assert(trailing == 0);
-            trailing = residue;
-            residue = 0;
-        }
-        
-        if (trailing > 0 && (trailing & ((1<<SCALAR_WINDOW_BITS)-1)) == 0) {
-            if (started) {
-                sc_montmul(out,out,precmp[trailing>>(SCALAR_WINDOW_BITS+1)]);
-            } else {
-                API_NS(scalar_copy)(out,precmp[trailing>>(SCALAR_WINDOW_BITS+1)]);
-                started = 1;
-            }
-            trailing = 0;
-        }
-        trailing <<= 1;
-        
-    }
-    assert(residue==0);
-    assert(trailing==0);
-    
-    /* Demontgomerize */
-    sc_montmul(out,out,API_NS(scalar_one));
-    decaf_bzero(precmp, sizeof(precmp));
-    return decaf_succeed_if(~API_NS(scalar_eq)(out,API_NS(scalar_zero)));
-}
-
-void API_NS(scalar_sub) (
-    scalar_t out,
-    const scalar_t a,
-    const scalar_t b
-) {
-    sc_subx(out, a->limb, b, sc_p, 0);
-}
-
-void API_NS(scalar_add) (
-    scalar_t out,
-    const scalar_t a,
-    const scalar_t b
-) {
-    decaf_dword_t chain = 0;
-    unsigned int i;
-    for (i=0; i<SCALAR_LIMBS; i++) {
-        chain = (chain + a->limb[i]) + b->limb[i];
-        out->limb[i] = chain;
-        chain >>= WBITS;
-    }
-    sc_subx(out, out->limb, sc_p, sc_p, chain);
-}
-
-static NOINLINE void
-sc_halve (
-    scalar_t out,
-    const scalar_t a,
-    const scalar_t p
-) {
-    decaf_word_t mask = -(a->limb[0] & 1);
-    decaf_dword_t chain = 0;
-    unsigned int i;
-    for (i=0; i<SCALAR_LIMBS; i++) {
-        chain = (chain + a->limb[i]) + (p->limb[i] & mask);
-        out->limb[i] = chain;
-        chain >>= WBITS;
-    }
-    for (i=0; i<SCALAR_LIMBS-1; i++) {
-        out->limb[i] = out->limb[i]>>1 | out->limb[i+1]<<(WBITS-1);
-    }
-    out->limb[i] = out->limb[i]>>1 | chain<<(WBITS-1);
-}
-
-void
-API_NS(scalar_set_unsigned) (
-    scalar_t out,
-    uint64_t w
-) {
-    memset(out,0,sizeof(scalar_t));
-    unsigned int i = 0;
-    for (; i<sizeof(uint64_t)/sizeof(decaf_word_t); i++) {
-        out->limb[i] = w;
-        w >>= (sizeof(uint64_t) > sizeof(decaf_word_t)) ? 8*sizeof(decaf_word_t) : 0;
-    }
-}
-
-decaf_bool_t
-API_NS(scalar_eq) (
-    const scalar_t a,
-    const scalar_t b
-) {
-    decaf_word_t diff = 0;
-    unsigned int i;
-    for (i=0; i<SCALAR_LIMBS; i++) {
-        diff |= a->limb[i] ^ b->limb[i];
-    }
-    return mask_to_bool(word_is_zero(diff));
-}
-
 /** identity = (0,1) */
 const point_t API_NS(point_identity) = {{{{{0}}},{{{1}}},{{{1}}},{{{0}}}}};
 
-static void
-deisogenize (
+void API_NS(deisogenize) (
     gf_s *__restrict__ s,
     gf_s *__restrict__ minus_t_over_s,
     const point_t p,
@@ -399,28 +117,28 @@ deisogenize (
     
     gf b, d;
     gf_s *c = s, *a = minus_t_over_s;
-    gf_mulw_sgn(a, p->y, 1-EDWARDS_D);
+    gf_mulw(a, p->y, 1-EDWARDS_D);
     gf_mul(c, a, p->t);     /* -dYT, with EDWARDS_D = d-1 */
     gf_mul(a, p->x, p->z); 
     gf_sub(d, c, a);  /* aXZ-dYT with a=-1 */
     gf_add(a, p->z, p->y); 
     gf_sub(b, p->z, p->y); 
     gf_mul(c, b, a);
-    gf_mulw_sgn(b, c, -EDWARDS_D); /* (a-d)(Z+Y)(Z-Y) */
+    gf_mulw(b, c, -EDWARDS_D); /* (a-d)(Z+Y)(Z-Y) */
     mask_t ok = gf_isr (a,b); /* r in the paper */
     (void)ok; assert(ok | gf_eq(b,ZERO));
-    gf_mulw_sgn (b, a, -EDWARDS_D); /* u in the paper */
+    gf_mulw (b, a, -EDWARDS_D); /* u in the paper */
 
     gf_mul(c,a,d); /* r(aZX-dYT) */
     gf_mul(a,b,p->z); /* uZ */
     gf_add(a,a,a); /* 2uZ */
     
-    cond_neg(c, toggle_hibit_t_over_s ^ ~gf_hibit(a)); /* u <- -u if negative. */
-    cond_neg(a, toggle_hibit_t_over_s ^ ~gf_hibit(a)); /* t/s <-? -t/s */
+    gf_cond_neg(c, toggle_hibit_t_over_s ^ ~gf_hibit(a)); /* u <- -u if negative. */
+    gf_cond_neg(a, toggle_hibit_t_over_s ^ ~gf_hibit(a)); /* t/s <-? -t/s */
     
     gf_add(d,c,p->y);
     gf_mul(s,b,d);
-    cond_neg(s, toggle_hibit_s ^ gf_hibit(s));
+    gf_cond_neg(s, toggle_hibit_s ^ gf_hibit(s));
 #else
     /* More complicated because of rotation */
     /* MAGIC This code is wrong for certain non-Curve25519 curves;
@@ -441,7 +159,7 @@ deisogenize (
         gf_mul ( c, a, b ); /* "zx" = Z^2 - aX^2 = Z^2 - X^2 */
     #else
         const gf_s *x = p->x, *t = p->t;
-        /* Won't hit the cond_sel below because COFACTOR==8 requires IMAGINE_TWIST for now. */
+        /* Won't hit the gf_cond_sel below because COFACTOR==8 requires IMAGINE_TWIST for now. */
     
         gf_sqr ( a, p->z );
         gf_sqr ( b, p->x );
@@ -465,8 +183,8 @@ deisogenize (
         rotate = gf_hibit(a) ^ toggle_rotation;
         /* Curve25519: cond select between zx * 1/tz or sqrt(1-d); y=-x */
         gf_mul ( a, b, c ); 
-        cond_sel ( a, a, SQRT_ONE_MINUS_D, rotate );
-        cond_sel ( x, p->y, x, rotate );
+        gf_cond_sel ( a, a, SQRT_ONE_MINUS_D, rotate );
+        gf_cond_sel ( x, p->y, x, rotate );
     #else
         (void)toggle_rotation;
         rotate = 0;
@@ -476,19 +194,19 @@ deisogenize (
     gf_mul ( a, c, p->z );
     gf_add ( a, a, a ); // 2 * "osx" * Z
     mask_t tg1 = rotate ^ toggle_hibit_t_over_s ^~ gf_hibit(a);
-    cond_neg ( c, tg1 );
-    cond_neg ( a, rotate ^ tg1 );
+    gf_cond_neg ( c, tg1 );
+    gf_cond_neg ( a, rotate ^ tg1 );
     gf_mul ( d, b, p->z );
     gf_add ( d, d, c );
     gf_mul ( b, d, x ); /* here "x" = y unless rotate */
-    cond_neg ( b, toggle_hibit_s ^ gf_hibit(b) );
+    gf_cond_neg ( b, toggle_hibit_s ^ gf_hibit(b) );
     
 #endif
 }
 
 void API_NS(point_encode)( unsigned char ser[SER_BYTES], const point_t p ) {
     gf s, mtos;
-    deisogenize(s,mtos,p,0,0,0);
+    API_NS(deisogenize)(s,mtos,p,0,0,0);
     gf_serialize(ser,s,0);
 }
 
@@ -509,7 +227,7 @@ decaf_error_t API_NS(point_decode) (
 #endif
     succ &= ~ gf_eq( f, ZERO );
     gf_sqr ( b, f ); 
-    gf_mulw_sgn ( c, a, 4*IMAGINE_TWIST-4*EDWARDS_D ); 
+    gf_mulw ( c, a, 4*IMAGINE_TWIST-4*EDWARDS_D ); 
     gf_add ( c, c, b ); /* t^2 */
     gf_mul ( d, f, s ); /* s(1-as^2) for denoms */
     gf_sqr ( e, d );
@@ -520,8 +238,8 @@ decaf_error_t API_NS(point_decode) (
     gf_mul ( d, e, c ); /* d = t / (s(1-as^2)) */
     gf_mul ( e, d, f ); /* t/s */
     mask_t negtos = gf_hibit(e);
-    cond_neg(b, negtos);
-    cond_neg(d, negtos);
+    gf_cond_neg(b, negtos);
+    gf_cond_neg(d, negtos);
 
 #if IMAGINE_TWIST
     gf_add ( p->z, ONE, a); /* Z = 1+as^2 = 1-s^2 */
@@ -578,7 +296,7 @@ void API_NS(point_sub) (
     gf_add_nr ( b, q->y, q->x );
     gf_mul ( p->y, d, b );
     gf_mul ( b, r->t, q->t );
-    gf_mulw_sgn ( p->x, b, 2*EFF_D );
+    gf_mulw ( p->x, b, 2*EFF_D );
     gf_add_nr ( b, a, p->y );
     gf_sub_nr ( c, p->y, a );
     gf_mul ( a, q->z, r->z );
@@ -609,7 +327,7 @@ void API_NS(point_add) (
     gf_add_nr ( b, q->y, q->x );
     gf_mul ( p->y, d, b );
     gf_mul ( b, r->t, q->t );
-    gf_mulw_sgn ( p->x, b, 2*EFF_D );
+    gf_mulw ( p->x, b, 2*EFF_D );
     gf_add_nr ( b, a, p->y );
     gf_sub_nr ( c, p->y, a );
     gf_mul ( a, q->z, r->z );
@@ -664,107 +382,14 @@ void API_NS(point_negate) (
     gf_sub(nega->t, ZERO, a->t);
 }
 
-static INLINE void
-scalar_decode_short (
-    scalar_t s,
-    const unsigned char *ser,
-    unsigned int nbytes
-) {
-    unsigned int i,j,k=0;
-    for (i=0; i<SCALAR_LIMBS; i++) {
-        decaf_word_t out = 0;
-        for (j=0; j<sizeof(decaf_word_t) && k<nbytes; j++,k++) {
-            out |= ((decaf_word_t)ser[k])<<(8*j);
-        }
-        s->limb[i] = out;
-    }
-}
-
-decaf_error_t API_NS(scalar_decode)(
-    scalar_t s,
-    const unsigned char ser[SCALAR_SER_BYTES]
-) {
-    unsigned int i;
-    scalar_decode_short(s, ser, SCALAR_SER_BYTES);
-    decaf_dsword_t accum = 0;
-    for (i=0; i<SCALAR_LIMBS; i++) {
-        accum = (accum + s->limb[i] - sc_p->limb[i]) >> WBITS;
-    }
-    /* Here accum == 0 or -1 */
-    
-    API_NS(scalar_mul)(s,s,API_NS(scalar_one)); /* ham-handed reduce */
-    
-    return decaf_succeed_if(~word_is_zero(accum));
-}
-
-void API_NS(scalar_destroy) (
-    scalar_t scalar
-) {
-    decaf_bzero(scalar, sizeof(scalar_t));
-}
-
-static INLINE void ignore_result ( decaf_bool_t boo ) {
-    (void)boo;
-}
-
-void API_NS(scalar_decode_long)(
-    scalar_t s,
-    const unsigned char *ser,
-    size_t ser_len
-) {
-    if (ser_len == 0) {
-        API_NS(scalar_copy)(s, API_NS(scalar_zero));
-        return;
-    }
-    
-    size_t i;
-    scalar_t t1, t2;
-
-    i = ser_len - (ser_len%SCALAR_SER_BYTES);
-    if (i==ser_len) i -= SCALAR_SER_BYTES;
-    
-    scalar_decode_short(t1, &ser[i], ser_len-i);
-
-    if (ser_len == sizeof(scalar_t)) {
-        assert(i==0);
-        /* ham-handed reduce */
-        API_NS(scalar_mul)(s,t1,API_NS(scalar_one));
-        API_NS(scalar_destroy)(t1);
-        return;
-    }
-
-    while (i) {
-        i -= SCALAR_SER_BYTES;
-        sc_montmul(t1,t1,sc_r2);
-        ignore_result( API_NS(scalar_decode)(t2, ser+i) );
-        API_NS(scalar_add)(t1, t1, t2);
-    }
-
-    API_NS(scalar_copy)(s, t1);
-    API_NS(scalar_destroy)(t1);
-    API_NS(scalar_destroy)(t2);
-}
-
-void API_NS(scalar_encode)(
-    unsigned char ser[SCALAR_SER_BYTES],
-    const scalar_t s
-) {
-    unsigned int i,j,k=0;
-    for (i=0; i<SCALAR_LIMBS; i++) {
-        for (j=0; j<sizeof(decaf_word_t); j++,k++) {
-            ser[k] = s->limb[i] >> (8*j);
-        }
-    }
-}
-
 /* Operations on [p]niels */
 static INLINE void
 cond_neg_niels (
     niels_t n,
     mask_t neg
 ) {
-    cond_swap(n->a, n->b, neg);
-    cond_neg(n->c, neg);
+    gf_cond_swap(n->a, n->b, neg);
+    gf_cond_neg(n->c, neg);
 }
 
 static NOINLINE void pt_to_pniels (
@@ -773,7 +398,7 @@ static NOINLINE void pt_to_pniels (
 ) {
     gf_sub ( b->n->a, a->y, a->x );
     gf_add ( b->n->b, a->x, a->y );
-    gf_mulw_sgn ( b->n->c, a->t, 2*TWISTED_D );
+    gf_mulw ( b->n->c, a->t, 2*TWISTED_D );
     gf_add ( b->z, a->z, a->z );
 }
 
@@ -915,7 +540,7 @@ void API_NS(point_scalarmul) (
         
     scalar_t scalar1x;
     API_NS(scalar_add)(scalar1x, scalar, point_scalarmul_adjustment);
-    sc_halve(scalar1x,scalar1x,sc_p);
+    API_NS(scalar_halve)(scalar1x,scalar1x);
     
     /* Set up a precomputed table with odd multiples of b. */
     pniels_t pn, multiples[NTABLE];
@@ -977,9 +602,9 @@ void API_NS(point_double_scalarmul) (
         
     scalar_t scalar1x, scalar2x;
     API_NS(scalar_add)(scalar1x, scalarb, point_scalarmul_adjustment);
-    sc_halve(scalar1x,scalar1x,sc_p);
+    API_NS(scalar_halve)(scalar1x,scalar1x);
     API_NS(scalar_add)(scalar2x, scalarc, point_scalarmul_adjustment);
-    sc_halve(scalar2x,scalar2x,sc_p);
+    API_NS(scalar_halve)(scalar2x,scalar2x);
     
     /* Set up a precomputed table with odd multiples of b. */
     pniels_t pn, multiples1[NTABLE], multiples2[NTABLE];
@@ -1053,9 +678,9 @@ void API_NS(point_dual_scalarmul) (
         
     scalar_t scalar1x, scalar2x;
     API_NS(scalar_add)(scalar1x, scalar1, point_scalarmul_adjustment);
-    sc_halve(scalar1x,scalar1x,sc_p);
+    API_NS(scalar_halve)(scalar1x,scalar1x);
     API_NS(scalar_add)(scalar2x, scalar2, point_scalarmul_adjustment);
-    sc_halve(scalar2x,scalar2x,sc_p);
+    API_NS(scalar_halve)(scalar2x,scalar2x);
     
     /* Set up a precomputed table with odd multiples of b. */
     point_t multiples1[NTABLE], multiples2[NTABLE], working, tmp;
@@ -1165,144 +790,6 @@ decaf_bool_t API_NS(point_eq) ( const point_t p, const point_t q ) {
     return mask_to_bool(succ);
 }
 
-void API_NS(point_from_hash_nonuniform) (
-    point_t p,
-    const unsigned char ser[SER_BYTES]
-) {
-    gf r0,r,a,b,c,N,e;
-    ignore_result(gf_deserialize(r0,ser,0));
-    gf_strong_reduce(r0);
-    gf_sqr(a,r0);
-#if P_MOD_8 == 5
-    /* r = QNR * r0^2 */
-    gf_mul(r,a,SQRT_MINUS_ONE);
-#elif P_MOD_8 == 3 || P_MOD_8 == 7
-    gf_sub(r,ZERO,a);
-#else
-#error "Only supporting p=3,5,7 mod 8"
-#endif
-
-    /* Compute D@c := (dr+a-d)(dr-ar-d) with a=1 */
-    gf_sub(a,r,ONE);
-    gf_mulw_sgn(b,a,EDWARDS_D); /* dr-d */
-    gf_add(a,b,ONE);
-    gf_sub(b,b,r);
-    gf_mul(c,a,b);
-    
-    /* compute N := (r+1)(a-2d) */
-    gf_add(a,r,ONE);
-    gf_mulw_sgn(N,a,1-2*EDWARDS_D);
-    
-    /* e = +-sqrt(1/ND) or +-r0 * sqrt(qnr/ND) */
-    gf_mul(a,c,N);
-    mask_t square = gf_isr(b,a);
-    cond_sel(c,r0,ONE,square); /* r? = square ? 1 : r0 */
-    gf_mul(e,b,c);
-    
-    /* s@a = +-|N.e| */
-    gf_mul(a,N,e);
-    cond_neg(a,gf_hibit(a)^square); /* NB this is - what is listen in the paper */
-    
-    /* t@b = -+ cN(r-1)((a-2d)e)^2 - 1 */
-    gf_mulw_sgn(c,e,1-2*EDWARDS_D); /* (a-2d)e */
-    gf_sqr(b,c);
-    gf_sub(e,r,ONE);
-    gf_mul(c,b,e);
-    gf_mul(b,c,N);
-    cond_neg(b,square);
-    gf_sub(b,b,ONE);
-
-    /* isogenize */
-#if IMAGINE_TWIST
-    gf_mul(c,a,SQRT_MINUS_ONE);
-    gf_copy(a,c);
-#endif
-    
-    gf_sqr(c,a); /* s^2 */
-    gf_add(a,a,a); /* 2s */
-    gf_add(e,c,ONE);
-    gf_mul(p->t,a,e); /* 2s(1+s^2) */
-    gf_mul(p->x,a,b); /* 2st */
-    gf_sub(a,ONE,c);
-    gf_mul(p->y,e,a); /* (1+s^2)(1-s^2) */
-    gf_mul(p->z,a,b); /* (1-s^2)t */
-    
-    assert(API_NS(point_valid)(p));
-}
-
-decaf_error_t
-API_NS(invert_elligator_nonuniform) (
-    unsigned char recovered_hash[SER_BYTES],
-    const point_t p,
-    uint16_t hint_
-) {
-    mask_t hint = hint_;
-    mask_t sgn_s = -(hint & 1),
-        sgn_t_over_s = -(hint>>1 & 1),
-        sgn_r0 = -(hint>>2 & 1), /* FIXME: but it's SER_BYTES ... */
-        sgn_ed_T = -(hint>>3 & 1);
-    gf a, b, c, d;
-    deisogenize(a,c,p,sgn_s,sgn_t_over_s,sgn_ed_T);
-    
-    /* ok, a = s; c = -t/s */
-    gf_mul(b,c,a);
-    gf_sub(b,ONE,b); /* t+1 */
-    gf_sqr(c,a); /* s^2 */
-    mask_t is_identity = gf_eq(p->t,ZERO);
-    {
-        /* identity adjustments */
-        /* in case of identity, currently c=0, t=0, b=1, will encode to 1 */
-        /* if hint is 0, -> 0 */
-        /* if hint is to neg t/s, then go to infinity, effectively set s to 1 */
-        cond_sel(c,c,ONE,is_identity & sgn_t_over_s);
-        cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s); /* identity adjust */        
-    }
-    gf_mulw_sgn(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */
-    gf_add(a,b,d); /* num? */
-    gf_sub(d,d,b); /* den? */
-    gf_mul(b,a,d); /* n*d */
-    cond_sel(a,d,a,sgn_s);
-#if P_MOD_8 == 5
-    gf_mul(d,b,SQRT_MINUS_ONE);
-#else
-    gf_sub(d,ZERO,b);
-#endif
-    mask_t succ = gf_isr(c,d)|gf_eq(d,ZERO);
-    gf_mul(b,a,c);
-    cond_neg(b, sgn_r0^gf_hibit(b));
-    
-    succ &= ~(gf_eq(b,ZERO) & sgn_r0);
-#if COFACTOR == 8
-    succ &= ~(is_identity & sgn_ed_T); /* NB: there are no preimages of rotated identity. */
-#endif
-    
-    gf_serialize(recovered_hash,b,1); /* FIXME: ,0 */
-    /* TODO: deal with overflow flag */
-    return decaf_succeed_if(mask_to_bool(succ));
-}
-
-void API_NS(point_from_hash_uniform) (
-    point_t pt,
-    const unsigned char hashed_data[2*SER_BYTES]
-) {
-    point_t pt2;
-    API_NS(point_from_hash_nonuniform)(pt,hashed_data);
-    API_NS(point_from_hash_nonuniform)(pt2,&hashed_data[SER_BYTES]);
-    API_NS(point_add)(pt,pt,pt2);
-}
-
-decaf_error_t
-API_NS(invert_elligator_uniform) (
-    unsigned char partial_hash[2*SER_BYTES],
-    const point_t p,
-    uint16_t hint
-) {
-    point_t pt2;
-    API_NS(point_from_hash_nonuniform)(pt2,&partial_hash[SER_BYTES]);
-    API_NS(point_sub)(pt2,p,pt2);
-    return API_NS(invert_elligator_nonuniform)(partial_hash,pt2,hint);
-}
-
 decaf_bool_t API_NS(point_valid) (
     const point_t p
 ) {
@@ -1314,7 +801,7 @@ decaf_bool_t API_NS(point_valid) (
     gf_sqr(b,p->y);
     gf_sub(a,b,a);
     gf_sqr(b,p->t);
-    gf_mulw_sgn(c,b,TWISTED_D);
+    gf_mulw(c,b,TWISTED_D);
     gf_sqr(b,p->z);
     gf_add(b,b,c);
     out &= gf_eq(a,b);
@@ -1349,7 +836,7 @@ void API_NS(point_debugging_pscale) (
     gf gfac,tmp;
     /* NB this means you'll never pscale by negative numbers for p521 */
     ignore_result(gf_deserialize(gfac,factor,0));
-    cond_sel(gfac,gfac,ONE,gf_eq(gfac,ZERO));
+    gf_cond_sel(gfac,gfac,ONE,gf_eq(gfac,ZERO));
     gf_mul(tmp,p->x,gfac);
     gf_copy(q->x,tmp);
     gf_mul(tmp,p->y,gfac);
@@ -1498,7 +985,7 @@ void API_NS(precomputed_scalarmul) (
     
     scalar_t scalar1x;
     API_NS(scalar_add)(scalar1x, scalar, precomputed_scalarmul_adjustment);
-    sc_halve(scalar1x,scalar1x,sc_p);
+    API_NS(scalar_halve)(scalar1x,scalar1x);
     
     niels_t ni;
     
@@ -1543,15 +1030,6 @@ void API_NS(point_cond_sel) (
     constant_time_select(out,a,b,sizeof(point_t),bool_to_mask(pick_b),0);
 }
 
-void API_NS(scalar_cond_sel) (
-    scalar_t out,
-    const scalar_t a,
-    const scalar_t b,
-    decaf_bool_t pick_b
-) {
-    constant_time_select(out,a,b,sizeof(scalar_t),bool_to_mask(pick_b),sizeof(out->limb[0]));
-}
-
 /* FUTURE: restore Curve25519 Montgomery ladder? */
 decaf_error_t API_NS(direct_scalarmul) (
     uint8_t scaled[SER_BYTES],
@@ -1596,8 +1074,8 @@ decaf_error_t API_NS(x_direct_scalarmul) (
         k_t = -k_t; /* set to all 0s or all 1s */
         
         swap ^= k_t;
-        cond_swap(x2,x3,swap);
-        cond_swap(z2,z3,swap);
+        gf_cond_swap(x2,x3,swap);
+        gf_cond_swap(z2,z3,swap);
         swap = k_t;
         
         gf_add_nr(t1,x2,z2); /* A = x2 + z2 */
@@ -1617,14 +1095,14 @@ decaf_error_t API_NS(x_direct_scalarmul) (
         gf_mul(x2,z2,t1);    /* x2 = AA*BB */
         gf_sub_nr(t2,z2,t1); /* E = AA-BB */
         
-        gf_mulw_sgn(t1,t2,-EDWARDS_D); /* E*-d = a24*E */
+        gf_mulw(t1,t2,-EDWARDS_D); /* E*-d = a24*E */
         gf_add_nr(t1,t1,z2); /* AA + a24*E */
         gf_mul(z2,t2,t1); /* z2 = E(AA+a24*E) */
     }
     
     /* Finish */
-    cond_swap(x2,x3,swap);
-    cond_swap(z2,z3,swap);
+    gf_cond_swap(x2,x3,swap);
+    gf_cond_swap(z2,z3,swap);
     gf_invert(z2,z2);
     gf_mul(x1,x2,z2);
     gf_serialize(out,x1,1);
@@ -1668,13 +1146,13 @@ void API_NS(x_base_scalarmul) (
      * Jacobi -> Edwards -> Jacobi -> Montgomery,
      * we pick up only a factor of 2 over Jacobi -> Montgomery. 
      */
-    sc_halve(the_scalar,the_scalar,sc_p);
+    API_NS(scalar_halve)(the_scalar,the_scalar);
 #if COFACTOR==8
     /* If the base point isn't in the prime-order subgroup (PERF:
      * guarantee that it is?) then a 4-isogeny isn't necessarily
      * enough to clear the cofactor.  So add another doubling.
      */
-    sc_halve(the_scalar,the_scalar,sc_p);
+    API_NS(scalar_halve)(the_scalar,the_scalar);
 #endif
     point_t p;
     API_NS(precomputed_scalarmul)(p,API_NS(precomputed_base),the_scalar);
@@ -1781,6 +1259,7 @@ prepare_wnaf_table(
     }
     
     API_NS(point_destroy)(tmp);
+    decaf_bzero(twop,sizeof(twop));
 }
 
 extern const gf API_NS(precomputed_wnaf_as_fe)[];
diff --git a/src/per_curve/decaf.tmpl.h b/src/per_curve/decaf.tmpl.h
index f260256..1444f56 100644
--- a/src/per_curve/decaf.tmpl.h
+++ b/src/per_curve/decaf.tmpl.h
@@ -167,6 +167,16 @@ void $(c_ns)_scalar_mul (
     const $(c_ns)_scalar_t a,
     const $(c_ns)_scalar_t b
 ) API_VIS NONNULL3 NOINLINE;
+        
+/**
+* @brief Halve a scalar.  The scalars may use the same memory.
+* @param [in] a A scalar.
+* @param [out] out a/2.
+*/
+void $(c_ns)_scalar_halve (
+   $(c_ns)_scalar_t out,
+   const $(c_ns)_scalar_t a
+) API_VIS NONNULL2 NOINLINE;
 
 /**
  * @brief Invert a scalar.  When passed zero, return 0.  The input and output may alias.
diff --git a/src/per_curve/decaf.tmpl.hxx b/src/per_curve/decaf.tmpl.hxx
index fc8ca41..d0aa0e3 100644
--- a/src/per_curve/decaf.tmpl.hxx
+++ b/src/per_curve/decaf.tmpl.hxx
@@ -193,6 +193,9 @@ public:
     /** Divide by inverting q. If q == 0, return 0. */
     inline Scalar &operator/=(const Scalar &q) throw(CryptoException) { return *this *= q.inverse(); }
 
+    /** Return half this scalar.  Much faster than /2. */
+    inline Scalar half() const { Scalar out; $(c_ns)_scalar_halve(out.s,s); return out; }
+
     /** Compare in constant time */
     inline bool operator!=(const Scalar &q) const NOEXCEPT { return !(*this == q); }
 
diff --git a/src/per_curve/elligator.tmpl.c b/src/per_curve/elligator.tmpl.c
new file mode 100644
index 0000000..ee30c3a
--- /dev/null
+++ b/src/per_curve/elligator.tmpl.c
@@ -0,0 +1,149 @@
+/** @brief Elligator high-level functions. */
+
+#include "word.h"
+#include "field.h"
+#include <decaf.h>
+
+/* Template stuff */
+#define API_NS(_id) $(c_ns)_##_id
+#define point_t API_NS(point_t)
+#define IMAGINE_TWIST $(imagine_twist)
+#define COFACTOR $(cofactor)
+static const int EDWARDS_D = $(d);
+/* End of template stuff */
+
+extern void API_NS(deisogenize) (
+    gf_s *__restrict__ s,
+    gf_s *__restrict__ minus_t_over_s,
+    const point_t p,
+    mask_t toggle_hibit_s,
+    mask_t toggle_hibit_t_over_s,
+    mask_t toggle_rotation
+);
+
+void API_NS(point_from_hash_nonuniform) (
+    point_t p,
+    const unsigned char ser[SER_BYTES]
+) {
+    gf r0,r,a,b,c,N,e;
+    ignore_result(gf_deserialize(r0,ser,0));
+    gf_strong_reduce(r0);
+    gf_sqr(a,r0);
+    gf_mul_qnr(r,a);
+
+    /* Compute D@c := (dr+a-d)(dr-ar-d) with a=1 */
+    gf_sub(a,r,ONE);
+    gf_mulw(b,a,EDWARDS_D); /* dr-d */
+    gf_add(a,b,ONE);
+    gf_sub(b,b,r);
+    gf_mul(c,a,b);
+    
+    /* compute N := (r+1)(a-2d) */
+    gf_add(a,r,ONE);
+    gf_mulw(N,a,1-2*EDWARDS_D);
+    
+    /* e = +-sqrt(1/ND) or +-r0 * sqrt(qnr/ND) */
+    gf_mul(a,c,N);
+    mask_t square = gf_isr(b,a);
+    gf_cond_sel(c,r0,ONE,square); /* r? = square ? 1 : r0 */
+    gf_mul(e,b,c);
+    
+    /* s@a = +-|N.e| */
+    gf_mul(a,N,e);
+    gf_cond_neg(a,gf_hibit(a)^square); /* NB this is - what is listen in the paper */
+    
+    /* t@b = -+ cN(r-1)((a-2d)e)^2 - 1 */
+    gf_mulw(c,e,1-2*EDWARDS_D); /* (a-2d)e */
+    gf_sqr(b,c);
+    gf_sub(e,r,ONE);
+    gf_mul(c,b,e);
+    gf_mul(b,c,N);
+    gf_cond_neg(b,square);
+    gf_sub(b,b,ONE);
+
+    /* isogenize */
+#if IMAGINE_TWIST
+    gf_mul(c,a,SQRT_MINUS_ONE);
+    gf_copy(a,c);
+#endif
+    
+    gf_sqr(c,a); /* s^2 */
+    gf_add(a,a,a); /* 2s */
+    gf_add(e,c,ONE);
+    gf_mul(p->t,a,e); /* 2s(1+s^2) */
+    gf_mul(p->x,a,b); /* 2st */
+    gf_sub(a,ONE,c);
+    gf_mul(p->y,e,a); /* (1+s^2)(1-s^2) */
+    gf_mul(p->z,a,b); /* (1-s^2)t */
+    
+    assert(API_NS(point_valid)(p));
+}
+
+void API_NS(point_from_hash_uniform) (
+    point_t pt,
+    const unsigned char hashed_data[2*SER_BYTES]
+) {
+    point_t pt2;
+    API_NS(point_from_hash_nonuniform)(pt,hashed_data);
+    API_NS(point_from_hash_nonuniform)(pt2,&hashed_data[SER_BYTES]);
+    API_NS(point_add)(pt,pt,pt2);
+}
+
+decaf_error_t
+API_NS(invert_elligator_nonuniform) (
+    unsigned char recovered_hash[SER_BYTES],
+    const point_t p,
+    uint16_t hint_
+) {
+    mask_t hint = hint_;
+    mask_t sgn_s = -(hint & 1),
+        sgn_t_over_s = -(hint>>1 & 1),
+        sgn_r0 = -(hint>>2 & 1), /* FIXME: but it's SER_BYTES ... */
+        sgn_ed_T = -(hint>>3 & 1);
+    gf a, b, c, d;
+    API_NS(deisogenize)(a,c,p,sgn_s,sgn_t_over_s,sgn_ed_T);
+    
+    /* ok, a = s; c = -t/s */
+    gf_mul(b,c,a);
+    gf_sub(b,ONE,b); /* t+1 */
+    gf_sqr(c,a); /* s^2 */
+    mask_t is_identity = gf_eq(p->t,ZERO);
+
+    /* identity adjustments */
+    /* in case of identity, currently c=0, t=0, b=1, will encode to 1 */
+    /* if hint is 0, -> 0 */
+    /* if hint is to neg t/s, then go to infinity, effectively set s to 1 */
+    gf_cond_sel(c,c,ONE,is_identity & sgn_t_over_s);
+    gf_cond_sel(b,b,ZERO,is_identity & ~sgn_t_over_s & ~sgn_s);
+        
+    gf_mulw(d,c,2*EDWARDS_D-1); /* $d = (2d-a)s^2 */
+    gf_add(a,b,d); /* num? */
+    gf_sub(d,d,b); /* den? */
+    gf_mul(b,a,d); /* n*d */
+    gf_cond_sel(a,d,a,sgn_s);
+    gf_mul_qnr(d,b);
+    mask_t succ = gf_isr(c,d)|gf_eq(d,ZERO);
+    gf_mul(b,a,c);
+    gf_cond_neg(b, sgn_r0^gf_hibit(b));
+    
+    succ &= ~(gf_eq(b,ZERO) & sgn_r0);
+#if COFACTOR == 8
+    succ &= ~(is_identity & sgn_ed_T); /* NB: there are no preimages of rotated identity. */
+#endif
+    
+    gf_serialize(recovered_hash,b,1); /* FIXME: ,0 */
+    /* TODO: deal with overflow flag */
+    return decaf_succeed_if(mask_to_bool(succ));
+}
+
+decaf_error_t
+API_NS(invert_elligator_uniform) (
+    unsigned char partial_hash[2*SER_BYTES],
+    const point_t p,
+    uint16_t hint
+) {
+    point_t pt2;
+    API_NS(point_from_hash_nonuniform)(pt2,&partial_hash[SER_BYTES]);
+    API_NS(point_sub)(pt2,p,pt2);
+    return API_NS(invert_elligator_nonuniform)(partial_hash,pt2,hint);
+}
diff --git a/src/per_curve/scalar.tmpl.c b/src/per_curve/scalar.tmpl.c
new file mode 100644
index 0000000..78bb2f6
--- /dev/null
+++ b/src/per_curve/scalar.tmpl.c
@@ -0,0 +1,328 @@
+/** @brief Decaf high-level functions. */
+
+#include "word.h"
+#include "constant_time.h"
+#include <decaf.h>
+
+/* Template stuff */
+#define API_NS(_id) $(c_ns)_##_id
+#define SCALAR_BITS $(C_NS)_SCALAR_BITS
+#define SCALAR_SER_BYTES $(C_NS)_SCALAR_BYTES
+#define SCALAR_LIMBS $(C_NS)_SCALAR_LIMBS
+#define scalar_t API_NS(scalar_t)
+
+static const decaf_word_t MONTGOMERY_FACTOR = (decaf_word_t)0x$("%x" % pow(-q,2**64-1,2**64))ull;
+static const scalar_t sc_p = {{{
+    $(ser(q,64,"SC_LIMB"))
+}}}, sc_r2 = {{{
+    $(ser(((2**128)**((scalar_bits+63)/64))%q,64,"SC_LIMB"))
+}}};
+/* End of template stuff */
+
+#define WBITS DECAF_WORD_BITS /* NB this may be different from ARCH_WORD_BITS */
+
+const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
+
+/** {extra,accum} - sub +? p
+ * Must have extra <= 1
+ */
+static NOINLINE void sc_subx(
+    scalar_t out,
+    const decaf_word_t accum[SCALAR_LIMBS],
+    const scalar_t sub,
+    const scalar_t p,
+    decaf_word_t extra
+) {
+    decaf_dsword_t chain = 0;
+    unsigned int i;
+    for (i=0; i<SCALAR_LIMBS; i++) {
+        chain = (chain + accum[i]) - sub->limb[i];
+        out->limb[i] = chain;
+        chain >>= WBITS;
+    }
+    decaf_word_t borrow = chain+extra; /* = 0 or -1 */
+    
+    chain = 0;
+    for (i=0; i<SCALAR_LIMBS; i++) {
+        chain = (chain + out->limb[i]) + (p->limb[i] & borrow);
+        out->limb[i] = chain;
+        chain >>= WBITS;
+    }
+}
+
+static NOINLINE void sc_montmul (
+    scalar_t out,
+    const scalar_t a,
+    const scalar_t b
+) {
+    unsigned int i,j;
+    decaf_word_t accum[SCALAR_LIMBS+1] = {0};
+    decaf_word_t hi_carry = 0;
+    
+    for (i=0; i<SCALAR_LIMBS; i++) {
+        decaf_word_t mand = a->limb[i];
+        const decaf_word_t *mier = b->limb;
+        
+        decaf_dword_t chain = 0;
+        for (j=0; j<SCALAR_LIMBS; j++) {
+            chain += ((decaf_dword_t)mand)*mier[j] + accum[j];
+            accum[j] = chain;
+            chain >>= WBITS;
+        }
+        accum[j] = chain;
+        
+        mand = accum[0] * MONTGOMERY_FACTOR;
+        chain = 0;
+        mier = sc_p->limb;
+        for (j=0; j<SCALAR_LIMBS; j++) {
+            chain += (decaf_dword_t)mand*mier[j] + accum[j];
+            if (j) accum[j-1] = chain;
+            chain >>= WBITS;
+        }
+        chain += accum[j];
+        chain += hi_carry;
+        accum[j-1] = chain;
+        hi_carry = chain >> WBITS;
+    }
+    
+    sc_subx(out, accum, sc_p, sc_p, hi_carry);
+}
+
+void API_NS(scalar_mul) (
+    scalar_t out,
+    const scalar_t a,
+    const scalar_t b
+) {
+    sc_montmul(out,a,b);
+    sc_montmul(out,out,sc_r2);
+}
+
+/* PERF: could implement this */
+static INLINE void sc_montsqr (scalar_t out, const scalar_t a) {
+    sc_montmul(out,a,a);
+}
+
+decaf_error_t API_NS(scalar_invert) (
+    scalar_t out,
+    const scalar_t a
+) {
+    /* Fermat's little theorem, sliding window.
+     * Sliding window is fine here because the modulus isn't secret.
+     */
+    const int SCALAR_WINDOW_BITS = 3;
+    scalar_t precmp[1<<SCALAR_WINDOW_BITS];
+    const int LAST = (1<<SCALAR_WINDOW_BITS)-1;
+
+    /* Precompute precmp = [a^1,a^3,...] */
+    sc_montmul(precmp[0],a,sc_r2);
+    if (LAST > 0) sc_montmul(precmp[LAST],precmp[0],precmp[0]);
+
+    int i;
+    for (i=1; i<=LAST; i++) {
+        sc_montmul(precmp[i],precmp[i-1],precmp[LAST]);
+    }
+    
+    /* Sliding window */
+    unsigned residue = 0, trailing = 0, started = 0;
+    for (i=SCALAR_BITS-1; i>=-SCALAR_WINDOW_BITS; i--) {
+        
+        if (started) sc_montsqr(out,out);
+        
+        decaf_word_t w = (i>=0) ? sc_p->limb[i/WBITS] : 0;
+        if (i >= 0 && i<WBITS) {
+            assert(w >= 2);
+            w-=2;
+        }
+        
+        residue = (residue<<1) | ((w>>(i%WBITS))&1);
+        if (residue>>SCALAR_WINDOW_BITS != 0) {
+            assert(trailing == 0);
+            trailing = residue;
+            residue = 0;
+        }
+        
+        if (trailing > 0 && (trailing & ((1<<SCALAR_WINDOW_BITS)-1)) == 0) {
+            if (started) {
+                sc_montmul(out,out,precmp[trailing>>(SCALAR_WINDOW_BITS+1)]);
+            } else {
+                API_NS(scalar_copy)(out,precmp[trailing>>(SCALAR_WINDOW_BITS+1)]);
+                started = 1;
+            }
+            trailing = 0;
+        }
+        trailing <<= 1;
+        
+    }
+    assert(residue==0);
+    assert(trailing==0);
+    
+    /* Demontgomerize */
+    sc_montmul(out,out,API_NS(scalar_one));
+    decaf_bzero(precmp, sizeof(precmp));
+    return decaf_succeed_if(~API_NS(scalar_eq)(out,API_NS(scalar_zero)));
+}
+
+void API_NS(scalar_sub) (
+    scalar_t out,
+    const scalar_t a,
+    const scalar_t b
+) {
+    sc_subx(out, a->limb, b, sc_p, 0);
+}
+
+void API_NS(scalar_add) (
+    scalar_t out,
+    const scalar_t a,
+    const scalar_t b
+) {
+    decaf_dword_t chain = 0;
+    unsigned int i;
+    for (i=0; i<SCALAR_LIMBS; i++) {
+        chain = (chain + a->limb[i]) + b->limb[i];
+        out->limb[i] = chain;
+        chain >>= WBITS;
+    }
+    sc_subx(out, out->limb, sc_p, sc_p, chain);
+}
+
+void
+API_NS(scalar_set_unsigned) (
+    scalar_t out,
+    uint64_t w
+) {
+    memset(out,0,sizeof(scalar_t));
+    unsigned int i = 0;
+    for (; i<sizeof(uint64_t)/sizeof(decaf_word_t); i++) {
+        out->limb[i] = w;
+        w >>= (sizeof(uint64_t) > sizeof(decaf_word_t)) ? 8*sizeof(decaf_word_t) : 0;
+    }
+}
+
+decaf_bool_t
+API_NS(scalar_eq) (
+    const scalar_t a,
+    const scalar_t b
+) {
+    decaf_word_t diff = 0;
+    unsigned int i;
+    for (i=0; i<SCALAR_LIMBS; i++) {
+        diff |= a->limb[i] ^ b->limb[i];
+    }
+    return mask_to_bool(word_is_zero(diff));
+}
+
+static INLINE void scalar_decode_short (
+    scalar_t s,
+    const unsigned char *ser,
+    unsigned int nbytes
+) {
+    unsigned int i,j,k=0;
+    for (i=0; i<SCALAR_LIMBS; i++) {
+        decaf_word_t out = 0;
+        for (j=0; j<sizeof(decaf_word_t) && k<nbytes; j++,k++) {
+            out |= ((decaf_word_t)ser[k])<<(8*j);
+        }
+        s->limb[i] = out;
+    }
+}
+
+decaf_error_t API_NS(scalar_decode)(
+    scalar_t s,
+    const unsigned char ser[SCALAR_SER_BYTES]
+) {
+    unsigned int i;
+    scalar_decode_short(s, ser, SCALAR_SER_BYTES);
+    decaf_dsword_t accum = 0;
+    for (i=0; i<SCALAR_LIMBS; i++) {
+        accum = (accum + s->limb[i] - sc_p->limb[i]) >> WBITS;
+    }
+    /* Here accum == 0 or -1 */
+    
+    API_NS(scalar_mul)(s,s,API_NS(scalar_one)); /* ham-handed reduce */
+    
+    return decaf_succeed_if(~word_is_zero(accum));
+}
+
+void API_NS(scalar_destroy) (
+    scalar_t scalar
+) {
+    decaf_bzero(scalar, sizeof(scalar_t));
+}
+
+void API_NS(scalar_decode_long)(
+    scalar_t s,
+    const unsigned char *ser,
+    size_t ser_len
+) {
+    if (ser_len == 0) {
+        API_NS(scalar_copy)(s, API_NS(scalar_zero));
+        return;
+    }
+    
+    size_t i;
+    scalar_t t1, t2;
+
+    i = ser_len - (ser_len%SCALAR_SER_BYTES);
+    if (i==ser_len) i -= SCALAR_SER_BYTES;
+    
+    scalar_decode_short(t1, &ser[i], ser_len-i);
+
+    if (ser_len == sizeof(scalar_t)) {
+        assert(i==0);
+        /* ham-handed reduce */
+        API_NS(scalar_mul)(s,t1,API_NS(scalar_one));
+        API_NS(scalar_destroy)(t1);
+        return;
+    }
+
+    while (i) {
+        i -= SCALAR_SER_BYTES;
+        sc_montmul(t1,t1,sc_r2);
+        ignore_result( API_NS(scalar_decode)(t2, ser+i) );
+        API_NS(scalar_add)(t1, t1, t2);
+    }
+
+    API_NS(scalar_copy)(s, t1);
+    API_NS(scalar_destroy)(t1);
+    API_NS(scalar_destroy)(t2);
+}
+
+void API_NS(scalar_encode)(
+    unsigned char ser[SCALAR_SER_BYTES],
+    const scalar_t s
+) {
+    unsigned int i,j,k=0;
+    for (i=0; i<SCALAR_LIMBS; i++) {
+        for (j=0; j<sizeof(decaf_word_t); j++,k++) {
+            ser[k] = s->limb[i] >> (8*j);
+        }
+    }
+}
+
+void API_NS(scalar_cond_sel) (
+    scalar_t out,
+    const scalar_t a,
+    const scalar_t b,
+    decaf_bool_t pick_b
+) {
+    constant_time_select(out,a,b,sizeof(scalar_t),bool_to_mask(pick_b),sizeof(out->limb[0]));
+}
+
+void API_NS(scalar_halve) (
+    scalar_t out,
+    const scalar_t a
+) {
+    decaf_word_t mask = -(a->limb[0] & 1);
+    decaf_dword_t chain = 0;
+    unsigned int i;
+    for (i=0; i<SCALAR_LIMBS; i++) {
+        chain = (chain + a->limb[i]) + (sc_p->limb[i] & mask);
+        out->limb[i] = chain;
+        chain >>= DECAF_WORD_BITS;
+    }
+    for (i=0; i<SCALAR_LIMBS-1; i++) {
+        out->limb[i] = out->limb[i]>>1 | out->limb[i+1]<<(WBITS-1);
+    }
+    out->limb[i] = out->limb[i]>>1 | chain<<(WBITS-1);
+}
+
diff --git a/src/per_field/f_field.tmpl.h b/src/per_field/f_field.tmpl.h
index d03c5ad..4faf9b8 100644
--- a/src/per_field/f_field.tmpl.h
+++ b/src/per_field/f_field.tmpl.h
@@ -33,7 +33,7 @@ typedef struct gf_$(gf_shortname)_s {
 #define gf_strong_reduce  gf_$(gf_shortname)_strong_reduce
 #define gf_mul            gf_$(gf_shortname)_mul
 #define gf_sqr            gf_$(gf_shortname)_sqr
-#define gf_mulw           gf_$(gf_shortname)_mulw
+#define gf_mulw_unsigned  gf_$(gf_shortname)_mulw_unsigned
 #define gf_isr            gf_$(gf_shortname)_isr
 #define gf_serialize      gf_$(gf_shortname)_serialize
 #define gf_deserialize    gf_$(gf_shortname)_deserialize
@@ -62,7 +62,7 @@ void gf_strong_reduce (gf inout);
 void gf_add (gf out, const gf a, const gf b);
 void gf_sub (gf out, const gf a, const gf b);
 void gf_mul (gf_s *__restrict__ out, const gf a, const gf b);
-void gf_mulw (gf_s *__restrict__ out, const gf a, uint32_t b);
+void gf_mulw_unsigned (gf_s *__restrict__ out, const gf a, uint32_t b);
 void gf_sqr (gf_s *__restrict__ out, const gf a);
 mask_t gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0.  Return true if successful */
 mask_t gf_eq (const gf x, const gf y);