From 4218223dd730034e5c47c1d6832a9ed8455bf484 Mon Sep 17 00:00:00 2001
From: Michael Hamburg <mike@shiftleft.org>
Date: Thu, 14 Jan 2016 16:36:30 -0800
Subject: [PATCH] generate most of f_impl.h.  Not tested on most arches yet :-(

---
 src/gen_headers/f_field_h.py             |  40 +++++++--
 src/p25519/arch_ref64/f_impl.c           |   2 +-
 src/p25519/arch_ref64/f_impl.h           |  78 -----------------
 src/p25519/arch_x86_64/f_impl.c          |   2 +-
 src/p25519/arch_x86_64/f_impl.h          |  90 --------------------
 src/p448/arch_32/f_impl.c                |   3 +-
 src/p448/arch_32/f_impl.h                |  76 -----------------
 src/p448/arch_arm_32/f_impl.c            |   3 +-
 src/p448/arch_arm_32/f_impl.h            |  76 -----------------
 src/p448/arch_neon_experimental/f_impl.c |   3 +-
 src/p448/arch_neon_experimental/f_impl.h |  78 +----------------
 src/p448/arch_ref64/f_impl.c             |   2 +-
 src/p448/arch_ref64/f_impl.h             |  76 -----------------
 src/p448/arch_x86_64/f_impl.c            |   2 +-
 src/p448/arch_x86_64/f_impl.h            |  79 -----------------
 src/p480/arch_x86_64/f_impl.c            |  41 +++++----
 src/p480/arch_x86_64/f_impl.h            | 100 +++++++++-------------
 src/p521/arch_ref64/f_impl.c             |  40 ++++-----
 src/p521/arch_ref64/f_impl.h             |  98 +++++++++------------
 src/p521/arch_x86_64_r12/f_impl.c        |  40 ++++-----
 src/p521/arch_x86_64_r12/f_impl.h        | 104 +++--------------------
 21 files changed, 196 insertions(+), 837 deletions(-)

diff --git a/src/gen_headers/f_field_h.py b/src/gen_headers/f_field_h.py
index 85e45f8..8a01e48 100644
--- a/src/gen_headers/f_field_h.py
+++ b/src/gen_headers/f_field_h.py
@@ -9,22 +9,52 @@ f_field_h = gen_file(
 #include "constant_time.h"
 #include <string.h>
 
-#include "f_impl.h"
+
+#include "decaf/decaf_%(gf_bits)s.h" /* HACK in genheader */
+#include "word.h"
+
 #define GF_LIT_LIMB_BITS  %(gf_lit_limb_bits)d
 #define GF_BITS           %(gf_bits)d
 #define gf                gf_%(gf_shortname)s_t
 #define gf_s              gf_%(gf_shortname)s_s
-#define gf_mul            gf_%(gf_shortname)s_mul
-#define gf_sqr            gf_%(gf_shortname)s_sqr
+#define gf_copy           gf_%(gf_shortname)s_copy
 #define gf_add_RAW        gf_%(gf_shortname)s_add_RAW
 #define gf_sub_RAW        gf_%(gf_shortname)s_sub_RAW
-#define gf_mulw           gf_%(gf_shortname)s_mulw
 #define gf_bias           gf_%(gf_shortname)s_bias
-#define gf_isr            gf_%(gf_shortname)s_isr
 #define gf_weak_reduce    gf_%(gf_shortname)s_weak_reduce
 #define gf_strong_reduce  gf_%(gf_shortname)s_strong_reduce
+#define gf_mul            gf_%(gf_shortname)s_mul
+#define gf_sqr            gf_%(gf_shortname)s_sqr
+#define gf_mulw           gf_%(gf_shortname)s_mulw
+#define gf_isr            gf_%(gf_shortname)s_isr
 #define gf_serialize      gf_%(gf_shortname)s_serialize
 #define gf_deserialize    gf_%(gf_shortname)s_deserialize
 
 #define SQRT_MINUS_ONE    P%(gf_shortname)s_SQRT_MINUS_ONE /* might not be defined */
+
+#define INLINE_UNUSED __inline__ __attribute__((unused,always_inline))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* gf_copy is defined right here; the other inline helpers are defined in f_impl.h, included below */
+static INLINE_UNUSED void gf_copy (gf out, const gf a) { *out = *a; }
+static INLINE_UNUSED void gf_add_RAW (gf out, const gf a, const gf b);
+static INLINE_UNUSED void gf_sub_RAW (gf out, const gf a, const gf b);
+static INLINE_UNUSED void gf_bias (gf inout, int amount);
+static INLINE_UNUSED void gf_weak_reduce (gf inout);
+
+void gf_strong_reduce (gf inout);   
+void gf_mul (gf_s *__restrict__ out, const gf a, const gf b);
+void gf_mulw (gf_s *__restrict__ out, const gf a, uint64_t b);
+void gf_sqr (gf_s *__restrict__ out, const gf a);
+void gf_serialize (uint8_t *serial, const gf x);
+mask_t gf_deserialize (gf x, const uint8_t serial[(GF_BITS-1)/8+1]);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#include "f_impl.h" /* Bring in the inline implementations */
 """)
\ No newline at end of file
diff --git a/src/p25519/arch_ref64/f_impl.c b/src/p25519/arch_ref64/f_impl.c
index d58ba73..8f24012 100644
--- a/src/p25519/arch_ref64/f_impl.c
+++ b/src/p25519/arch_ref64/f_impl.c
@@ -2,7 +2,7 @@
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
 
-#include "f_impl.h"
+#include "f_field.h"
 
 static __inline__ __uint128_t widemul(
     const uint64_t a,
diff --git a/src/p25519/arch_ref64/f_impl.h b/src/p25519/arch_ref64/f_impl.h
index 1cb39ce..835151a 100644
--- a/src/p25519/arch_ref64/f_impl.h
+++ b/src/p25519/arch_ref64/f_impl.h
@@ -14,88 +14,10 @@
 #define LBITS 51
 #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}
 
-/*
-#define FIELD_LITERAL(a,b,c,d) {{ \
-    (a##ull) & LMASK, \
-    ((a##ull)>>51 | (b##ull)<<13) & LMASK, \
-    ((b##ull)>>38 | (c##ull)<<26) & LMASK, \
-    ((c##ull)>>25 | (d##ull)<<39) & LMASK, \
-    (d##ull)>>12 \
-}}
-*/
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-static __inline__ void
-gf_25519_add_RAW (
-    gf_25519_t out,
-    const gf_25519_t a,
-    const gf_25519_t b
-) __attribute__((unused));
-             
-static __inline__ void
-gf_25519_sub_RAW (
-    gf_25519_t out,
-    const gf_25519_t a,
-    const gf_25519_t b
-) __attribute__((unused));
-             
-static __inline__ void
-gf_25519_copy (
-    gf_25519_t out,
-    const gf_25519_t a
-) __attribute__((unused));
-             
-static __inline__ void
-gf_25519_weak_reduce (
-    gf_25519_t inout
-) __attribute__((unused));
-             
-void
-gf_25519_strong_reduce (
-    gf_25519_t inout
-);
-
-static __inline__ void
-gf_25519_bias (
-    gf_25519_t inout,
-    int amount
-) __attribute__((unused));
-         
-void
-gf_25519_mul (
-    gf_25519_s *__restrict__ out,
-    const gf_25519_t a,
-    const gf_25519_t b
-);
-
-void
-gf_25519_mulw (
-    gf_25519_s *__restrict__ out,
-    const gf_25519_t a,
-    uint64_t b
-);
-
-void
-gf_25519_sqr (
-    gf_25519_s *__restrict__ out,
-    const gf_25519_t a
-);
-
-void
-gf_25519_serialize (
-    uint8_t serial[32],
-    const gf_25519_t x
-);
-
-mask_t
-gf_25519_deserialize (
-    gf_25519_t x,
-    const uint8_t serial[32]
-);
-
 /* -------------- Inline functions begin here -------------- */
 
 void
diff --git a/src/p25519/arch_x86_64/f_impl.c b/src/p25519/arch_x86_64/f_impl.c
index f5ea715..377252c 100644
--- a/src/p25519/arch_x86_64/f_impl.c
+++ b/src/p25519/arch_x86_64/f_impl.c
@@ -2,7 +2,7 @@
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
 
-#include "f_impl.h"
+#include "f_field.h"
 #include "x86-64-arith.h"
 
 static inline uint64_t shr(__uint128_t x, int n) {
diff --git a/src/p25519/arch_x86_64/f_impl.h b/src/p25519/arch_x86_64/f_impl.h
index a90702d..24cbe19 100644
--- a/src/p25519/arch_x86_64/f_impl.h
+++ b/src/p25519/arch_x86_64/f_impl.h
@@ -14,88 +14,6 @@
 #define DECAF_255_LIMB_BITS 51
 #define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}
 
-/*
-#define FIELD_LITERAL(a,b,c,d) {{ \
-    (a##ull) & LMASK, \
-    ((a##ull)>>51 | (b##ull)<<13) & LMASK, \
-    ((b##ull)>>38 | (c##ull)<<26) & LMASK, \
-    ((c##ull)>>25 | (d##ull)<<39) & LMASK, \
-    (d##ull)>>12 \
-}}
-*/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static __inline__ void
-gf_25519_add_RAW (
-    gf_25519_t out,
-    const gf_25519_t a,
-    const gf_25519_t b
-) __attribute__((unused));
-             
-static __inline__ void
-gf_25519_sub_RAW (
-    gf_25519_t out,
-    const gf_25519_t a,
-    const gf_25519_t b
-) __attribute__((unused));
-             
-static __inline__ void
-gf_25519_copy (
-    gf_25519_t out,
-    const gf_25519_t a
-) __attribute__((unused));
-             
-static __inline__ void
-gf_25519_weak_reduce (
-    gf_25519_t inout
-) __attribute__((unused));
-             
-void
-gf_25519_strong_reduce (
-    gf_25519_t inout
-);
-
-static __inline__ void
-gf_25519_bias (
-    gf_25519_t inout,
-    int amount
-) __attribute__((unused));
-         
-void
-gf_25519_mul (
-    gf_25519_s *__restrict__ out,
-    const gf_25519_t a,
-    const gf_25519_t b
-);
-
-void
-gf_25519_mulw (
-    gf_25519_s *__restrict__ out,
-    const gf_25519_t a,
-    uint64_t b
-);
-
-void
-gf_25519_sqr (
-    gf_25519_s *__restrict__ out,
-    const gf_25519_t a
-);
-
-void
-gf_25519_serialize (
-    uint8_t serial[32],
-    const gf_25519_t x
-);
-
-mask_t
-gf_25519_deserialize (
-    gf_25519_t x,
-    const uint8_t serial[32]
-);
-
 /* -------------- Inline functions begin here -------------- */
 
 void
@@ -123,14 +41,6 @@ gf_25519_sub_RAW (
     }
 }
 
-void
-gf_25519_copy (
-    gf_25519_t out,
-    const gf_25519_t a
-) {
-    memcpy(out,a,sizeof(*a));
-}
-
 void
 gf_25519_bias (
     gf_25519_t a,
diff --git a/src/p448/arch_32/f_impl.c b/src/p448/arch_32/f_impl.c
index bedaf38..bd900c6 100644
--- a/src/p448/arch_32/f_impl.c
+++ b/src/p448/arch_32/f_impl.c
@@ -2,8 +2,7 @@
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
 
-#include "word.h"
-#include "f_impl.h"
+#include "f_field.h"
 
 static inline mask_t __attribute__((always_inline))
 is_zero (
diff --git a/src/p448/arch_32/f_impl.h b/src/p448/arch_32/f_impl.h
index ec7a3d0..7d343e0 100644
--- a/src/p448/arch_32/f_impl.h
+++ b/src/p448/arch_32/f_impl.h
@@ -22,74 +22,6 @@ typedef struct gf_448_s {
 extern "C" {
 #endif
 
-static __inline__ void
-gf_448_add_RAW (
-    gf_448_t out,
-    const gf_448_t a,
-    const gf_448_t b
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-gf_448_sub_RAW (
-    gf_448_t out,
-    const gf_448_t a,
-    const gf_448_t b
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-gf_448_copy (
-    gf_448_t out,
-    const gf_448_t a
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-gf_448_weak_reduce (
-    gf_448_t inout
-) __attribute__((unused,always_inline));
-             
-void
-gf_448_strong_reduce (
-    gf_448_t inout
-);
-             
-static __inline__ void
-gf_448_bias (
-    gf_448_t inout,
-    int amount
-) __attribute__((unused,always_inline));
-
-void
-gf_448_mul (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a,
-    const gf_448_t b
-);
-
-void
-gf_448_mulw (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a,
-    uint64_t b
-);
-
-void
-gf_448_sqr (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a
-);
-
-void
-gf_448_serialize (
-    uint8_t *serial,
-    const gf_448_t x
-);
-
-mask_t
-gf_448_deserialize (
-    gf_448_t x,
-    const uint8_t serial[56]
-);
-
 /* -------------- Inline functions begin here -------------- */
 
 void
@@ -128,14 +60,6 @@ gf_448_sub_RAW (
     */
 }
 
-void
-gf_448_copy (
-    gf_448_t out,
-    const gf_448_t a
-) {
-  *out = *a;
-}
-
 void
 gf_448_bias (
     gf_448_t a,
diff --git a/src/p448/arch_arm_32/f_impl.c b/src/p448/arch_arm_32/f_impl.c
index 7e7150c..ea831f3 100644
--- a/src/p448/arch_arm_32/f_impl.c
+++ b/src/p448/arch_arm_32/f_impl.c
@@ -2,8 +2,7 @@
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
 
-#include "word.h"
-#include "f_impl.h"
+#include "f_field.h"
 
 static inline mask_t __attribute__((always_inline))
 is_zero (
diff --git a/src/p448/arch_arm_32/f_impl.h b/src/p448/arch_arm_32/f_impl.h
index ec7a3d0..7d343e0 100644
--- a/src/p448/arch_arm_32/f_impl.h
+++ b/src/p448/arch_arm_32/f_impl.h
@@ -22,74 +22,6 @@ typedef struct gf_448_s {
 extern "C" {
 #endif
 
-static __inline__ void
-gf_448_add_RAW (
-    gf_448_t out,
-    const gf_448_t a,
-    const gf_448_t b
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-gf_448_sub_RAW (
-    gf_448_t out,
-    const gf_448_t a,
-    const gf_448_t b
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-gf_448_copy (
-    gf_448_t out,
-    const gf_448_t a
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-gf_448_weak_reduce (
-    gf_448_t inout
-) __attribute__((unused,always_inline));
-             
-void
-gf_448_strong_reduce (
-    gf_448_t inout
-);
-             
-static __inline__ void
-gf_448_bias (
-    gf_448_t inout,
-    int amount
-) __attribute__((unused,always_inline));
-
-void
-gf_448_mul (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a,
-    const gf_448_t b
-);
-
-void
-gf_448_mulw (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a,
-    uint64_t b
-);
-
-void
-gf_448_sqr (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a
-);
-
-void
-gf_448_serialize (
-    uint8_t *serial,
-    const gf_448_t x
-);
-
-mask_t
-gf_448_deserialize (
-    gf_448_t x,
-    const uint8_t serial[56]
-);
-
 /* -------------- Inline functions begin here -------------- */
 
 void
@@ -128,14 +60,6 @@ gf_448_sub_RAW (
     */
 }
 
-void
-gf_448_copy (
-    gf_448_t out,
-    const gf_448_t a
-) {
-  *out = *a;
-}
-
 void
 gf_448_bias (
     gf_448_t a,
diff --git a/src/p448/arch_neon_experimental/f_impl.c b/src/p448/arch_neon_experimental/f_impl.c
index 9282cb4..002ef40 100644
--- a/src/p448/arch_neon_experimental/f_impl.c
+++ b/src/p448/arch_neon_experimental/f_impl.c
@@ -2,8 +2,7 @@
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
 
-#include "word.h"
-#include "f_impl.h"
+#include "f_field.h"
 
 static inline mask_t __attribute__((always_inline))
 is_zero (
diff --git a/src/p448/arch_neon_experimental/f_impl.h b/src/p448/arch_neon_experimental/f_impl.h
index 2b3894d..a88dec2 100644
--- a/src/p448/arch_neon_experimental/f_impl.h
+++ b/src/p448/arch_neon_experimental/f_impl.h
@@ -27,75 +27,7 @@ typedef struct gf_448_s {
 #ifdef __cplusplus
 extern "C" {
 #endif
-
-static __inline__ void
-gf_448_add_RAW (
-    gf_448_t out,
-    const gf_448_t a,
-    const gf_448_t b
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-gf_448_sub_RAW (
-    gf_448_t out,
-    const gf_448_t a,
-    const gf_448_t b
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-gf_448_copy (
-    gf_448_t out,
-    const gf_448_t a
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-gf_448_weak_reduce (
-    gf_448_t inout
-) __attribute__((unused,always_inline));
-             
-void
-gf_448_strong_reduce (
-    gf_448_t inout
-);
-             
-static __inline__ void
-gf_448_bias (
-    gf_448_t inout,
-    int amount
-) __attribute__((unused,always_inline));
-
-void
-gf_448_mul (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a,
-    const gf_448_t b
-);
-
-void
-gf_448_mulw (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a,
-    uint64_t b
-);
-
-void
-gf_448_sqr (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a
-);
-
-void
-gf_448_serialize (
-    uint8_t *serial,
-    const gf_448_t x
-);
-
-mask_t
-gf_448_deserialize (
-    gf_448_t x,
-    const uint8_t serial[56]
-);
-
+    
 /* -------------- Inline functions begin here -------------- */
 
 void
@@ -128,14 +60,6 @@ gf_448_sub_RAW (
     */
 }
 
-void
-gf_448_copy (
-    gf_448_t out,
-    const gf_448_t a
-) {
-  *out = *a;
-}
-
 void
 gf_448_bias (
     gf_448_t a,
diff --git a/src/p448/arch_ref64/f_impl.c b/src/p448/arch_ref64/f_impl.c
index c03913d..88bef61 100644
--- a/src/p448/arch_ref64/f_impl.c
+++ b/src/p448/arch_ref64/f_impl.c
@@ -2,7 +2,7 @@
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
 
-#include "f_impl.h"
+#include "f_field.h"
 
 static __inline__ __uint128_t widemul(
     const uint64_t a,
diff --git a/src/p448/arch_ref64/f_impl.h b/src/p448/arch_ref64/f_impl.h
index 65add03..cf84d72 100644
--- a/src/p448/arch_ref64/f_impl.h
+++ b/src/p448/arch_ref64/f_impl.h
@@ -21,74 +21,6 @@ typedef struct gf_448_s {
 extern "C" {
 #endif
 
-static __inline__ void
-gf_448_add_RAW (
-    gf_448_t out,
-    const gf_448_t a,
-    const gf_448_t b
-) __attribute__((unused));
-             
-static __inline__ void
-gf_448_sub_RAW (
-    gf_448_t out,
-    const gf_448_t a,
-    const gf_448_t b
-) __attribute__((unused));
-             
-static __inline__ void
-gf_448_copy (
-    gf_448_t out,
-    const gf_448_t a
-) __attribute__((unused));
-             
-static __inline__ void
-gf_448_weak_reduce (
-    gf_448_t inout
-) __attribute__((unused));
-             
-void
-gf_448_strong_reduce (
-    gf_448_t inout
-);
-
-static __inline__ void
-gf_448_bias (
-    gf_448_t inout,
-    int amount
-) __attribute__((unused));
-         
-void
-gf_448_mul (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a,
-    const gf_448_t b
-);
-
-void
-gf_448_mulw (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a,
-    uint64_t b
-);
-
-void
-gf_448_sqr (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a
-);
-
-void
-gf_448_serialize (
-    uint8_t *serial,
-    const gf_448_t x
-);
-
-mask_t
-gf_448_deserialize (
-    gf_448_t x,
-    const uint8_t serial[56]
-);
-
 /* -------------- Inline functions begin here -------------- */
 
 void
@@ -118,14 +50,6 @@ gf_448_sub_RAW (
     gf_448_weak_reduce(out);
 }
 
-void
-gf_448_copy (
-    gf_448_t out,
-    const gf_448_t a
-) {
-    memcpy(out,a,sizeof(*a));
-}
-
 void
 gf_448_bias (
     gf_448_t a,
diff --git a/src/p448/arch_x86_64/f_impl.c b/src/p448/arch_x86_64/f_impl.c
index e044942..9c02d84 100644
--- a/src/p448/arch_x86_64/f_impl.c
+++ b/src/p448/arch_x86_64/f_impl.c
@@ -2,7 +2,7 @@
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
 
-#include "f_impl.h"
+#include "f_field.h"
 #include "x86-64-arith.h"
 
 void
diff --git a/src/p448/arch_x86_64/f_impl.h b/src/p448/arch_x86_64/f_impl.h
index 34da14b..0593398 100644
--- a/src/p448/arch_x86_64/f_impl.h
+++ b/src/p448/arch_x86_64/f_impl.h
@@ -17,74 +17,6 @@
 extern "C" {
 #endif
 
-static __inline__ void
-gf_448_add_RAW (
-    gf_448_t out,
-    const gf_448_t a,
-    const gf_448_t b
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-gf_448_sub_RAW (
-    gf_448_t out,
-    const gf_448_t a,
-    const gf_448_t b
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-gf_448_copy (
-    gf_448_t out,
-    const gf_448_t a
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-gf_448_weak_reduce (
-    gf_448_t inout
-) __attribute__((unused,always_inline));
-             
-void
-gf_448_strong_reduce (
-    gf_448_t inout
-);
-
-static __inline__ void
-gf_448_bias (
-    gf_448_t inout,
-    int amount
-) __attribute__((unused,always_inline));
-         
-void
-gf_448_mul (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a,
-    const gf_448_t b
-);
-
-void
-gf_448_mulw (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a,
-    uint64_t b
-);
-
-void
-gf_448_sqr (
-    gf_448_s *__restrict__ out,
-    const gf_448_t a
-);
-
-void
-gf_448_serialize (
-    uint8_t *serial,
-    const gf_448_t x
-);
-
-mask_t
-gf_448_deserialize (
-    gf_448_t x,
-    const uint8_t serial[56]
-);
-
 /* -------------- Inline functions begin here -------------- */
 
 void
@@ -123,17 +55,6 @@ gf_448_sub_RAW (
     */
 }
 
-void
-gf_448_copy (
-    gf_448_t out,
-    const gf_448_t a
-) {
-    unsigned int i;
-    for (i=0; i<sizeof(*out)/sizeof(big_register_t); i++) {
-        ((big_register_t *)out)[i] = ((const big_register_t *)a)[i];
-    }
-}
-
 void
 gf_448_bias (
     gf_448_t a,
diff --git a/src/p480/arch_x86_64/f_impl.c b/src/p480/arch_x86_64/f_impl.c
index db7823f..7aea1f0 100644
--- a/src/p480/arch_x86_64/f_impl.c
+++ b/src/p480/arch_x86_64/f_impl.c
@@ -2,14 +2,13 @@
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
 
-#include "f_impl.h"
-#include "x86-64-arith.h"
+#include "f_field.h"
 
 void
-p480_mul (
-    p480_t *__restrict__ cs,
-    const p480_t *as,
-    const p480_t *bs
+gf_480_mul (
+    gf_480_t *__restrict__ cs,
+    const gf_480_t *as,
+    const gf_480_t *bs
 ) {
     const uint64_t *a = as->limb, *b = bs->limb;
     uint64_t *c = cs->limb;
@@ -146,9 +145,9 @@ p480_mul (
 }
 
 void
-p480_mulw (
-    p480_t *__restrict__ cs,
-    const p480_t *as,
+gf_480_mulw (
+    gf_480_t *__restrict__ cs,
+    const gf_480_t *as,
     uint64_t b
 ) {
     const uint64_t *a = as->limb;
@@ -191,9 +190,9 @@ p480_mulw (
 }
 
 void
-p480_sqr (
-    p480_t *__restrict__ cs,
-    const p480_t *as
+gf_480_sqr (
+    gf_480_t *__restrict__ cs,
+    const gf_480_t *as
 ) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
@@ -306,8 +305,8 @@ p480_sqr (
 }
 
 void
-p480_strong_reduce (
-    p480_t *a
+gf_480_strong_reduce (
+    gf_480_t *a
 ) {
     uint64_t mask = (1ull<<60)-1;
 
@@ -349,14 +348,14 @@ p480_strong_reduce (
 }
 
 void
-p480_serialize (
+gf_480_serialize (
     uint8_t *serial,
-    const struct p480_t *x
+    const struct gf_480_t *x
 ) {
     int i,j,k=0;
-    p480_t red;
-    p480_copy(&red, x);
-    p480_strong_reduce(&red);
+    gf_480_t red;
+    gf_480_copy(&red, x);
+    gf_480_strong_reduce(&red);
     word_t r = 0;
     for (i=0; i<8; i+=2) {
         r = red.limb[i];
@@ -375,8 +374,8 @@ p480_serialize (
 }
 
 mask_t
-p480_deserialize (
-    p480_t *x,
+gf_480_deserialize (
+    gf_480_t *x,
     const uint8_t serial[60]
 ) {
     int i,j,k=0;
diff --git a/src/p480/arch_x86_64/f_impl.h b/src/p480/arch_x86_64/f_impl.h
index c309200..b55ba97 100644
--- a/src/p480/arch_x86_64/f_impl.h
+++ b/src/p480/arch_x86_64/f_impl.h
@@ -1,97 +1,77 @@
 /* Copyright (c) 2014 Cryptography Research, Inc.
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
-#ifndef __p480_H__
-#define __p480_H__ 1
+#ifndef __gf_480_H__
+#define __gf_480_H__ 1
 
 #include <stdint.h>
 #include <assert.h>
 
 #include "word.h"
 
-typedef struct p480_t {
+typedef struct gf_480_t {
   uint64_t limb[8];
-} __attribute__((aligned(32))) p480_t;
+} __attribute__((aligned(32))) gf_480_t;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
-
-static __inline__ void
-p480_add_RAW (
-    p480_t *out,
-    const p480_t *a,
-    const p480_t *b
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-p480_sub_RAW (
-    p480_t *out,
-    const p480_t *a,
-    const p480_t *b
-) __attribute__((unused,always_inline));
-             
-static __inline__ void
-p480_copy (
-    p480_t *out,
-    const p480_t *a
-) __attribute__((unused,always_inline));
              
 static __inline__ void
-p480_weak_reduce (
-    p480_t *inout
+gf_480_weak_reduce (
+    gf_480_t *inout
 ) __attribute__((unused,always_inline));
              
 void
-p480_strong_reduce (
-    p480_t *inout
+gf_480_strong_reduce (
+    gf_480_t *inout
 );
   
 static __inline__ void
-p480_bias (
-    p480_t *inout,
+gf_480_bias (
+    gf_480_t *inout,
     int amount
 ) __attribute__((unused,always_inline));
          
 void
-p480_mul (
-    p480_t *__restrict__ out,
-    const p480_t *a,
-    const p480_t *b
+gf_480_mul (
+    gf_480_t *__restrict__ out,
+    const gf_480_t *a,
+    const gf_480_t *b
 );
 
 void
-p480_mulw (
-    p480_t *__restrict__ out,
-    const p480_t *a,
+gf_480_mulw (
+    gf_480_t *__restrict__ out,
+    const gf_480_t *a,
     uint64_t b
 );
 
 void
-p480_sqr (
-    p480_t *__restrict__ out,
-    const p480_t *a
+gf_480_sqr (
+    gf_480_t *__restrict__ out,
+    const gf_480_t *a
 );
 
 void
-p480_serialize (
+gf_480_serialize (
     uint8_t *serial,
-    const struct p480_t *x
+    const struct gf_480_t *x
 );
 
 mask_t
-p480_deserialize (
-    p480_t *x,
+gf_480_deserialize (
+    gf_480_t *x,
     const uint8_t serial[60]
 );
 
 /* -------------- Inline functions begin here -------------- */
 
 void
-p480_add_RAW (
-    p480_t *out,
-    const p480_t *a,
-    const p480_t *b
+gf_480_add_RAW (
+    gf_480_t *out,
+    const gf_480_t *a,
+    const gf_480_t *b
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
@@ -106,10 +86,10 @@ p480_add_RAW (
 }
 
 void
-p480_sub_RAW (
-    p480_t *out,
-    const p480_t *a,
-    const p480_t *b
+gf_480_sub_RAW (
+    gf_480_t *out,
+    const gf_480_t *a,
+    const gf_480_t *b
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
@@ -124,9 +104,9 @@ p480_sub_RAW (
 }
 
 void
-p480_copy (
-    p480_t *out,
-    const p480_t *a
+gf_480_copy (
+    gf_480_t *out,
+    const gf_480_t *a
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(big_register_t); i++) {
@@ -135,8 +115,8 @@ p480_copy (
 }
 
 void
-p480_bias (
-    p480_t *a,
+gf_480_bias (
+    gf_480_t *a,
     int amt
 ) {
     uint64_t co1 = ((1ull<<60)-1)*amt, co2 = co1-amt;
@@ -162,8 +142,8 @@ p480_bias (
 }
 
 void
-p480_weak_reduce (
-    p480_t *a
+gf_480_weak_reduce (
+    gf_480_t *a
 ) {
     /* PERF: use pshufb/palignr if anyone cares about speed of this */
     uint64_t mask = (1ull<<60) - 1;
@@ -180,4 +160,4 @@ p480_weak_reduce (
 }; /* extern "C" */
 #endif
 
-#endif /* __p480_H__ */
+#endif /* __gf_480_H__ */
diff --git a/src/p521/arch_ref64/f_impl.c b/src/p521/arch_ref64/f_impl.c
index 0bff961..8670cd6 100644
--- a/src/p521/arch_ref64/f_impl.c
+++ b/src/p521/arch_ref64/f_impl.c
@@ -2,7 +2,7 @@
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
 
-#include "f_impl.h"
+#include "f_field.h"
 
 static __inline__ __uint128_t widemul(
     const uint64_t a,
@@ -17,10 +17,10 @@ static __inline__ uint64_t is_zero(uint64_t a) {
 }
 
 void
-p521_mul (
-    p521_t *__restrict__ cs,
-    const p521_t *as,
-    const p521_t *bs
+gf_521_mul (
+    gf_521_t *__restrict__ cs,
+    const gf_521_t *as,
+    const gf_521_t *bs
 ) {
     uint64_t *c = cs->limb;
     const uint64_t *a = as->limb, *b = bs->limb;
@@ -158,9 +158,9 @@ p521_mul (
 }
 
 void
-p521_mulw (
-    p521_t *__restrict__ cs,
-    const p521_t *as,
+gf_521_mulw (
+    gf_521_t *__restrict__ cs,
+    const gf_521_t *as,
     uint64_t b
 ) {
     const uint64_t *a = as->limb;
@@ -197,9 +197,9 @@ p521_mulw (
 }
 
 void
-p521_sqr (
-    p521_t *__restrict__ cs,
-    const p521_t *as
+gf_521_sqr (
+    gf_521_t *__restrict__ cs,
+    const gf_521_t *as
 ) {
     uint64_t *c = cs->limb;
     const uint64_t *a = as->limb;
@@ -306,8 +306,8 @@ p521_sqr (
 }
 
 void
-p521_strong_reduce (
-    p521_t *a
+gf_521_strong_reduce (
+    gf_521_t *a
 ) {
     uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1;
 
@@ -347,14 +347,14 @@ p521_strong_reduce (
 }
 
 void
-p521_serialize (
+gf_521_serialize (
     uint8_t *serial,
-    const struct p521_t *x
+    const struct gf_521_t *x
 ) {
     int i,k=0;
-    p521_t red;
-    p521_copy(&red, x);
-    p521_strong_reduce(&red);
+    gf_521_t red;
+    gf_521_copy(&red, x);
+    gf_521_strong_reduce(&red);
     
     uint64_t r=0;
     int bits = 0;
@@ -371,8 +371,8 @@ p521_serialize (
 }
 
 mask_t
-p521_deserialize (
-    p521_t *x,
+gf_521_deserialize (
+    gf_521_t *x,
     const uint8_t serial[66]
 ) {
     int i,k=0,bits=0;
diff --git a/src/p521/arch_ref64/f_impl.h b/src/p521/arch_ref64/f_impl.h
index 2b63f13..512b1d9 100644
--- a/src/p521/arch_ref64/f_impl.h
+++ b/src/p521/arch_ref64/f_impl.h
@@ -10,122 +10,102 @@
 
 #include "word.h"
 
-typedef struct p521_t {
+typedef struct gf_521_t {
   uint64_t limb[9];
-} p521_t;
+} gf_521_t;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
-
-static __inline__ void
-p521_add_RAW (
-    p521_t *out,
-    const p521_t *a,
-    const p521_t *b
-) __attribute__((unused));
-             
-static __inline__ void
-p521_sub_RAW (
-    p521_t *out,
-    const p521_t *a,
-    const p521_t *b
-) __attribute__((unused));
-             
-static __inline__ void
-p521_copy (
-    p521_t *out,
-    const p521_t *a
-) __attribute__((unused));
              
 static __inline__ void
-p521_weak_reduce (
-    p521_t *inout
+gf_521_weak_reduce (
+    gf_521_t *inout
 ) __attribute__((unused));
              
 void
-p521_strong_reduce (
-    p521_t *inout
+gf_521_strong_reduce (
+    gf_521_t *inout
 );
 
 static __inline__ void
-p521_bias (
-    p521_t *inout,
+gf_521_bias (
+    gf_521_t *inout,
     int amount
 ) __attribute__((unused));
          
 void
-p521_mul (
-    p521_t *__restrict__ out,
-    const p521_t *a,
-    const p521_t *b
+gf_521_mul (
+    gf_521_t *__restrict__ out,
+    const gf_521_t *a,
+    const gf_521_t *b
 );
 
 void
-p521_mulw (
-    p521_t *__restrict__ out,
-    const p521_t *a,
+gf_521_mulw (
+    gf_521_t *__restrict__ out,
+    const gf_521_t *a,
     uint64_t b
 );
 
 void
-p521_sqr (
-    p521_t *__restrict__ out,
-    const p521_t *a
+gf_521_sqr (
+    gf_521_t *__restrict__ out,
+    const gf_521_t *a
 );
 
 void
-p521_serialize (
+gf_521_serialize (
     uint8_t *serial,
-    const struct p521_t *x
+    const struct gf_521_t *x
 );
 
 mask_t
-p521_deserialize (
-    p521_t *x,
+gf_521_deserialize (
+    gf_521_t *x,
     const uint8_t serial[66]
 );
 
 /* -------------- Inline functions begin here -------------- */
 
 void
-p521_add_RAW (
-    p521_t *out,
-    const p521_t *a,
-    const p521_t *b
+gf_521_add_RAW (
+    gf_521_t *out,
+    const gf_521_t *a,
+    const gf_521_t *b
 ) {
     unsigned int i;
     for (i=0; i<9; i++) {
         out->limb[i] = a->limb[i] + b->limb[i];
     }
-    p521_weak_reduce(out);
+    gf_521_weak_reduce(out);
 }
 
 void
-p521_sub_RAW (
-    p521_t *out,
-    const p521_t *a,
-    const p521_t *b
+gf_521_sub_RAW (
+    gf_521_t *out,
+    const gf_521_t *a,
+    const gf_521_t *b
 ) {
     unsigned int i;
     uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4;
     for (i=0; i<9; i++) {
         out->limb[i] = a->limb[i] - b->limb[i] + ((i==8) ? co2 : co1);
     }
-    p521_weak_reduce(out);
+    gf_521_weak_reduce(out);
 }
 
 void
-p521_copy (
-    p521_t *out,
-    const p521_t *a
+gf_521_copy (
+    gf_521_t *out,
+    const gf_521_t *a
 ) {
     memcpy(out,a,sizeof(*a));
 }
 
 void
-p521_bias (
-    p521_t *a,
+gf_521_bias (
+    gf_521_t *a,
     int amt
 ) {
     (void) a;
@@ -133,8 +113,8 @@ p521_bias (
 }
 
 void
-p521_weak_reduce (
-    p521_t *a
+gf_521_weak_reduce (
+    gf_521_t *a
 ) {
     uint64_t mask = (1ull<<58) - 1;
     uint64_t tmp = a->limb[8] >> 57;
diff --git a/src/p521/arch_x86_64_r12/f_impl.c b/src/p521/arch_x86_64_r12/f_impl.c
index ba3e77b..0b42a4b 100644
--- a/src/p521/arch_x86_64_r12/f_impl.c
+++ b/src/p521/arch_x86_64_r12/f_impl.c
@@ -2,7 +2,7 @@
  * Released under the MIT License.  See LICENSE.txt for license information.
  */
 
-#include "f_impl.h"
+#include "f_field.h"
 
 typedef struct {
   uint64x3_t lo, hi, hier;
@@ -168,10 +168,10 @@ static inline void hexad_sqr_signed (
 
 
 void
-p521_mul (
-    p521_t *__restrict__ cs,
-    const p521_t *as,
-    const p521_t *bs
+gf_521_mul (
+    gf_521_t *__restrict__ cs,
+    const gf_521_t *as,
+    const gf_521_t *bs
 ) {
     int i;
     
@@ -254,9 +254,9 @@ p521_mul (
 
 
 void
-p521_sqr (
-    p521_t *__restrict__ cs,
-    const p521_t *as
+gf_521_sqr (
+    gf_521_t *__restrict__ cs,
+    const gf_521_t *as
 ) {
     
 
@@ -313,9 +313,9 @@ p521_sqr (
 }
 
 void
-p521_mulw (
-    p521_t *__restrict__ cs,
-    const p521_t *as,
+gf_521_mulw (
+    gf_521_t *__restrict__ cs,
+    const gf_521_t *as,
     uint64_t b
 ) {
     
@@ -375,8 +375,8 @@ p521_mulw (
 
 
 void
-p521_strong_reduce (
-    p521_t *a
+gf_521_strong_reduce (
+    gf_521_t *a
 ) {
     uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1;
 
@@ -418,14 +418,14 @@ p521_strong_reduce (
 }
 
 void
-p521_serialize (
+gf_521_serialize (
     uint8_t *serial,
-    const struct p521_t *x
+    const struct gf_521_t *x
 ) {
     unsigned int i,k=0;
-    p521_t red;
-    p521_copy(&red, x);
-    p521_strong_reduce(&red);
+    gf_521_t red;
+    gf_521_copy(&red, x);
+    gf_521_strong_reduce(&red);
     
     uint64_t r=0;
     int bits = 0;
@@ -442,8 +442,8 @@ p521_serialize (
 }
 
 mask_t
-p521_deserialize (
-    p521_t *x,
+gf_521_deserialize (
+    gf_521_t *x,
     const uint8_t serial[LIMBPERM(66)]
 ) {
     int i,k=0,bits=0;
diff --git a/src/p521/arch_x86_64_r12/f_impl.h b/src/p521/arch_x86_64_r12/f_impl.h
index 14ecb3f..4616b71 100644
--- a/src/p521/arch_x86_64_r12/f_impl.h
+++ b/src/p521/arch_x86_64_r12/f_impl.h
@@ -14,82 +14,14 @@
 #define LIMBPERM(x) (((x)%3)*4 + (x)/3)
 #define USE_P521_3x3_TRANSPOSE
 
-typedef struct p521_t {
+typedef struct gf_521_s {
   uint64_t limb[12];
-} __attribute__((aligned(32))) p521_t;
+} __attribute__((aligned(32))) gf_521_t;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-static __inline__ void
-p521_add_RAW (
-    p521_t *out,
-    const p521_t *a,
-    const p521_t *b
-) __attribute__((unused));
-             
-static __inline__ void
-p521_sub_RAW (
-    p521_t *out,
-    const p521_t *a,
-    const p521_t *b
-) __attribute__((unused));
-             
-static __inline__ void
-p521_copy (
-    p521_t *out,
-    const p521_t *a
-) __attribute__((unused));
-             
-static __inline__ void
-p521_weak_reduce (
-    p521_t *inout
-) __attribute__((unused));
-             
-void
-p521_strong_reduce (
-    p521_t *inout
-);
-
-static __inline__ void
-p521_bias (
-    p521_t *inout,
-    int amount
-) __attribute__((unused));
-         
-void
-p521_mul (
-    p521_t *__restrict__ out,
-    const p521_t *a,
-    const p521_t *b
-);
-
-void
-p521_mulw (
-    p521_t *__restrict__ out,
-    const p521_t *a,
-    uint64_t b
-);
-
-void
-p521_sqr (
-    p521_t *__restrict__ out,
-    const p521_t *a
-);
-
-void
-p521_serialize (
-    uint8_t *serial,
-    const struct p521_t *x
-);
-
-mask_t
-p521_deserialize (
-    p521_t *x,
-    const uint8_t serial[66]
-);
-
 /* -------------- Inline functions begin here -------------- */
 
 typedef uint64x4_t uint64x3_t; /* fit it in a vector register */
@@ -106,10 +38,10 @@ timesW (
 }
 
 void
-p521_add_RAW (
-    p521_t *out,
-    const p521_t *a,
-    const p521_t *b
+gf_521_add_RAW (
+    gf_521_t *out,
+    const gf_521_t *a,
+    const gf_521_t *b
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
@@ -118,10 +50,10 @@ p521_add_RAW (
 }
 
 void
-p521_sub_RAW (
-    p521_t *out,
-    const p521_t *a,
-    const p521_t *b
+gf_521_sub_RAW (
+    gf_521_t *out,
+    const gf_521_t *a,
+    const gf_521_t *b
 ) {
     unsigned int i;
     for (i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
@@ -130,16 +62,8 @@ p521_sub_RAW (
 }
 
 void
-p521_copy (
-    p521_t *out,
-    const p521_t *a
-) {
-    memcpy(out,a,sizeof(*a));
-}
-
-void
-p521_bias (
-    p521_t *a,
+gf_521_bias (
+    gf_521_t *a,
     int amt
 ) {
     uint64_t co0 = ((1ull<<58)-2)*amt, co1 = ((1ull<<58)-1)*amt;
@@ -150,8 +74,8 @@ p521_bias (
 }
 
 void
-p521_weak_reduce (
-    p521_t *a
+gf_521_weak_reduce (
+    gf_521_t *a
 ) {
 #if 0
     int i;