tfhe
diff --git a/‎CMakeLists.txt
+23-1 b/‎CMakeLists.txt
+23-1
diff --git a/‎spqlios/CMakeLists.txt
+37-20 b/‎spqlios/CMakeLists.txt
+37-20
diff --git a/‎spqlios/arithmetic/scalar_vector_product.c
+1-1 b/‎spqlios/arithmetic/scalar_vector_product.c
+1-1
diff --git a/‎spqlios/arithmetic/vec_znx_big.c
+1-1 b/‎spqlios/arithmetic/vec_znx_big.c
+1-1
diff --git a/‎spqlios/arithmetic/vec_znx_dft.c
+1-1 b/‎spqlios/arithmetic/vec_znx_dft.c
+1-1
diff --git a/‎spqlios/arithmetic/vector_matrix_product.c
+1-1 b/‎spqlios/arithmetic/vector_matrix_product.c
+1-1
diff --git a/‎spqlios/commons.c
+97 b/‎spqlios/commons.c
+97
diff --git a/‎spqlios/commons.h
+24-7 b/‎spqlios/commons.h
+24-7
diff --git a/‎spqlios/commons_private.c
+2-6 b/‎spqlios/commons_private.c
+2-6
diff --git a/‎spqlios/cplx/cplx_conversions.c
+2-2 b/‎spqlios/cplx/cplx_conversions.c
+2-2
diff --git a/‎spqlios/cplx/cplx_conversions_avx2_fma.c
+5-5 b/‎spqlios/cplx/cplx_conversions_avx2_fma.c
+5-5
@@ -21,9 +21,13 @@ endif()
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 
 if (WARNING_PARANOID)
-    add_compile_options(-Wall -Werror)
+    add_compile_options(-Wall -Werror -Wno-unused-command-line-argument)  # unused command-line arguments must not trip -Werror
 endif()
 
+message(STATUS "CMAKE_HOST_SYSTEM_NAME: ${CMAKE_HOST_SYSTEM_NAME}")
+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+message(STATUS "CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}")
+
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
     set(X86 ON)
     set(AARCH64 OFF)
@@ -35,6 +39,24 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)")
     set(AARCH64 ON)
 endif ()
 
+if (CMAKE_SYSTEM_NAME MATCHES "(Windows)|(MSYS)")
+    # treat MSYS the same way as native Windows in the checks below
+    set(WIN32 ON)
+endif ()
+if (WIN32)
+    # overrides for win32: the X86 / AARCH64 paths are switched off, and
+    # X86_WIN32 is enabled only when an x86 processor was detected above
+    if (X86)
+        set(X86_WIN32 ON)
+    else ()
+        set(X86_WIN32 OFF)
+    endif ()
+    set(X86 OFF)
+    set(AARCH64 OFF)
+else ()
+    set(X86_WIN32 OFF)
+    set(WIN32 OFF)
+endif ()
+
+message(STATUS "--> WIN32: ${WIN32}")
+message(STATUS "--> X86_WIN32: ${X86_WIN32}")
+message(STATUS "--> X86_LINUX: ${X86}")
+message(STATUS "--> AARCH64: ${AARCH64}")
+
 
 # compiles the main library in spqlios
 add_subdirectory(spqlios)
 
@@ -32,12 +32,12 @@ set(SRCS_GENERIC
         arithmetic/vec_znx_big.c
         arithmetic/znx_small.c
         arithmetic/module_api.c
+        reim/reim_execute.c
+        cplx/cplx_execute.c
+        reim4/reim4_execute.c
 )
 # C or assembly source files compiled only on x86 targets
 set(SRCS_X86
-        cplx/spqlios_fft_execute.s
-        reim/reim_execute.s
-        reim4/reim4_execute.s
         )
 # C or assembly source files compiled only on aarch64 targets
 set(SRCS_AARCH64
@@ -48,31 +48,40 @@ set(SRCS_AARCH64
         )
 
 # C or assembly source files compiled only on x86: avx, avx2, fma targets
-set(SRCS_FMA
+set(SRCS_FMA_C
         arithmetic/vector_matrix_product_avx.c
         cplx/cplx_conversions_avx2_fma.c
-        cplx/cplx_fft16_avx_fma.s
         cplx/cplx_fft_avx2_fma.c
         cplx/cplx_fft_sse.c
         cplx/cplx_fftvec_avx2_fma.c
-        cplx/cplx_ifft16_avx_fma.s
         cplx/cplx_ifft_avx2_fma.c
         reim4/reim4_arithmetic_avx2.c
-        reim4/reim4_fftvec_addmul_fma.s
-        reim4/reim4_fftvec_conv_fma.s
+        reim4/reim4_fftvec_conv_fma.c
+        reim4/reim4_fftvec_addmul_fma.c
         reim/reim_conversions_avx.c
-        reim/reim_fft16_avx_fma.s
-        reim/reim_fft4_avx_fma.s
-        reim/reim_fft8_avx_fma.s
+        reim/reim_fft4_avx_fma.c
+        reim/reim_fft8_avx_fma.c
+        reim/reim_ifft4_avx_fma.c
+        reim/reim_ifft8_avx_fma.c
         reim/reim_fft_avx2.c
-        reim/reim_fftvec_addmul_fma.s
-        reim/reim_ifft16_avx_fma.s
-        reim/reim_ifft4_avx_fma.s
-        reim/reim_ifft8_avx_fma.s
         reim/reim_ifft_avx2.c
         reim/reim_to_tnx_avx.c
-        )
-set_source_files_properties(${SRCS_FMA} PROPERTIES COMPILE_OPTIONS "-mfma;-mavx;-mavx2")
+        reim/reim_fftvec_addmul_fma.c
+)
+set(SRCS_FMA_ASM  # .s sources appended to SPQLIOSSOURCES in the X86 branch
+        cplx/cplx_fft16_avx_fma.s
+        cplx/cplx_ifft16_avx_fma.s
+        reim/reim_fft16_avx_fma.s
+        reim/reim_ifft16_avx_fma.s
+)
+set(SRCS_FMA_WIN32_ASM  # *_win32.s counterparts, appended in the X86_WIN32 branch; NOTE(review): no COMPILE_OPTIONS are set for this list - confirm intended
+        cplx/cplx_fft16_avx_fma_win32.s
+        cplx/cplx_ifft16_avx_fma_win32.s
+        reim/reim_fft16_avx_fma_win32.s
+        reim/reim_ifft16_avx_fma_win32.s
+)
+set_source_files_properties(${SRCS_FMA_C} PROPERTIES COMPILE_OPTIONS "-mfma;-mavx;-mavx2")  # per-file flags for the C sources
+set_source_files_properties(${SRCS_FMA_ASM} PROPERTIES COMPILE_OPTIONS "-mfma;-mavx;-mavx2")  # same flags for the non-win32 .s sources
 
 # C or assembly source files compiled only on x86: avx512f/vl/dq + fma targets
 set(SRCS_AVX512
@@ -132,12 +141,20 @@ set(SPQLIOSSOURCES
 if (${X86})
     set(SPQLIOSSOURCES ${SPQLIOSSOURCES}
             ${SRCS_X86}
-            ${SRCS_FMA}
+            ${SRCS_FMA_C}
+            ${SRCS_FMA_ASM}
             ${SRCS_AVX2}
             ${SRCS_AVX512}
             )
-endif ()
-if (${AARCH64})
+elseif (${X86_WIN32})
+    set(SPQLIOSSOURCES ${SPQLIOSSOURCES}
+            #${SRCS_X86}
+            ${SRCS_FMA_C}
+            ${SRCS_FMA_WIN32_ASM}
+            ${SRCS_AVX2}
+            ${SRCS_AVX512}
+    )
+elseif (${AARCH64})
     set(SPQLIOSSOURCES ${SPQLIOSSOURCES}
             ${SRCS_AARCH64}
             )
 
@@ -9,7 +9,7 @@ EXPORT SVP_PPOL* svp_ppol_alloc(const MODULE* module)  // N
 
 EXPORT SVP_PPOL* fft64_svp_ppol_alloc(const MODULE* module) {
   const uint64_t rsize = module->nn * sizeof(double);
-  SVP_PPOL* reps = aligned_alloc(64, (rsize + 63) & (-64UL));
+  SVP_PPOL* reps = aligned_alloc(64, (rsize + 63) & ~UINT64_C(63));  // byte size rounded up to a multiple of 64
   if (reps == 0) FATAL_ERROR("Out of memory");
   return reps;
 }
 
@@ -88,7 +88,7 @@ EXPORT void vec_znx_big_automorphism(const MODULE* module,                  // N
 EXPORT VEC_ZNX_BIG* fft64_vec_znx_big_alloc(const MODULE* module,  // N
                                             uint64_t size) {
   const uint64_t rsize = module->nn * size * sizeof(double);
-  VEC_ZNX_BIG* reps = aligned_alloc(64, (rsize + 63) & (-64UL));
+  VEC_ZNX_BIG* reps = aligned_alloc(64, (rsize + 63) & ~UINT64_C(63));  // byte size rounded up to a multiple of 64
   if (reps == 0) FATAL_ERROR("Out of memory");
   return reps;
 }
 
@@ -37,7 +37,7 @@ EXPORT VEC_ZNX_DFT* vec_znx_dft_alloc(const MODULE* module,  // N
 EXPORT VEC_ZNX_DFT* fft64_vec_znx_dft_alloc(const MODULE* module,  // N
                                             uint64_t size) {
   const uint64_t rsize = module->nn * size * sizeof(double);
-  VEC_ZNX_DFT* reps = aligned_alloc(64, (rsize + 63) & (-64UL));
+  VEC_ZNX_DFT* reps = aligned_alloc(64, (rsize + 63) & ~UINT64_C(63));  // byte size rounded up to a multiple of 64
   if (reps == 0) FATAL_ERROR("Out of memory");
   return reps;
 }
 
@@ -15,7 +15,7 @@ EXPORT VMP_PMAT* fft64_vmp_pmat_alloc(const MODULE* module,           // N
                                       uint64_t nrows, uint64_t ncols  // dimensions
 ) {
   const uint64_t rsize = module->nn * nrows * ncols * sizeof(double);
-  VMP_PMAT* reps = aligned_alloc(64, (rsize + 63) & (-64UL));
+  VMP_PMAT* reps = aligned_alloc(64, (rsize + 63) & ~UINT64_C(63));  // byte size rounded up to a multiple of 64
   if (reps == 0) FATAL_ERROR("Out of memory");
   return reps;
 }
 
@@ -1,5 +1,6 @@
 #include "commons.h"
 
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 
@@ -18,3 +19,99 @@ EXPORT void NOT_IMPLEMENTED_v_vp(void* p) { NOT_IMPLEMENTED(); }
 EXPORT void NOT_IMPLEMENTED_v_idpdpdp(int32_t n, double* a, const double* b, const double* c) { NOT_IMPLEMENTED(); }
 EXPORT void NOT_IMPLEMENTED_v_uvpcvpcvp(uint32_t n, void* r, const void* a, const void* b) { NOT_IMPLEMENTED(); }
 EXPORT void NOT_IMPLEMENTED_v_uvpvpcvp(uint32_t n, void* a, void* b, const void* o) { NOT_IMPLEMENTED(); }
+
+#ifdef _WIN32
+EXPORT void* aligned_alloc(size_t align, size_t n) {  // Windows stand-in; NOTE(review): `align` is not honoured
+  return malloc(n);
+  // plain malloc is used because no aligned alternative can be released with free()
+}
+#define __always_inline inline __attribute((always_inline))  // same definition as in commons.h
+#endif
+
+// Stores cos(x) in *rcos and sin(x) in *rsin: x is reduced to k*pi/4 + frac_x and a Taylor series is run on frac_x.
+void internal_accurate_sincos(double* rcos, double* rsin, double x) {
+  double _4_x_over_pi = 4 * x / M_PI;
+  double nearest = rint(_4_x_over_pi);  // remainder is taken against this unmasked value, so |frac_x| <= pi/8
+  int64_t int_part = ((int64_t)nearest) & 7;
+  double frac_part = _4_x_over_pi - nearest;
+  double frac_x = M_PI * frac_part / 4.;
+  double cosp = 1.;  // taylor series accumulators for cos(frac_x) and sin(frac_x)
+  double sinp = 0.;
+  double powx = 1.;
+  int64_t nn = 0;
+  while (fabs(powx) > 1e-20) {
+    ++nn;
+    powx = powx * frac_x / (double)(nn);  // x^n/n!
+    switch (nn & 3) {
+      case 0:
+        cosp += powx;
+        break;
+      case 1:
+        sinp += powx;
+        break;
+      case 2:
+        cosp -= powx;
+        break;
+      case 3:
+        sinp -= powx;
+        break;
+      default:
+        abort();  // impossible
+    }
+  }
+  switch (int_part) {  // final multiplication: rotate (cosp, sinp) by int_part * pi/4
+    case 0:
+      *rcos = cosp;
+      *rsin = sinp;
+      break;
+    case 1:
+      *rcos = M_SQRT1_2 * (cosp - sinp);
+      *rsin = M_SQRT1_2 * (cosp + sinp);
+      break;
+    case 2:
+      *rcos = -sinp;
+      *rsin = cosp;
+      break;
+    case 3:
+      *rcos = -M_SQRT1_2 * (cosp + sinp);
+      *rsin = M_SQRT1_2 * (cosp - sinp);
+      break;
+    case 4:
+      *rcos = -cosp;
+      *rsin = -sinp;
+      break;
+    case 5:
+      *rcos = -M_SQRT1_2 * (cosp - sinp);
+      *rsin = -M_SQRT1_2 * (cosp + sinp);
+      break;
+    case 6:
+      *rcos = sinp;
+      *rsin = -cosp;
+      break;
+    case 7:
+      *rcos = M_SQRT1_2 * (cosp + sinp);
+      *rsin = -M_SQRT1_2 * (cosp - sinp);
+      break;
+    default:
+      abort();  // impossible
+  }
+  if (fabs(cos(x) - *rcos) > 1e-10 || fabs(sin(x) - *rsin) > 1e-10) {  // diagnostic only: results are still returned
+    printf("cos(%.17lf) =? %.17lf instead of %.17lf\n", x, *rcos, cos(x));
+    printf("sin(%.17lf) =? %.17lf instead of %.17lf\n", x, *rsin, sin(x));
+    printf("fracx = %.17lf\n", frac_x);
+    printf("cosp = %.17lf\n", cosp);
+    printf("sinp = %.17lf\n", sinp);
+    printf("nn = %d\n", (int)(nn));
+  }
+}
+
+double internal_accurate_cos(double x) {  // cosine output of internal_accurate_sincos
+  double c, s;
+  internal_accurate_sincos(&c, &s, x);
+  return c;
+}
+double internal_accurate_sin(double x) {  // sine output of internal_accurate_sincos
+  double c, s;
+  internal_accurate_sincos(&c, &s, x);
+  return s;
+}
@@ -32,13 +32,6 @@
     abort();                                   \
   }
 
-#ifdef __x86_64__
-#define CPU_SUPPORTS __builtin_cpu_supports
-#else
-// TODO for now, we do not have any optimization for non x86 targets
-#define CPU_SUPPORTS(xxxx) 0
-#endif
-
 EXPORT void* UNDEFINED_p_ii(int32_t n, int32_t m);
 EXPORT void* UNDEFINED_p_uu(uint32_t n, uint32_t m);
 EXPORT double* UNDEFINED_dp_pi(const void* p, int32_t n);
@@ -55,4 +48,28 @@ EXPORT void NOT_IMPLEMENTED_v_idpdpdp(int32_t n, double* a, const double* b, con
 EXPORT void NOT_IMPLEMENTED_v_uvpcvpcvp(uint32_t n, void* r, const void* a, const void* b);
 EXPORT void NOT_IMPLEMENTED_v_uvpvpcvp(uint32_t n, void* a, void* b, const void* o);
 
+// Windows-only declarations: the aligned_alloc stand-in and the __always_inline macro
+#ifdef _WIN32
+EXPORT void* aligned_alloc(size_t align, size_t n);
+#ifdef __cplusplus
+#define aligned_alloc ::aligned_alloc  // C++ callers are routed to the global-namespace function
+#endif
+#define __always_inline inline __attribute((always_inline))
+#endif
+
+#define USE_LIBM_SIN_COS  // always defined here, so the #else branch below is the one in effect
+#ifndef USE_LIBM_SIN_COS
+// alternative kept in case the libm dependency is to be removed at some point:
+// m_accurate_cos / m_accurate_sin would map to the internal_accurate_* functions
+EXPORT double internal_accurate_cos(double x);
+EXPORT double internal_accurate_sin(double x);
+EXPORT void internal_accurate_sincos(double* rcos, double* rsin, double x);
+#define m_accurate_cos internal_accurate_cos
+#define m_accurate_sin internal_accurate_sin
+#else
+// current choice: m_accurate_cos / m_accurate_sin are the libm cos and sin
+#define m_accurate_cos cos
+#define m_accurate_sin sin
+#endif
+
 #endif  // SPQLIOS_COMMONS_H
@@ -50,10 +50,6 @@ double fracrevbits(uint32_t i) {
     return fracrevbits((i - 1) / 2) / 2. + 0.5;
 }
 
-uint64_t ceilto64b(uint64_t size) {
-  return (size + 63UL) & (-64UL);
-}
+uint64_t ceilto64b(uint64_t size) { return (size + UINT64_C(63)) & ~UINT64_C(63); }  // round up to a multiple of 64
 
-uint64_t ceilto32b(uint64_t size) {
-  return (size + 31UL) & (-32UL);
-}
+uint64_t ceilto32b(uint64_t size) { return (size + UINT64_C(31)) & ~UINT64_C(31); }  // round up to a multiple of 32
@@ -17,7 +17,7 @@ EXPORT void cplx_from_znx32_ref(const CPLX_FROM_ZNX32_PRECOMP* precomp, void* r,
 }
 
 EXPORT void cplx_from_tnx32_ref(const CPLX_FROM_TNX32_PRECOMP* precomp, void* r, const int32_t* x) {
-  static const double _2p32 = 1. / (1L << 32);
+  static const double _2p32 = 1. / (INT64_C(1) << 32);
   const uint32_t m = precomp->m;
   const int32_t* inre = x;
   const int32_t* inim = x + m;
@@ -29,7 +29,7 @@ EXPORT void cplx_from_tnx32_ref(const CPLX_FROM_TNX32_PRECOMP* precomp, void* r,
 }
 
 EXPORT void cplx_to_tnx32_ref(const CPLX_TO_TNX32_PRECOMP* precomp, int32_t* r, const void* x) {
-  static const double _2p32 = (1L << 32);
+  static const double _2p32 = (INT64_C(1) << 32);
   const uint32_t m = precomp->m;
   double factor = _2p32 / precomp->divisor;
   int32_t* outre = r;
 
@@ -47,8 +47,8 @@ __always_inline void cplx_from_any_fma(uint64_t m, void* r, const int32_t* x, co
 EXPORT void cplx_from_znx32_avx2_fma(const CPLX_FROM_ZNX32_PRECOMP* precomp, void* r, const int32_t* x) {
   //note: the hex code of 2^31 + 2^52 is 0x4330000080000000
   const __m256i C = _mm256_set1_epi32(0x43300000);
-  const __m256d R = _mm256_set1_pd((1L<<31) + (1L<<52));
-  //double XX =  1L + (1L<<31) + (1L<<52);
+  const __m256d R = _mm256_set1_pd((INT64_C(1) << 31) + (INT64_C(1) << 52));
+  // double XX =  INT64_C(1) + (INT64_C(1)<<31) + (INT64_C(1)<<52);
   //printf("\n\n%016lx\n", *(uint64_t*)&XX);
   //abort();
   const uint64_t m = precomp->m;
@@ -58,16 +58,16 @@ EXPORT void cplx_from_znx32_avx2_fma(const CPLX_FROM_ZNX32_PRECOMP* precomp, voi
 EXPORT void cplx_from_tnx32_avx2_fma(const CPLX_FROM_TNX32_PRECOMP* precomp, void* r, const int32_t* x) {
   //note: the hex code of 2^-1 + 2^30 is 0x4130000080000000
   const __m256i C = _mm256_set1_epi32(0x41300000);
-  const __m256d R = _mm256_set1_pd(0.5 + (1L<<20));
-  //double XX =  (double)(1L + (1L<<31) + (1L<<52))/(1L<<32);
+  const __m256d R = _mm256_set1_pd(0.5 + (INT64_C(1) << 20));  // R = 2^-1 + 2^20 = 0x4130000080000000 (the note above writes 2^30)
+  // double XX =  (double)(INT64_C(1) + (INT64_C(1)<<31) + (INT64_C(1)<<52))/(INT64_C(1)<<32);
   //printf("\n\n%016lx\n", *(uint64_t*)&XX);
   //abort();
   const uint64_t m = precomp->m;
   cplx_from_any_fma(m, r, x, C, R);
 }
 
 EXPORT void cplx_to_tnx32_avx2_fma(const CPLX_TO_TNX32_PRECOMP* precomp, int32_t* r, const void* x) {
-  const __m256d R = _mm256_set1_pd((0.5 + (3L<<19)) * precomp->divisor);
+  const __m256d R = _mm256_set1_pd((0.5 + (INT64_C(3) << 19)) * precomp->divisor);
   const __m256i MASK = _mm256_set1_epi64x(0xFFFFFFFFUL);
   const __m256i S = _mm256_set1_epi32(0x80000000);
   //const __m256i IDX = _mm256_set_epi32(0,4,1,5,2,6,3,7);
Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@ EXPORT SVP_PPOL* svp_ppol_alloc(const MODULE* module) // N`
`9`	`9`
`10`	`10`	`EXPORT SVP_PPOL* fft64_svp_ppol_alloc(const MODULE* module) {`
`11`	`11`	`const uint64_t rsize = module->nn * sizeof(double);`
`12`		`- SVP_PPOL* reps = aligned_alloc(64, (rsize + 63) & (-64UL));`
	`12`	`+ SVP_PPOL* reps = aligned_alloc(64, (rsize + 63) & (UINT64_C(-64)));`
`13`	`13`	`if (reps == 0) FATAL_ERROR("Out of memory");`
`14`	`14`	`return reps;`
`15`	`15`	`}`
Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,7 @@ EXPORT void vec_znx_big_automorphism(const MODULE* module, // N`
`88`	`88`	`EXPORT VEC_ZNX_BIG* fft64_vec_znx_big_alloc(const MODULE* module, // N`
`89`	`89`	`uint64_t size) {`
`90`	`90`	`const uint64_t rsize = module->nn * size * sizeof(double);`
`91`		`- VEC_ZNX_BIG* reps = aligned_alloc(64, (rsize + 63) & (-64UL));`
	`91`	`+ VEC_ZNX_BIG* reps = aligned_alloc(64, (rsize + 63) & (UINT64_C(-64)));`
`92`	`92`	`if (reps == 0) FATAL_ERROR("Out of memory");`
`93`	`93`	`return reps;`
`94`	`94`	`}`
Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@ EXPORT VEC_ZNX_DFT* vec_znx_dft_alloc(const MODULE* module, // N`
`37`	`37`	`EXPORT VEC_ZNX_DFT* fft64_vec_znx_dft_alloc(const MODULE* module, // N`
`38`	`38`	`uint64_t size) {`
`39`	`39`	`const uint64_t rsize = module->nn * size * sizeof(double);`
`40`		`- VEC_ZNX_DFT* reps = aligned_alloc(64, (rsize + 63) & (-64UL));`
	`40`	`+ VEC_ZNX_DFT* reps = aligned_alloc(64, (rsize + 63) & (UINT64_C(-64)));`
`41`	`41`	`if (reps == 0) FATAL_ERROR("Out of memory");`
`42`	`42`	`return reps;`
`43`	`43`	`}`
Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@ EXPORT VMP_PMAT* fft64_vmp_pmat_alloc(const MODULE* module, // N`
`15`	`15`	`uint64_t nrows, uint64_t ncols // dimensions`
`16`	`16`	`) {`
`17`	`17`	`const uint64_t rsize = module->nn * nrows * ncols * sizeof(double);`
`18`		`- VMP_PMAT* reps = aligned_alloc(64, (rsize + 63) & (-64UL));`
	`18`	`+ VMP_PMAT* reps = aligned_alloc(64, (rsize + 63) & (UINT64_C(-64)));`
`19`	`19`	`if (reps == 0) FATAL_ERROR("Out of memory");`
`20`	`20`	`return reps;`
`21`	`21`	`}`