Skip to content

Commit ad1abe7

Browse files
authored
Merge pull request #27 from tfhe/ng/compile-on-mingw
compile on mingw64
2 parents 7600ba8 + dbb8a14 commit ad1abe7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+1701
-871
lines changed

CMakeLists.txt

+23-1
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,13 @@ endif()
2121
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
2222

2323
if (WARNING_PARANOID)
24-
add_compile_options(-Wall -Werror)
24+
add_compile_options(-Wall -Werror -Wno-unused-command-line-argument)
2525
endif()
2626

27+
message(STATUS "CMAKE_HOST_SYSTEM_NAME: ${CMAKE_HOST_SYSTEM_NAME}")
28+
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
29+
message(STATUS "CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}")
30+
2731
if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
2832
set(X86 ON)
2933
set(AARCH64 OFF)
@@ -35,6 +39,24 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)")
3539
set(AARCH64 ON)
3640
endif ()
3741

42+
if (CMAKE_SYSTEM_NAME MATCHES "(Windows)|(MSYS)")
43+
set(WIN32 ON)
44+
endif ()
45+
if (WIN32)
46+
#overrides for win32
47+
set(X86 OFF)
48+
set(AARCH64 OFF)
49+
set(X86_WIN32 ON)
50+
else()
51+
set(X86_WIN32 OFF)
52+
set(WIN32 OFF)
53+
endif (WIN32)
54+
55+
message(STATUS "--> WIN32: ${WIN32}")
56+
message(STATUS "--> X86_WIN32: ${X86_WIN32}")
57+
message(STATUS "--> X86_LINUX: ${X86}")
58+
message(STATUS "--> AARCH64: ${AARCH64}")
59+
3860

3961
# compiles the main library in spqlios
4062
add_subdirectory(spqlios)

spqlios/CMakeLists.txt

+37-20
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@ set(SRCS_GENERIC
3232
arithmetic/vec_znx_big.c
3333
arithmetic/znx_small.c
3434
arithmetic/module_api.c
35+
reim/reim_execute.c
36+
cplx/cplx_execute.c
37+
reim4/reim4_execute.c
3538
)
3639
# C or assembly source files compiled only on x86 targets
3740
set(SRCS_X86
38-
cplx/spqlios_fft_execute.s
39-
reim/reim_execute.s
40-
reim4/reim4_execute.s
4141
)
4242
# C or assembly source files compiled only on aarch64 targets
4343
set(SRCS_AARCH64
@@ -48,31 +48,40 @@ set(SRCS_AARCH64
4848
)
4949

5050
# C or assembly source files compiled only on x86: avx, avx2, fma targets
51-
set(SRCS_FMA
51+
set(SRCS_FMA_C
5252
arithmetic/vector_matrix_product_avx.c
5353
cplx/cplx_conversions_avx2_fma.c
54-
cplx/cplx_fft16_avx_fma.s
5554
cplx/cplx_fft_avx2_fma.c
5655
cplx/cplx_fft_sse.c
5756
cplx/cplx_fftvec_avx2_fma.c
58-
cplx/cplx_ifft16_avx_fma.s
5957
cplx/cplx_ifft_avx2_fma.c
6058
reim4/reim4_arithmetic_avx2.c
61-
reim4/reim4_fftvec_addmul_fma.s
62-
reim4/reim4_fftvec_conv_fma.s
59+
reim4/reim4_fftvec_conv_fma.c
60+
reim4/reim4_fftvec_addmul_fma.c
6361
reim/reim_conversions_avx.c
64-
reim/reim_fft16_avx_fma.s
65-
reim/reim_fft4_avx_fma.s
66-
reim/reim_fft8_avx_fma.s
62+
reim/reim_fft4_avx_fma.c
63+
reim/reim_fft8_avx_fma.c
64+
reim/reim_ifft4_avx_fma.c
65+
reim/reim_ifft8_avx_fma.c
6766
reim/reim_fft_avx2.c
68-
reim/reim_fftvec_addmul_fma.s
69-
reim/reim_ifft16_avx_fma.s
70-
reim/reim_ifft4_avx_fma.s
71-
reim/reim_ifft8_avx_fma.s
7267
reim/reim_ifft_avx2.c
7368
reim/reim_to_tnx_avx.c
74-
)
75-
set_source_files_properties(${SRCS_FMA} PROPERTIES COMPILE_OPTIONS "-mfma;-mavx;-mavx2")
69+
reim/reim_fftvec_addmul_fma.c
70+
)
71+
set(SRCS_FMA_ASM
72+
cplx/cplx_fft16_avx_fma.s
73+
cplx/cplx_ifft16_avx_fma.s
74+
reim/reim_fft16_avx_fma.s
75+
reim/reim_ifft16_avx_fma.s
76+
)
77+
set(SRCS_FMA_WIN32_ASM
78+
cplx/cplx_fft16_avx_fma_win32.s
79+
cplx/cplx_ifft16_avx_fma_win32.s
80+
reim/reim_fft16_avx_fma_win32.s
81+
reim/reim_ifft16_avx_fma_win32.s
82+
)
83+
set_source_files_properties(${SRCS_FMA_C} PROPERTIES COMPILE_OPTIONS "-mfma;-mavx;-mavx2")
84+
set_source_files_properties(${SRCS_FMA_ASM} PROPERTIES COMPILE_OPTIONS "-mfma;-mavx;-mavx2")
7685

7786
# C or assembly source files compiled only on x86: avx512f/vl/dq + fma targets
7887
set(SRCS_AVX512
@@ -132,12 +141,20 @@ set(SPQLIOSSOURCES
132141
if (${X86})
133142
set(SPQLIOSSOURCES ${SPQLIOSSOURCES}
134143
${SRCS_X86}
135-
${SRCS_FMA}
144+
${SRCS_FMA_C}
145+
${SRCS_FMA_ASM}
136146
${SRCS_AVX2}
137147
${SRCS_AVX512}
138148
)
139-
endif ()
140-
if (${AARCH64})
149+
elseif (${X86_WIN32})
150+
set(SPQLIOSSOURCES ${SPQLIOSSOURCES}
151+
#${SRCS_X86}
152+
${SRCS_FMA_C}
153+
${SRCS_FMA_WIN32_ASM}
154+
${SRCS_AVX2}
155+
${SRCS_AVX512}
156+
)
157+
elseif (${AARCH64})
141158
set(SPQLIOSSOURCES ${SPQLIOSSOURCES}
142159
${SRCS_AARCH64}
143160
)

spqlios/arithmetic/scalar_vector_product.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ EXPORT SVP_PPOL* svp_ppol_alloc(const MODULE* module) // N
99

1010
EXPORT SVP_PPOL* fft64_svp_ppol_alloc(const MODULE* module) {
1111
const uint64_t rsize = module->nn * sizeof(double);
12-
SVP_PPOL* reps = aligned_alloc(64, (rsize + 63) & (-64UL));
12+
SVP_PPOL* reps = aligned_alloc(64, (rsize + 63) & (UINT64_C(-64)));
1313
if (reps == 0) FATAL_ERROR("Out of memory");
1414
return reps;
1515
}

spqlios/arithmetic/vec_znx_big.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ EXPORT void vec_znx_big_automorphism(const MODULE* module, // N
8888
EXPORT VEC_ZNX_BIG* fft64_vec_znx_big_alloc(const MODULE* module, // N
8989
uint64_t size) {
9090
const uint64_t rsize = module->nn * size * sizeof(double);
91-
VEC_ZNX_BIG* reps = aligned_alloc(64, (rsize + 63) & (-64UL));
91+
VEC_ZNX_BIG* reps = aligned_alloc(64, (rsize + 63) & (UINT64_C(-64)));
9292
if (reps == 0) FATAL_ERROR("Out of memory");
9393
return reps;
9494
}

spqlios/arithmetic/vec_znx_dft.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ EXPORT VEC_ZNX_DFT* vec_znx_dft_alloc(const MODULE* module, // N
3737
EXPORT VEC_ZNX_DFT* fft64_vec_znx_dft_alloc(const MODULE* module, // N
3838
uint64_t size) {
3939
const uint64_t rsize = module->nn * size * sizeof(double);
40-
VEC_ZNX_DFT* reps = aligned_alloc(64, (rsize + 63) & (-64UL));
40+
VEC_ZNX_DFT* reps = aligned_alloc(64, (rsize + 63) & (UINT64_C(-64)));
4141
if (reps == 0) FATAL_ERROR("Out of memory");
4242
return reps;
4343
}

spqlios/arithmetic/vector_matrix_product.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ EXPORT VMP_PMAT* fft64_vmp_pmat_alloc(const MODULE* module, // N
1515
uint64_t nrows, uint64_t ncols // dimensions
1616
) {
1717
const uint64_t rsize = module->nn * nrows * ncols * sizeof(double);
18-
VMP_PMAT* reps = aligned_alloc(64, (rsize + 63) & (-64UL));
18+
VMP_PMAT* reps = aligned_alloc(64, (rsize + 63) & (UINT64_C(-64)));
1919
if (reps == 0) FATAL_ERROR("Out of memory");
2020
return reps;
2121
}

spqlios/commons.c

+97
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "commons.h"
22

3+
#include <math.h>
34
#include <stdio.h>
45
#include <stdlib.h>
56

@@ -18,3 +19,99 @@ EXPORT void NOT_IMPLEMENTED_v_vp(void* p) { NOT_IMPLEMENTED(); }
1819
EXPORT void NOT_IMPLEMENTED_v_idpdpdp(int32_t n, double* a, const double* b, const double* c) { NOT_IMPLEMENTED(); }
1920
EXPORT void NOT_IMPLEMENTED_v_uvpcvpcvp(uint32_t n, void* r, const void* a, const void* b) { NOT_IMPLEMENTED(); }
2021
EXPORT void NOT_IMPLEMENTED_v_uvpvpcvp(uint32_t n, void* a, void* b, const void* o) { NOT_IMPLEMENTED(); }
22+
23+
#ifdef _WIN32
24+
EXPORT void* aligned_alloc(size_t align, size_t n) {
25+
return malloc(n);
26+
// unfortunately, there is no alternative that gets freed with free :(
27+
}
28+
#define __always_inline inline __attribute((always_inline))
29+
#endif
30+
31+
void internal_accurate_sincos(double* rcos, double* rsin, double x) {
32+
double _4_x_over_pi = 4 * x / M_PI;
33+
int64_t int_part = ((int64_t)rint(_4_x_over_pi)) & 7;
34+
double frac_part = _4_x_over_pi - (double)(int_part);
35+
double frac_x = M_PI * frac_part / 4.;
36+
// compute the taylor series
37+
double cosp = 1.;
38+
double sinp = 0.;
39+
double powx = 1.;
40+
int64_t nn = 0;
41+
while (fabs(powx) > 1e-20) {
42+
++nn;
43+
powx = powx * frac_x / (double)(nn); // x^n/n!
44+
switch (nn & 3) {
45+
case 0:
46+
cosp += powx;
47+
break;
48+
case 1:
49+
sinp += powx;
50+
break;
51+
case 2:
52+
cosp -= powx;
53+
break;
54+
case 3:
55+
sinp -= powx;
56+
break;
57+
default:
58+
abort(); // impossible
59+
}
60+
}
61+
// final multiplication
62+
switch (int_part) {
63+
case 0:
64+
*rcos = cosp;
65+
*rsin = sinp;
66+
break;
67+
case 1:
68+
*rcos = M_SQRT1_2 * (cosp - sinp);
69+
*rsin = M_SQRT1_2 * (cosp + sinp);
70+
break;
71+
case 2:
72+
*rcos = -sinp;
73+
*rsin = cosp;
74+
break;
75+
case 3:
76+
*rcos = -M_SQRT1_2 * (cosp + sinp);
77+
*rsin = M_SQRT1_2 * (cosp - sinp);
78+
break;
79+
case 4:
80+
*rcos = -cosp;
81+
*rsin = -sinp;
82+
break;
83+
case 5:
84+
*rcos = -M_SQRT1_2 * (cosp - sinp);
85+
*rsin = -M_SQRT1_2 * (cosp + sinp);
86+
break;
87+
case 6:
88+
*rcos = sinp;
89+
*rsin = -cosp;
90+
break;
91+
case 7:
92+
*rcos = M_SQRT1_2 * (cosp + sinp);
93+
*rsin = -M_SQRT1_2 * (cosp - sinp);
94+
break;
95+
default:
96+
abort(); // impossible
97+
}
98+
if (fabs(cos(x) - *rcos) > 1e-10 || fabs(sin(x) - *rsin) > 1e-10) {
99+
printf("cos(%.17lf) =? %.17lf instead of %.17lf\n", x, *rcos, cos(x));
100+
printf("sin(%.17lf) =? %.17lf instead of %.17lf\n", x, *rsin, sin(x));
101+
printf("fracx = %.17lf\n", frac_x);
102+
printf("cosp = %.17lf\n", cosp);
103+
printf("sinp = %.17lf\n", sinp);
104+
printf("nn = %d\n", (int)(nn));
105+
}
106+
}
107+
108+
double internal_accurate_cos(double x) {
109+
double rcos, rsin;
110+
internal_accurate_sincos(&rcos, &rsin, x);
111+
return rcos;
112+
}
113+
double internal_accurate_sin(double x) {
114+
double rcos, rsin;
115+
internal_accurate_sincos(&rcos, &rsin, x);
116+
return rsin;
117+
}

spqlios/commons.h

+24-7
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,6 @@
3232
abort(); \
3333
}
3434

35-
#ifdef __x86_64__
36-
#define CPU_SUPPORTS __builtin_cpu_supports
37-
#else
38-
// TODO for now, we do not have any optimization for non x86 targets
39-
#define CPU_SUPPORTS(xxxx) 0
40-
#endif
41-
4235
EXPORT void* UNDEFINED_p_ii(int32_t n, int32_t m);
4336
EXPORT void* UNDEFINED_p_uu(uint32_t n, uint32_t m);
4437
EXPORT double* UNDEFINED_dp_pi(const void* p, int32_t n);
@@ -55,4 +48,28 @@ EXPORT void NOT_IMPLEMENTED_v_idpdpdp(int32_t n, double* a, const double* b, con
5548
EXPORT void NOT_IMPLEMENTED_v_uvpcvpcvp(uint32_t n, void* r, const void* a, const void* b);
5649
EXPORT void NOT_IMPLEMENTED_v_uvpvpcvp(uint32_t n, void* a, void* b, const void* o);
5750

51+
// windows
52+
#ifdef _WIN32
53+
EXPORT void* aligned_alloc(size_t align, size_t n);
54+
#ifdef __cplusplus
55+
#define aligned_alloc ::aligned_alloc
56+
#endif
57+
#define __always_inline inline __attribute((always_inline))
58+
#endif
59+
60+
#define USE_LIBM_SIN_COS
61+
#ifndef USE_LIBM_SIN_COS
62+
// if at some point, we want to remove the libm dependency, we can
63+
// consider this:
64+
EXPORT double internal_accurate_cos(double x);
65+
EXPORT double internal_accurate_sin(double x);
66+
EXPORT void internal_accurate_sincos(double* rcos, double* rsin, double x);
67+
#define m_accurate_cos internal_accurate_cos
68+
#define m_accurate_sin internal_accurate_sin
69+
#else
70+
// let's use libm sin and cos
71+
#define m_accurate_cos cos
72+
#define m_accurate_sin sin
73+
#endif
74+
5875
#endif // SPQLIOS_COMMONS_H

spqlios/commons_private.c

+2-6
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,6 @@ double fracrevbits(uint32_t i) {
5050
return fracrevbits((i - 1) / 2) / 2. + 0.5;
5151
}
5252

53-
uint64_t ceilto64b(uint64_t size) {
54-
return (size + 63UL) & (-64UL);
55-
}
53+
uint64_t ceilto64b(uint64_t size) { return (size + UINT64_C(63)) & (UINT64_C(-64)); }
5654

57-
uint64_t ceilto32b(uint64_t size) {
58-
return (size + 31UL) & (-32UL);
59-
}
55+
uint64_t ceilto32b(uint64_t size) { return (size + UINT64_C(31)) & (UINT64_C(-32)); }

spqlios/cplx/cplx_conversions.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ EXPORT void cplx_from_znx32_ref(const CPLX_FROM_ZNX32_PRECOMP* precomp, void* r,
1717
}
1818

1919
EXPORT void cplx_from_tnx32_ref(const CPLX_FROM_TNX32_PRECOMP* precomp, void* r, const int32_t* x) {
20-
static const double _2p32 = 1. / (1L << 32);
20+
static const double _2p32 = 1. / (INT64_C(1) << 32);
2121
const uint32_t m = precomp->m;
2222
const int32_t* inre = x;
2323
const int32_t* inim = x + m;
@@ -29,7 +29,7 @@ EXPORT void cplx_from_tnx32_ref(const CPLX_FROM_TNX32_PRECOMP* precomp, void* r,
2929
}
3030

3131
EXPORT void cplx_to_tnx32_ref(const CPLX_TO_TNX32_PRECOMP* precomp, int32_t* r, const void* x) {
32-
static const double _2p32 = (1L << 32);
32+
static const double _2p32 = (INT64_C(1) << 32);
3333
const uint32_t m = precomp->m;
3434
double factor = _2p32 / precomp->divisor;
3535
int32_t* outre = r;

spqlios/cplx/cplx_conversions_avx2_fma.c

+5-5
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ __always_inline void cplx_from_any_fma(uint64_t m, void* r, const int32_t* x, co
4747
EXPORT void cplx_from_znx32_avx2_fma(const CPLX_FROM_ZNX32_PRECOMP* precomp, void* r, const int32_t* x) {
4848
//note: the hex code of 2^31 + 2^52 is 0x4330000080000000
4949
const __m256i C = _mm256_set1_epi32(0x43300000);
50-
const __m256d R = _mm256_set1_pd((1L<<31) + (1L<<52));
51-
//double XX = 1L + (1L<<31) + (1L<<52);
50+
const __m256d R = _mm256_set1_pd((INT64_C(1) << 31) + (INT64_C(1) << 52));
51+
// double XX = INT64_C(1) + (INT64_C(1)<<31) + (INT64_C(1)<<52);
5252
//printf("\n\n%016lx\n", *(uint64_t*)&XX);
5353
//abort();
5454
const uint64_t m = precomp->m;
@@ -58,16 +58,16 @@ EXPORT void cplx_from_znx32_avx2_fma(const CPLX_FROM_ZNX32_PRECOMP* precomp, voi
5858
EXPORT void cplx_from_tnx32_avx2_fma(const CPLX_FROM_TNX32_PRECOMP* precomp, void* r, const int32_t* x) {
5959
//note: the hex code of 2^-1 + 2^30 is 0x4130000080000000
6060
const __m256i C = _mm256_set1_epi32(0x41300000);
61-
const __m256d R = _mm256_set1_pd(0.5 + (1L<<20));
62-
//double XX = (double)(1L + (1L<<31) + (1L<<52))/(1L<<32);
61+
const __m256d R = _mm256_set1_pd(0.5 + (INT64_C(1) << 20));
62+
// double XX = (double)(INT64_C(1) + (INT64_C(1)<<31) + (INT64_C(1)<<52))/(INT64_C(1)<<32);
6363
//printf("\n\n%016lx\n", *(uint64_t*)&XX);
6464
//abort();
6565
const uint64_t m = precomp->m;
6666
cplx_from_any_fma(m, r, x, C, R);
6767
}
6868

6969
EXPORT void cplx_to_tnx32_avx2_fma(const CPLX_TO_TNX32_PRECOMP* precomp, int32_t* r, const void* x) {
70-
const __m256d R = _mm256_set1_pd((0.5 + (3L<<19)) * precomp->divisor);
70+
const __m256d R = _mm256_set1_pd((0.5 + (INT64_C(3) << 19)) * precomp->divisor);
7171
const __m256i MASK = _mm256_set1_epi64x(0xFFFFFFFFUL);
7272
const __m256i S = _mm256_set1_epi32(0x80000000);
7373
//const __m256i IDX = _mm256_set_epi32(0,4,1,5,2,6,3,7);

0 commit comments

Comments
 (0)