Skip to content

Commit e6d93f2

Browse files
authored
Merge pull request #2 from martin-frbg/develop
merge develop
2 parents 1833a67 + c38c65e commit e6d93f2

17 files changed

+419
-362
lines changed

Makefile

+21-7
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1)
2121
RELA = re_lapack
2222
endif
2323

24+
ifeq ($(NO_FORTRAN), 1)
25+
define NOFORTRAN
26+
1
27+
endef
28+
define NO_LAPACK
29+
1
30+
endef
31+
export NOFORTRAN
32+
export NO_LAPACK
33+
endif
34+
2435
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
2536

2637
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
@@ -47,7 +58,7 @@ endif
4758
endif
4859

4960
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
50-
ifndef NOFORTRAN
61+
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
5162
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
5263
endif
5364
ifneq ($(OSNAME), AIX)
@@ -108,7 +119,7 @@ endif
108119
endif
109120

110121
tests :
111-
ifndef NOFORTRAN
122+
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
112123
touch $(LIBNAME)
113124
ifndef NO_FBLAS
114125
$(MAKE) -C test all
@@ -210,7 +221,7 @@ netlib :
210221

211222
else
212223
netlib : lapack_prebuild
213-
ifndef NOFORTRAN
224+
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
214225
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
215226
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
216227
endif
@@ -231,7 +242,10 @@ prof_lapack : lapack_prebuild
231242
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
232243

233244
lapack_prebuild :
234-
ifndef NOFORTRAN
245+
$(info filter value of NOFORTRAN is:)
246+
$(info x$(filter-out $(NOFORTRAN), 1 2)x)
247+
248+
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
235249
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
236250
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
237251
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -274,21 +288,21 @@ endif
274288
endif
275289

276290
large.tgz :
277-
ifndef NOFORTRAN
291+
ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
278292
if [ ! -a $< ]; then
279293
-wget http://www.netlib.org/lapack/timing/large.tgz;
280294
fi
281295
endif
282296

283297
timing.tgz :
284-
ifndef NOFORTRAN
298+
ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
285299
if [ ! -a $< ]; then
286300
-wget http://www.netlib.org/lapack/timing/timing.tgz;
287301
fi
288302
endif
289303

290304
lapack-timing : large.tgz timing.tgz
291-
ifndef NOFORTRAN
305+
ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
292306
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
293307
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
294308
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING

Makefile.rule

+13-2
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,14 @@ VERSION = 0.3.1.dev
6060
# This flag is always set for POWER8. Don't modify the flag
6161
# USE_OPENMP = 1
6262

63+
# The OpenMP scheduler to use - by default this is "static" and you
64+
# will normally not want to change this unless you know that your main
65+
# workload will involve tasks that have highly unbalanced running times
66+
# for individual threads. Changing away from "static" may also adversely
67+
# affect memory access locality in NUMA systems. Setting to "runtime" will
68+
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
69+
# CCOMMON_OPT += -DOMP_SCHED=dynamic
70+
6371
# You can define maximum number of threads. Basically it should be
6472
# less than actual number of cores. If you don't specify one, it's
6573
# automatically detected by the script.
@@ -156,8 +164,11 @@ NO_AFFINITY = 1
156164
# CONSISTENT_FPCSR = 1
157165

158166
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
159-
# with single thread. You can use this flag to avoid the overhead of multi-threading
160-
# in small matrix sizes. The default value is 4.
167+
# with single thread. (Actually in recent versions this is a factor proportional to the
168+
# number of floating point operations necessary for the given problem size, no longer
169+
# an individual dimension). You can use this setting to avoid the overhead of multi-
170+
# threading in small matrix sizes. The default value is 4, but values as high as 50 have
171+
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
161172
# GEMM_MULTITHREAD_THRESHOLD = 4
162173

163174
# If you need a sanity check by comparing against reference BLAS. It'll be very

Makefile.x86_64

+7
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@ endif
88
endif
99
endif
1010

11+
ifeq ($(CORE), SKYLAKEX)
12+
ifndef NO_AVX512
13+
CCOMMON_OPT += -march=skylake-avx512
14+
FCOMMON_OPT += -march=skylake-avx512
15+
endif
16+
endif
17+
1118
ifeq ($(OSNAME), Interix)
1219
ARFLAGS = -m x64
1320
endif

cblas.h

+5
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
8282
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
8383
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
8484

85+
CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
86+
CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
87+
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
88+
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
89+
8590
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
8691
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
8792
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

common_stackalloc.h

+9-8
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4747
* - large enough to support all architectures and kernel
4848
* Choosing too small a SIZE will lead to stack smashing.
4949
*/
50-
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
51-
/* make it volatile because some function (ex: dgemv_n.S) */ \
52-
/* do not restore all register */ \
53-
volatile int stack_alloc_size = SIZE; \
54-
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
55-
stack_alloc_size = 0; \
56-
STACK_ALLOC_PROTECT_SET \
57-
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \
50+
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
51+
/* make it volatile because some function (ex: dgemv_n.S) */ \
52+
/* do not restore all register */ \
53+
volatile int stack_alloc_size = SIZE; \
54+
if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
55+
STACK_ALLOC_PROTECT_SET \
56+
/* Avoid declaring an array of length 0 */ \
57+
TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \
58+
__attribute__((aligned(0x20))); \
5859
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
5960
#else
6061
//Original OpenBLAS/GotoBLAS codes.

common_x86_64.h

+7-2
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,13 @@
6060
#endif
6161
*/
6262

63-
#define MB
64-
#define WMB
63+
#ifdef __GNUC__
64+
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
65+
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
66+
#else
67+
#define MB do {} while (0)
68+
#define WMB do {} while (0)
69+
#endif
6570

6671
static void __inline blas_lock(volatile BLASULONG *address){
6772

cpuid_x86.c

+17
Original file line numberDiff line numberDiff line change
@@ -1339,6 +1339,23 @@ int get_cpuname(void){
13391339
return CPUTYPE_NEHALEM;
13401340
}
13411341
break;
1342+
case 6:
1343+
switch (model) {
1344+
case 6: // Cannon Lake
1345+
#ifndef NO_AVX512
1346+
return CPUTYPE_SKYLAKEX;
1347+
#else
1348+
if(support_avx())
1349+
#ifndef NO_AVX2
1350+
return CPUTYPE_HASWELL;
1351+
#else
1352+
return CPUTYPE_SANDYBRIDGE;
1353+
#endif
1354+
else
1355+
return CPUTYPE_NEHALEM;
1356+
#endif
1357+
}
1358+
break;
13421359
case 9:
13431360
case 8:
13441361
switch (model) {

ctest/Makefile

+7-1
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,13 @@ clean ::
102102
rm -f x*
103103

104104
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
105-
CEXTRALIB =
105+
ifeq ($(USE_OPENMP), 1)
106+
ifeq ($(F_COMPILER), GFORTRAN)
107+
ifeq ($(C_COMPILER), CLANG)
108+
CEXTRALIB = -lomp
109+
endif
110+
endif
111+
endif
106112

107113
# Single real
108114
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)

driver/level3/level3_thread.c

+18-22
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,7 @@
9191
#endif
9292

9393
typedef struct {
94-
#if __STDC_VERSION__ >= 201112L
95-
_Atomic
96-
#else
9794
volatile
98-
#endif
9995
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
10096
} job_t;
10197

@@ -348,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
348344
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
349345
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
350346

351-
/* Make sure if no one is using workspace */
352-
START_RPCC();
353-
for (i = 0; i < args -> nthreads; i++)
354-
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
355-
STOP_RPCC(waiting1);
356-
357347
#if defined(FUSED_GEMM) && !defined(TIMING)
358348

359349
/* Fused operation to copy region of B into workspace and apply kernel */
@@ -391,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
391381
}
392382
#endif
393383

394-
/* Set flag so other threads can access local region of B */
395-
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
384+
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
385+
/* Make sure if no one is using workspace */
386+
START_RPCC();
387+
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
388+
STOP_RPCC(waiting1);
389+
/* Set flag so other threads can access local region of B */
396390
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
397-
WMB;
391+
WMB;
392+
}
398393
}
399394

400395
/* Get regions of B from other threads and apply kernel */
@@ -413,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
413408

414409
/* Wait until other region of B is initialized */
415410
START_RPCC();
416-
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
411+
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
417412
STOP_RPCC(waiting2);
418413

419414
/* Apply kernel with local region of A and part of other region of B */
@@ -430,12 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
430425

431426
/* Clear synchronization flag if this thread is done with other region of B */
432427
if (m_to - m_from == min_i) {
433-
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
428+
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
429+
WMB;
434430
}
435431
}
436432
} while (current != mypos);
437433

438-
/* Iterate through steps of m
434+
/* Iterate through steps of m
439435
* Note: First step has already been finished */
440436
for(is = m_from + min_i; is < m_to; is += min_i){
441437
min_i = m_to - is;
@@ -465,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
465461
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
466462
c, ldc, is, js);
467463
STOP_RPCC(kernel);
468-
464+
469465
#ifdef TIMING
470466
ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
471467
#endif
472-
468+
473469
/* Clear synchronization flag if this thread is done with region of B */
474470
if (is + min_i >= m_to) {
475-
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
471+
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
476472
WMB;
477473
}
478474
}
@@ -492,7 +488,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
492488
START_RPCC();
493489
for (i = 0; i < args -> nthreads; i++) {
494490
for (js = 0; js < DIVIDE_RATE; js++) {
495-
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
491+
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
496492
}
497493
}
498494
STOP_RPCC(waiting3);
@@ -658,8 +654,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
658654
}
659655

660656
/* Clear synchronization flags */
661-
for (i = 0; i < MAX_CPU_NUMBER; i++) {
662-
for (j = 0; j < MAX_CPU_NUMBER; j++) {
657+
for (i = 0; i < nthreads; i++) {
658+
for (j = 0; j < nthreads; j++) {
663659
for (k = 0; k < DIVIDE_RATE; k++) {
664660
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
665661
}

driver/others/blas_server_omp.c

+5-1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@
4848

4949
#else
5050

51+
#ifndef OMP_SCHED
52+
#define OMP_SCHED static
53+
#endif
54+
5155
int blas_server_avail = 0;
5256

5357
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
@@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
331335
break;
332336
}
333337

334-
#pragma omp parallel for schedule(static)
338+
#pragma omp parallel for schedule(OMP_SCHED)
335339
for (i = 0; i < num; i ++) {
336340

337341
#ifndef USE_SIMPLE_THREADED_LEVEL3

0 commit comments

Comments
 (0)