Skip to content

Commit 4f43668

Browse files
authored
Merge pull request #2 from xianyi/develop
merge develop
2 parents 76a66ea + b0c15ba commit 4f43668

16 files changed

+127
-12
lines changed

Makefile.install

+29
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ ifndef NO_CBLAS
4848
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
4949
endif
5050

51+
ifneq ($(OSNAME), AIX)
5152
ifndef NO_LAPACKE
5253
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
5354
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@@ -72,6 +73,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
7273
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
7374
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
7475
endif
76+
7577
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
7678
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
7779
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
@@ -93,6 +95,33 @@ ifeq ($(OSNAME), CYGWIN_NT)
9395
endif
9496
endif
9597

98+
else
99+
# install on AIX uses a different option syntax (installbsd -c -m)
100+
ifndef NO_LAPACKE
101+
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
102+
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
103+
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
104+
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
105+
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
106+
endif
107+
108+
#for install static library
109+
ifndef NO_STATIC
110+
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
111+
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
112+
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
113+
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
114+
endif
115+
#for install shared library
116+
ifndef NO_SHARED
117+
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
118+
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
119+
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
120+
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
121+
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
122+
endif
123+
124+
endif
96125

97126
#Generating openblas.pc
98127
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"

Makefile.rule

+2-2
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,8 @@ NO_AFFINITY = 1
192192
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
193193
# COMMON_OPT = -O2
194194

195-
# gfortran option for LAPACK
196-
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
195+
# gfortran option for LAPACK to improve thread-safety
196+
# It is enabled by default in Makefile.system for gfortran
197197
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
198198
# FCOMMON_OPT = -frecursive
199199

Makefile.system

+6
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,8 @@ endif
725725
ifeq ($(F_COMPILER), GFORTRAN)
726726
CCOMMON_OPT += -DF_INTERFACE_GFORT
727727
FCOMMON_OPT += -Wall
728+
# make single-threaded LAPACK calls thread-safe #1847
729+
FCOMMON_OPT += -frecursive
728730
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
729731
ifneq ($(NO_LAPACK), 1)
730732
EXTRALIB += -lgfortran
@@ -1211,7 +1213,11 @@ endif
12111213

12121214
LIBDLLNAME = $(LIBPREFIX).dll
12131215
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
1216+
ifneq ($(OSNAME), AIX)
12141217
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
1218+
else
1219+
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
1220+
endif
12151221
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
12161222
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
12171223
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)

cmake/fc.cmake

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ endif ()
4444

4545
if (${F_COMPILER} STREQUAL "GFORTRAN")
4646
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
47-
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
47+
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
4848
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
4949
if (NOT NO_LAPACK)
5050
set(EXTRALIB "${EXTRALIB} -lgfortran")

common.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ extern "C" {
183183

184184
#define ALLOCA_ALIGN 63UL
185185

186-
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
186+
#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
187187

188188
#ifdef NEEDBUNDERSCORE
189189
#define BLASFUNC(FUNC) FUNC##_

cpuid_arm64.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,8 @@ int detect(void)
115115
fclose(infile);
116116
if(cpu_part != NULL && cpu_implementer != NULL) {
117117
if (strstr(cpu_implementer, "0x41") &&
118-
(strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08") || strstr(cpu_part,"0xd03") ))
119-
return CPU_CORTEXA57; //or compatible A53, A72
118+
(strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08")))
119+
return CPU_CORTEXA57; //or compatible, ex. A72
120120
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42"))
121121
return CPU_VULCAN;
122122
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))

cpuid_x86.c

+2
Original file line numberDiff line numberDiff line change
@@ -2009,6 +2009,8 @@ int get_coretype(void){
20092009
switch (model) {
20102010
case 1:
20112011
// AMD Ryzen
2012+
case 8:
2013+
// Ryzen 2
20122014
if(support_avx())
20132015
#ifndef NO_AVX2
20142016
return CORE_ZEN;

driver/level3/level3_thread.c

+22
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@
4848
#define SWITCH_RATIO 2
4949
#endif
5050

51+
#ifndef GEMM_PREFERED_SIZE
52+
#define GEMM_PREFERED_SIZE 1
53+
#endif
54+
5155
//The array of job_t may overflow the stack.
5256
//Instead, use malloc to alloc job_t.
5357
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
510514
return 0;
511515
}
512516

517+
// Round a proposed partition width up to a whole multiple of `multiple`.
//   remainder - amount of work still unassigned (gemm_driver passes the remaining m or n)
//   width     - proposed width of the next partition
//   multiple  - preferred granularity (gemm_driver passes GEMM_PREFERED_SIZE)
// Returns `width` unchanged when `multiple > remainder` or `width <= multiple`;
// otherwise, for positive inputs, the smallest multiple of `multiple` that is >= width.
// The result may exceed `remainder`; the callers clamp it after subtracting.
// NOTE(review): parameters are int while gemm_driver appears to work in BLASLONG —
// confirm the implicit narrowing is acceptable for very large dimensions.
static int round_up(int remainder, int width, int multiple)
518+
{
519+
// Skip rounding when less than one full multiple of work remains, or when
// the width does not exceed a single multiple.
if (multiple > remainder || width <= multiple)
520+
return width;
521+
// Ceiling division, then scale back up to a multiple.
width = (width + multiple - 1) / multiple;
522+
width = width * multiple;
523+
return width;
524+
}
525+
526+
513527
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
514528
*range_n, FLOAT *sa, FLOAT *sb,
515529
BLASLONG nthreads_m, BLASLONG nthreads_n) {
@@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
601615
num_parts = 0;
602616
while (m > 0){
603617
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
618+
619+
width = round_up(m, width, GEMM_PREFERED_SIZE);
620+
604621
m -= width;
622+
605623
if (m < 0) width = width + m;
606624
range_M[num_parts + 1] = range_M[num_parts] + width;
625+
607626
num_parts ++;
608627
}
609628
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
@@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
645664
if (width < SWITCH_RATIO) {
646665
width = SWITCH_RATIO;
647666
}
667+
width = round_up(n, width, GEMM_PREFERED_SIZE);
668+
648669
n -= width;
649670
if (n < 0) width = width + n;
650671
range_N[num_parts + 1] = range_N[num_parts] + width;
672+
651673
num_parts ++;
652674
}
653675
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {

driver/others/blas_server.c

+5
Original file line numberDiff line numberDiff line change
@@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) {
850850

851851
long i;
852852

853+
#ifdef SMP_SERVER
854+
// Handle lazy re-init of the thread-pool after a POSIX fork
855+
if (unlikely(blas_server_avail == 0)) blas_thread_init();
856+
#endif
857+
853858
if (num_threads < 1) num_threads = blas_num_threads;
854859

855860
#ifndef NO_AFFINITY

driver/others/blas_server_win32.c

+6-1
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,12 @@ int BLASFUNC(blas_thread_shutdown)(void){
478478

479479
void goto_set_num_threads(int num_threads)
480480
{
481-
long i;
481+
long i;
482+
483+
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
484+
// Handle lazy re-init of the thread-pool after a POSIX fork
485+
if (unlikely(blas_server_avail == 0)) blas_thread_init();
486+
#endif
482487

483488
if (num_threads < 1) num_threads = blas_cpu_number;
484489

driver/others/memory.c

+26
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,16 @@ int get_num_procs(void) {
259259
}
260260
#endif
261261

262+
#ifdef OS_AIX
263+
// AIX variant: return the processor count reported by
// sysconf(_SC_NPROCESSORS_CONF). The result is kept in a function-local
// static, so sysconf() is queried again only while the stored value is 0.
// NOTE(review): sysconf() can return -1 on error; that value would be stored
// and returned as-is — confirm callers tolerate a negative count.
int get_num_procs(void) {
264+
static int nums = 0;
265+
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
266+
return nums;
267+
}
268+
#endif
269+
270+
271+
262272
#ifdef OS_WINDOWS
263273

264274
int get_num_procs(void) {
@@ -1738,6 +1748,22 @@ int get_num_procs(void) {
17381748
return nums;
17391749
}
17401750
#endif
1751+
1752+
#ifdef OS_HAIKU
1753+
// Haiku variant: return the processor count reported by
// sysconf(_SC_NPROCESSORS_CONF). The result is kept in a function-local
// static, so sysconf() is queried again only while the stored value is 0.
// NOTE(review): sysconf() can return -1 on error; that value would be stored
// and returned as-is — confirm callers tolerate a negative count.
int get_num_procs(void) {
1754+
static int nums = 0;
1755+
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
1756+
return nums;
1757+
}
1758+
#endif
1759+
1760+
#ifdef OS_AIX
1761+
// AIX variant: return the processor count reported by
// sysconf(_SC_NPROCESSORS_CONF). The result is kept in a function-local
// static, so sysconf() is queried again only while the stored value is 0.
// NOTE(review): sysconf() can return -1 on error; that value would be stored
// and returned as-is — confirm callers tolerate a negative count.
int get_num_procs(void) {
1762+
static int nums = 0;
1763+
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
1764+
return nums;
1765+
}
1766+
#endif
17411767

17421768
#ifdef OS_WINDOWS
17431769

interface/zswap.c

+9-1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,14 @@
4242
#include "functable.h"
4343
#endif
4444

45+
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
46+
// Multithreaded swap gives performance benefits in ThunderX2T99
47+
#else
48+
// Disable multi-threading as it does not show any performance
49+
// benefits. Keep the multi-threading code for the record.
50+
#undef SMP
51+
#endif
52+
4553
#ifndef CBLAS
4654

4755
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
@@ -81,7 +89,7 @@ FLOAT *y = (FLOAT*)vy;
8189
#ifdef SMP
8290
//disable multi-thread when incx==0 or incy==0
8391
//(in that case, the threads would be dependent), or when n is below the multithreading threshold.
84-
if (incx == 0 || incy == 0)
92+
if (incx == 0 || incy == 0 || n < 1048576 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT))
8593
nthreads = 1;
8694
else
8795
nthreads = num_cpu_avail(1);

kernel/x86_64/dgemm_beta_skylakex.c

+4-2
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
5555
return 0;
5656
}
5757

58+
if (m == 0 || n == 0)
59+
return 0;
5860

5961
c_offset = c;
6062

@@ -69,15 +71,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
6971

7072
i = m;
7173

72-
while (i > 32) {
74+
while (i >= 32) {
7375
_mm512_storeu_pd(c_offset1, z_zero);
7476
_mm512_storeu_pd(c_offset1 + 8, z_zero);
7577
_mm512_storeu_pd(c_offset1 + 16, z_zero);
7678
_mm512_storeu_pd(c_offset1 + 24 , z_zero);
7779
c_offset1 += 32;
7880
i -= 32;
7981
}
80-
while (i > 8) {
82+
while (i >= 8) {
8183
_mm512_storeu_pd(c_offset1, z_zero);
8284
c_offset1 += 8;
8385
i -= 8;

kernel/x86_64/sgemm_beta_skylakex.c

+4-2
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
5555
return 0;
5656
}
5757

58+
if (n == 0 || m == 0)
59+
return 0;
5860

5961
c_offset = c;
6062

@@ -71,13 +73,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
7173

7274
i = m;
7375

74-
while (i > 32) {
76+
while (i >= 32) {
7577
_mm512_storeu_ps(c_offset1, z_zero);
7678
_mm512_storeu_ps(c_offset1 + 16, z_zero);
7779
c_offset1 += 32;
7880
i -= 32;
7981
}
80-
while (i > 8) {
82+
while (i >= 8) {
8183
_mm256_storeu_ps(c_offset1, y_zero);
8284
c_offset1 += 8;
8385
i -= 8;

lapack-netlib/LAPACKE/include/lapacke_config.h

+7
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,13 @@
3434
#ifndef _LAPACKE_CONFIG_H_
3535
#define _LAPACKE_CONFIG_H_
3636

37+
// For Android prior to API 21 (no <complex> include)
38+
#if defined(__ANDROID__)
39+
#if __ANDROID_API__ < 21
40+
#define LAPACK_COMPLEX_STRUCTURE
41+
#endif
42+
#endif
43+
3744
#ifdef __cplusplus
3845
#if defined(LAPACK_COMPLEX_CPP)
3946
#include <complex>

param.h

+1
Original file line numberDiff line numberDiff line change
@@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
16271627
#define SYMV_P 8
16281628

16291629
#define SWITCH_RATIO 32
1630+
#define GEMM_PREFERED_SIZE 32
16301631

16311632
#ifdef ARCH_X86
16321633

0 commit comments

Comments
 (0)