Skip to content

Commit 9b3965b

Browse files
authored
Merge pull request #6 from xianyi/develop
rebase
2 parents 114eb15 + 531cb4f commit 9b3965b

10 files changed

+209
-27
lines changed

Makefile

+6
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ endif
5959
@$(CC) --version > /dev/null 2>&1;\
6060
if [ $$? -eq 0 ]; then \
6161
cverinfo=`$(CC) --version | sed -n '1p'`; \
62+
if [ -z "$${cverinfo}" ]; then \
63+
cverinfo=`$(CC) --version | sed -n '2p'`; \
64+
fi; \
6265
echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\
6366
else \
6467
echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\
@@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
6770
@$(FC) --version > /dev/null 2>&1;\
6871
if [ $$? -eq 0 ]; then \
6972
fverinfo=`$(FC) --version | sed -n '1p'`; \
73+
if [ -z "$${fverinfo}" ]; then \
74+
fverinfo=`$(FC) --version | sed -n '2p'`; \
75+
fi; \
7076
echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\
7177
else \
7278
echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\

Makefile.power

+2
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@ USE_OPENMP = 1
1010
endif
1111

1212
ifeq ($(CORE), POWER10)
13+
ifneq ($(C_COMPILER), PGI)
1314
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
1415
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
1516
endif
17+
endif
1618

1719
ifeq ($(CORE), POWER9)
1820
ifneq ($(C_COMPILER), PGI)

Makefile.system

+24-3
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ endif
181181

182182
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
183183
ifeq ($(HOSTARCH), x86_64)
184-
ifeq ($(findstring pgcc,$(HOSTCC)),)
184+
ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),)
185185
GETARCH_FLAGS += -march=native
186186
endif
187187
endif
@@ -663,6 +663,7 @@ endif
663663
endif # ARCH zarch
664664

665665
ifeq ($(ARCH), power)
666+
ifneq ($(C_COMPILER), PGI)
666667
DYNAMIC_CORE = POWER6
667668
DYNAMIC_CORE += POWER8
668669
ifneq ($(C_COMPILER), GCC)
@@ -689,6 +690,10 @@ else
689690
# GNU make only treats "info" as a function when the name is followed by
# whitespace; "$(info, ...)" is parsed as a reference to an (unset) variable
# and expands to nothing, so the message would never be shown.
$(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
690691
endif
691692
endif
693+
else
694+
DYNAMIC_CORE = POWER8
695+
DYNAMIC_CORE += POWER9
696+
endif
692697
endif
693698

694699
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
@@ -847,9 +852,19 @@ endif
847852
endif
848853

849854
ifeq ($(C_COMPILER), PGI)
855+
PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
856+
PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20)
857+
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11)
858+
# Three-digit flag string built from the expr results defined just above:
#   digit 1: major version >  20
#   digit 2: major version >= 20
#   digit 3: characters 4-5 of the version string == 11 (presumably the minor version)
# The second digit must come from PGCVERSIONGTEQ20 (the variable that is
# actually defined); an undefined name here would shorten the string to two
# digits and the comparison against 110/111/011 could never succeed.
PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONGTEQ20)$(PGCMINORVERSIONGE11)
859+
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011))
860+
NEWPGI := 1
861+
endif
850862
ifdef BINARY64
851863
ifeq ($(ARCH), x86_64)
852-
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
864+
CCOMMON_OPT += -tp p7-64
865+
ifneq ($(NEWPGI),1)
866+
CCOMMON_OPT += -D__MMX__ -Mnollvm
867+
endif
853868
else
854869
ifeq ($(ARCH), power)
855870
ifeq ($(CORE), POWER8)
@@ -1029,18 +1044,24 @@ ifeq ($(ARCH), x86_64)
10291044
FCOMMON_OPT += -tp p7-64
10301045
else
10311046
ifeq ($(ARCH), power)
1047+
ifeq ($(CORE), POWER6)
1048+
$(warning NVIDIA HPC compilers do not support POWER6.)
1049+
endif
10321050
ifeq ($(CORE), POWER8)
10331051
FCOMMON_OPT += -tp pwr8
10341052
endif
10351053
ifeq ($(CORE), POWER9)
10361054
FCOMMON_OPT += -tp pwr9
10371055
endif
1056+
ifeq ($(CORE), POWER10)
1057+
$(warning NVIDIA HPC compilers do not support POWER10.)
1058+
endif
10381059
endif
10391060
endif
10401061
else
10411062
FCOMMON_OPT += -tp p7
10421063
endif
1043-
FCOMMON_OPT += -Mrecursive
1064+
FCOMMON_OPT += -Mrecursive -Kieee
10441065
ifeq ($(USE_OPENMP), 1)
10451066
FCOMMON_OPT += -mp
10461067
endif

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/sta
1313

1414
## Introduction
1515

16-
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
16+
OpenBLAS is an optimized Basic Linear Algebra Subprograms library based on GotoBLAS2 1.13 BSD version.
1717

1818
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
1919

driver/others/dynamic_power.c

+151
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@ static char *corename[] = {
2727
#define NUM_CORETYPES 4
2828

2929
char *gotoblas_corename(void) {
30+
#ifndef C_PGI
3031
if (gotoblas == &gotoblas_POWER6) return corename[1];
32+
#endif
3133
if (gotoblas == &gotoblas_POWER8) return corename[2];
3234
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
3335
if (gotoblas == &gotoblas_POWER9) return corename[3];
@@ -38,10 +40,157 @@ char *gotoblas_corename(void) {
3840
return corename[0];
3941
}
4042

43+
#ifdef C_PGI
44+
/*
45+
* NV HPC compilers do not yet implement __builtin_cpu_is().
46+
* Fake a version here for use in the CPU detection code below.
47+
*
48+
* Strategy here is to first check the CPU to see what it actually is,
49+
* and then test the input to see if what the CPU actually is matches
50+
* what was requested.
51+
*/
52+
53+
#include <string.h>
54+
55+
/*
56+
* Define POWER processor version table.
57+
*
58+
* NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time
59+
*/
60+
61+
/* CPU family codes stored in pvrPOWER[].cpu_type. */
#define CPU_UNKNOWN 0
#define CPU_POWER5 5
#define CPU_POWER6 6
#define CPU_POWER8 8
#define CPU_POWER9 9
#define CPU_POWER10 10

/*
 * Processor Version Register (PVR) lookup table.
 *
 * An entry matches when (pvr & pvr_mask) == pvr_value.  The table is meant to
 * be scanned front to back and the first match wins, so more specific masks
 * must precede more general ones.  The last entry has mask and value zero and
 * therefore matches every PVR; it must stay last and acts as the "unknown"
 * fallback.  The table is never modified at run time, hence const.
 */
static const struct {
    uint32_t pvr_mask;
    uint32_t pvr_value;
    const char *cpu_name;
    uint32_t cpu_type;
} pvrPOWER[] = {

    { /* POWER6 in P5+ mode; 2.04-compliant processor */
        .pvr_mask = 0xffffffff,
        .pvr_value = 0x0f000001,
        .cpu_name = "POWER5+",
        .cpu_type = CPU_POWER5,
    },

    { /* Power6 aka POWER6X */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x003e0000,
        .cpu_name = "POWER6 (raw)",
        .cpu_type = CPU_POWER6,
    },

    { /* Power7 (classified as CPU_POWER6; there is no separate POWER7 code) */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x003f0000,
        .cpu_name = "POWER7 (raw)",
        .cpu_type = CPU_POWER6,
    },

    { /* Power7+ (classified as CPU_POWER6; there is no separate POWER7 code) */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x004A0000,
        .cpu_name = "POWER7+ (raw)",
        .cpu_type = CPU_POWER6,
    },

    { /* Power8E */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x004b0000,
        .cpu_name = "POWER8E (raw)",
        .cpu_type = CPU_POWER8,
    },

    { /* Power8NVL */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x004c0000,
        .cpu_name = "POWER8NVL (raw)",
        .cpu_type = CPU_POWER8,
    },

    { /* Power8 */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x004d0000,
        .cpu_name = "POWER8 (raw)",
        .cpu_type = CPU_POWER8,
    },

    { /* Power9 DD2.0 */
        .pvr_mask = 0xffffefff,
        .pvr_value = 0x004e0200,
        .cpu_name = "POWER9 (raw)",
        .cpu_type = CPU_POWER9,
    },

    { /* Power9 DD 2.1 */
        .pvr_mask = 0xffffefff,
        .pvr_value = 0x004e0201,
        .cpu_name = "POWER9 (raw)",
        .cpu_type = CPU_POWER9,
    },

    { /* Power9 DD2.2 or later */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x004e0000,
        .cpu_name = "POWER9 (raw)",
        .cpu_type = CPU_POWER9,
    },

    { /* Power10 */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x00800000,
        .cpu_name = "POWER10 (raw)",
        .cpu_type = CPU_POWER10,
    },

    { /* End of table, pvr_mask and pvr_value must be zero */
        .pvr_mask = 0x0,
        .pvr_value = 0x0,
        .cpu_name = "Unknown",
        .cpu_type = CPU_UNKNOWN,
    },
};
159+
160+
static int __builtin_cpu_is(const char *cpu) {
161+
int i;
162+
uint32_t pvr;
163+
uint32_t cpu_type;
164+
165+
asm("mfpvr %0" : "=r"(pvr));
166+
167+
for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) {
168+
if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
169+
break;
170+
}
171+
}
172+
173+
#if defined(DEBUG)
174+
printf("%s: returning CPU=%s, cpu_type=%p\n", __func__,
175+
pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
176+
#endif
177+
cpu_type = pvrPOWER[i].cpu_type;
178+
179+
if (!strcmp(cpu, "power8"))
180+
return cpu_type == CPU_POWER8;
181+
if (!strcmp(cpu, "power9"))
182+
return cpu_type == CPU_POWER9;
183+
return 0;
184+
}
185+
186+
#endif /* C_PGI */
187+
41188
static gotoblas_t *get_coretype(void) {
42189

190+
#ifndef C_PGI
43191
if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
44192
return &gotoblas_POWER6;
193+
#endif
45194
if (__builtin_cpu_is("power8"))
46195
return &gotoblas_POWER8;
47196
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
@@ -77,7 +226,9 @@ static gotoblas_t *force_coretype(char * coretype) {
77226

78227
switch (found)
79228
{
229+
#ifndef C_PGI
80230
case 1: return (&gotoblas_POWER6);
231+
#endif
81232
case 2: return (&gotoblas_POWER8);
82233
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
83234
case 3: return (&gotoblas_POWER9);

f_check

+5-6
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ if ($compiler eq "") {
3232
"xlf95", "xlf90", "xlf",
3333
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
3434
"pathf90", "pathf95",
35-
"pgf95", "pgf90", "pgf77",
35+
"pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran",
3636
"flang", "egfortran",
3737
"ifort");
3838

@@ -64,7 +64,6 @@ if ($compiler eq "") {
6464
if (!$?) {
6565

6666
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`;
67-
6867
if ($data =~ /zhoge_/) {
6968
$bu = "_";
7069
}
@@ -87,7 +86,7 @@ if ($compiler eq "") {
8786
if ($compiler =~ /flang/) {
8887
$vendor = FLANG;
8988
$openmp = "-fopenmp";
90-
} elsif ($compiler =~ /pgf/) {
89+
} elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
9190
$vendor = PGI;
9291
$openmp = "-mp";
9392
} else {
@@ -123,7 +122,7 @@ if ($compiler eq "") {
123122
$openmp = "-mp";
124123
}
125124

126-
if ($data =~ /PGF/) {
125+
if ($data =~ /PGF/ || $data =~ /NVF/) {
127126
$vendor = PGI;
128127
$openmp = "-mp";
129128
}
@@ -177,7 +176,7 @@ if ($compiler eq "") {
177176
$openmp = "-mp";
178177
}
179178

180-
if ($compiler =~ /pgf/) {
179+
if ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
181180
$vendor = PGI;
182181
$bu = "_";
183182
$openmp = "-mp";
@@ -330,7 +329,7 @@ if ($link ne "") {
330329
$flags =~ s/\@/\,/g;
331330
$linker_L .= "-Wl,". $flags . " " ;
332331
}
333-
if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
332+
if ($flags =~ /-lgomp/ && $CC =~ /clang/) {
334333
$flags = "-lomp";
335334
}
336335

kernel/Makefile

+2-2
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
3636
ifeq ($(GCCVERSIONGTEQ10), 1)
3737
override CFLAGS += -march=cooperlake
3838
else
39-
override CFLAGS += -march=skylake-avx512
39+
override CFLAGS += -march=skylake-avx512 -mavx512f
4040
endif
4141
ifeq ($(OSNAME), CYGWIN_NT)
4242
override CFLAGS += -fno-asynchronous-unwind-tables
@@ -47,7 +47,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
4747
endif
4848
endif
4949
else ifeq ($(TARGET_CORE), SKYLAKEX)
50-
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
50+
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f
5151
ifeq ($(OSNAME), CYGWIN_NT)
5252
override CFLAGS += -fno-asynchronous-unwind-tables
5353
endif

kernel/arm64/KERNEL.NEOVERSEN1

+4-4
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c
9191
ICAMAXKERNEL = izamax_thunderx2t99.c
9292
IZAMAXKERNEL = izamax_thunderx2t99.c
9393

94-
SNRM2KERNEL = scnrm2_thunderx2t99.c
95-
DNRM2KERNEL = dznrm2_thunderx2t99.c
96-
CNRM2KERNEL = scnrm2_thunderx2t99.c
97-
ZNRM2KERNEL = dznrm2_thunderx2t99.c
94+
SNRM2KERNEL = nrm2.S
95+
DNRM2KERNEL = nrm2.S
96+
CNRM2KERNEL = znrm2.S
97+
ZNRM2KERNEL = znrm2.S
9898

9999
DDOTKERNEL = dot_thunderx2t99.c
100100
SDOTKERNEL = dot_thunderx2t99.c

kernel/arm64/KERNEL.THUNDERX2T99

+4-4
Original file line numberDiff line numberDiff line change
@@ -153,12 +153,12 @@ IDAMAXKERNEL = iamax_thunderx2t99.c
153153
ICAMAXKERNEL = izamax_thunderx2t99.c
154154
IZAMAXKERNEL = izamax_thunderx2t99.c
155155

156-
SNRM2KERNEL = scnrm2_thunderx2t99.c
157-
CNRM2KERNEL = scnrm2_thunderx2t99.c
156+
SNRM2KERNEL = nrm2.S
157+
# Complex single precision uses the complex norm kernel (znrm2.S), matching
# the CNRM2KERNEL selection in KERNEL.NEOVERSEN1; nrm2.S is the one chosen there for the real S/D cases.
CNRM2KERNEL = znrm2.S
158158
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
159159
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
160-
DNRM2KERNEL = dznrm2_thunderx2t99.c
161-
ZNRM2KERNEL = dznrm2_thunderx2t99.c
160+
# Real double precision uses the real norm kernel (nrm2.S), matching the
# DNRM2KERNEL selection in KERNEL.NEOVERSEN1; znrm2.S is the one chosen there for the complex C/Z cases.
DNRM2KERNEL = nrm2.S
161+
ZNRM2KERNEL = znrm2.S
162162

163163

164164
DDOTKERNEL = dot_thunderx2t99.c

0 commit comments

Comments
 (0)