Skip to content

Commit ec0cac1

Browse files
authored
Merge pull request #4 from xianyi/develop
Update branch
2 parents 61659f8 + fd081a9 commit ec0cac1

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

71 files changed

+2839
-219
lines changed

CMakeLists.txt

+3-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
66
project(OpenBLAS C ASM)
77
set(OpenBLAS_MAJOR_VERSION 0)
88
set(OpenBLAS_MINOR_VERSION 3)
9-
set(OpenBLAS_PATCH_VERSION 1.dev)
9+
set(OpenBLAS_PATCH_VERSION 4.dev)
1010
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1111

1212
# Adhere to GNU filesystem layout conventions
@@ -150,6 +150,7 @@ endif()
150150

151151
# add objects to the openblas lib
152152
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
153+
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
153154

154155
# Android needs to explicitly link against libm
155156
if(ANDROID)
@@ -169,6 +170,7 @@ endif()
169170
# Set output for libopenblas
170171
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
171172
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
173+
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
172174

173175
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
174176
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )

Changelog.txt

+138
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,142 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.3
4+
31-Aug-2018
5+
6+
common:
7+
* thread memory allocation has been switched back to the method
8+
used before version 0.3.1 due to unexpected problems caused by
9+
the new code under some circumstances. A new compile-time option
10+
USE_TLS has been added to enable the new code, and it is hoped
11+
that this can become the default again in the next version.
12+
* LAPACK PR272 has been integrated, which fixes spurious errors
13+
in DSYEVR and related functions caused by missing conversion
14+
from ILAENV to ILAENV_2STAGE in several _2stage routines.
15+
* the cmake-generated OpenBLASConfig.cmake now uses correct case
16+
for the name of the library
17+
* added support for Haiku OS
18+
19+
x86_64:
20+
* added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY,
21+
DSCAL, DGEMVN and DSYMVL
22+
* added a workaround for a cygwin issue that prevented compilation
23+
of AVX512 code
24+
25+
IBM Z:
26+
* added autodetection of Z14
27+
* fixed TRMM errors in the generic target
28+
29+
====================================================================
30+
Version 0.3.2
31+
30-Jul-2018
32+
33+
common:
34+
* fixes for regressions caused by the rewrite of the thread
35+
initialization code in 0.3.1
36+
37+
POWER:
38+
* fixed cpu autodetection for the BSDs
39+
40+
MIPS64:
41+
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
42+
43+
x86_64:
44+
* added autodetection of AMD Ryzen 2
45+
* fixed build with older versions of MSVC
46+
47+
====================================================================
48+
Version 0.3.1
49+
01-Jul-2018
50+
51+
common:
52+
* rewritten thread initialization code with significantly reduced overhead
53+
* added CBLAS interfaces to the IxAMIN BLAS extension functions
54+
* fixed the lapack-test target
55+
* CMAKE builds now create an OpenBLASConfig.cmake file
56+
* ZAXPY now uses a single thread for small input sizes
57+
* the LAPACK code was updated from Reference-LAPACK/lapack#253
58+
(fixing LAPACKE interfaces to Aasen's functions)
59+
60+
POWER:
61+
* corrected CROT and ZROT behaviour with zero INC_X
62+
63+
ARMV7:
64+
* corrected xDOT behaviour with zero INC_X or INC_Y
65+
66+
x86_64:
67+
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
68+
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
69+
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
70+
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows you to
71+
specify the list of x86_64 targets to include. Any target not on the list will be supported
72+
by the Sandybridge or Nehalem kernels if available, or by Prescott.
73+
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
74+
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
75+
* added autodetection of Intel Cannon Lake series as Skylake X
76+
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
77+
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
78+
recent mingw from MSYS2
79+
* fixed a link error in mixed clang/gfortran builds with OpenMP
80+
* updated the OSX deployment target to 10.8
81+
* switched on parallel make for builds on MS Windows by default
82+
83+
x86:
84+
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
85+
86+
====================================================================
87+
Version 0.3.0
88+
23-May-2018
89+
90+
common:
91+
* fixed some more thread race and locking bugs
92+
* added preliminary support for calling an OpenMP build of the library from multiple threads
93+
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
94+
* general code cleanup
95+
* optimized DSDOT implementation
96+
* improved thread distribution for GEMM
97+
* corrected IMATCOPY/OMATCOPY implementation
98+
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
99+
* cmake build improvements
100+
* pkgconfig file now contains build options
101+
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
102+
* corrections and improvements for systems with more than 64 cpus
103+
* LAPACK code updated to 3.8.0 including later fixes
104+
* added ReLAPACK, a recursive implementation of several LAPACK functions
105+
* Rewrote ROTMG to handle cases that the netlib code failed to address
106+
* Disabled (broken) multithreading code for xTRMV
107+
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
108+
* shared memory access failures on startup are now handled more gracefully
109+
* restored utests from earlier releases (and made them pass on all affected systems)
110+
111+
SPARC:
112+
* several fixes for cpu autodetection
113+
114+
POWER:
115+
* corrected vector register overwriting in several Power8 kernels
116+
* optimized additional BLAS functions
117+
118+
ARM:
119+
* added support for CortexA53 and A72
120+
* added autodetection for ThunderX2T99
121+
* made most optimized kernels the default for generic ARMv8 targets
122+
123+
x86_64:
124+
* parallelized DDOT kernel for Haswell
125+
* changed alignment directives in assembly kernels to boost performance on OSX
126+
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
127+
* added support for building on OpenBSD and Dragonfly
128+
* updated compiler options to work with Intel release 2018
129+
* support fully optimized build with clang/flang on Microsoft Windows
130+
* fixed building on AIX
131+
132+
IBM Z:
133+
* added optimized BLAS 1/2 functions
134+
135+
MIPS:
136+
* fixed cpu autodetection helper code
137+
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
138+
* added mips64 I6500 cpu
139+
2140
====================================================================
3141
Version 0.2.20
4142
24-Jul-2017

Makefile

+3-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ endif
9797

9898
shared :
9999
ifndef NO_SHARED
100-
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
100+
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
101101
@$(MAKE) -C exports so
102102
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
103103
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
267267
ifdef SMP
268268
ifeq ($(OSNAME), WINNT)
269269
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
270+
else ifeq ($(OSNAME), Haiku)
271+
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
270272
else
271273
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
272274
endif

Makefile.install

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ endif
6666
#for install shared library
6767
ifndef NO_SHARED
6868
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
69-
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
69+
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
7070
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
7171
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
7272
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \

Makefile.rule

+8-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.1.dev
6+
VERSION = 0.3.4.dev
77

88
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
99
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -107,7 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1
107107
# BUILD_RELAPACK = 1
108108

109109
# If you want to use legacy threaded Level 3 implementation.
110-
# USE_SIMPLE_THREADED_LEVEL3 = 1
110+
USE_SIMPLE_THREADED_LEVEL3 = 1
111+
112+
# If you want to use the new, still somewhat experimental code that uses
113+
# thread-local storage instead of a central memory buffer in memory.c
114+
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
115+
# for this to work.
116+
USE_TLS = 1
111117

112118
# If you want to drive whole 64bit region by BLAS. Not all Fortran
113119
# compilers support this. It's safe to keep it commented out if you

Makefile.system

+4
Original file line numberDiff line numberDiff line change
@@ -1018,6 +1018,10 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
10181018
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
10191019
endif
10201020

1021+
ifdef USE_TLS
1022+
CCOMMON_OPT += -DUSE_TLS
1023+
endif
1024+
10211025
ifndef SYMBOLPREFIX
10221026
SYMBOLPREFIX =
10231027
endif

Makefile.x86_64

+3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX)
1212
ifndef NO_AVX512
1313
CCOMMON_OPT += -march=skylake-avx512
1414
FCOMMON_OPT += -march=skylake-avx512
15+
ifeq ($(OSNAME), CYGWIN_NT)
16+
CCOMMON_OPT += -fno-asynchronous-unwind-tables
17+
endif
1518
endif
1619
endif
1720

README.md

+2
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`.
110110
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
111111
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
112112
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
113+
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
113114
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
114115
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
115116
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
@@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
200201
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
201202
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
202203
Clang 3.0 will generate the wrong AVX binary code.
204+
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
203205
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
204206
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
205207
the library with `BIGNUMA=1`.

benchmark/gemv.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
122122

123123
FLOAT *a, *x, *y;
124124
FLOAT alpha[] = {1.0, 1.0};
125-
FLOAT beta [] = {1.0, 1.0};
125+
FLOAT beta [] = {1.0, 0.0};
126126
char trans='N';
127127
blasint m, i, j;
128128
blasint inc_x=1,inc_y=1;

c_check

+3-1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
6464
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
6565
$os = Interix if ($data =~ /OS_INTERIX/);
6666
$os = Android if ($data =~ /OS_ANDROID/);
67+
$os = Haiku if ($data =~ /OS_HAIKU/);
6768

6869
$architecture = x86 if ($data =~ /ARCH_X86/);
6970
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
@@ -223,14 +224,15 @@ $data =~ /globl\s([_\.]*)(.*)/;
223224
$need_fu = $1;
224225

225226
$cross = 0;
226-
$cross = 1 if ($os ne $hostos);
227227

228228
if ($architecture ne $hostarch) {
229229
$cross = 1;
230230
$cross = 0 if (($hostarch eq "x86_64") && ($architecture eq "x86"));
231231
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
232232
}
233233

234+
$cross = 1 if ($os ne $hostos);
235+
234236
$openmp = "" if $ENV{USE_OPENMP} != 1;
235237

236238
$linker_L = "";

cblas.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=1
5151
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
5252
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
5353
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
54-
54+
typedef CBLAS_ORDER CBLAS_LAYOUT;
55+
5556
float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
5657
double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
5758
float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);

cmake/prebuild.cmake

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
8585
endif ()
8686

8787
# Cannot run getarch on target if we are cross-compiling
88-
if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
88+
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
8989
# Write to config as getarch would
9090

9191
# TODO: Set up defines that getarch sets up based on every other target

cmake/system.cmake

+4
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,10 @@ if (CONSISTENT_FPCSR)
214214
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
215215
endif ()
216216

217+
if (USE_TLS)
218+
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS")
219+
endif ()
220+
217221
# Only for development
218222
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
219223
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")

cmake/system_check.cmake

+1-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ endif()
6868

6969
if (X86_64 OR X86)
7070
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
71-
execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
71+
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
7272
if (NO_AVX512 EQUAL 1)
7373
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
7474
endif()

common.h

+10
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ extern "C" {
105105
#endif
106106
#endif
107107

108+
#ifdef OS_HAIKU
109+
#define NO_SYSV_IPC
110+
#endif
111+
108112
#ifdef OS_WINDOWS
109113
#ifdef ATOM
110114
#define GOTO_ATOM ATOM
@@ -253,8 +257,14 @@ typedef unsigned long BLASULONG;
253257

254258
#ifdef USE64BITINT
255259
typedef BLASLONG blasint;
260+
#if defined(OS_WINDOWS) && defined(__64BIT__)
261+
#define blasabs(x) llabs(x)
262+
#else
263+
#define blasabs(x) labs(x)
264+
#endif
256265
#else
257266
typedef int blasint;
267+
#define blasabs(x) abs(x)
258268
#endif
259269
#else
260270
#ifdef USE64BITINT

0 commit comments

Comments
 (0)