Skip to content

Commit 723776d

Browse files
authored
Merge pull request #4 from xianyi/develop
rebase
2 parents 7d81acc + 5a77ec7 commit 723776d

9 files changed

+206
-25
lines changed

CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
66
project(OpenBLAS C ASM)
77
set(OpenBLAS_MAJOR_VERSION 0)
88
set(OpenBLAS_MINOR_VERSION 3)
9-
set(OpenBLAS_PATCH_VERSION 12.dev)
9+
set(OpenBLAS_PATCH_VERSION 13.dev)
1010
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1111

1212
# Adhere to GNU filesystem layout conventions

Changelog.txt

+50
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,54 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.13
4+
12-Dec-2020
5+
6+
common:
7+
* Added a generic bfloat16 SBGEMV kernel
8+
* Fixed a potentially severe memory leak after fork in OpenMP builds
9+
that was introduced in 0.3.12
10+
* Added detection of the Fujitsu Fortran compiler
11+
* Added detection of the (e)gfortran compiler on OpenBSD
12+
* Added support for overriding the default name of the library independently
13+
from symbol suffixing in the gmake builds (already supported in cmake)
14+
15+
RISCV:
16+
* Added a RISC-V port optimized for C910V
17+
18+
POWER:
19+
* Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N
20+
* Improved DGEMM performance on POWER10
21+
* Improved STRSM and DTRSM performance on POWER9 and POWER10
22+
* Fixed segmentation faults in DYNAMIC_ARCH builds
23+
* Fixed compilation with the PGI compiler
24+
25+
x86:
26+
* Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12
27+
28+
x86_64:
29+
* Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake
30+
* Improved the performance of SASUM and DASUM kernels through parallelization
31+
* Improved the performance of SROT and DROT kernels
32+
* Improved the performance of multithreaded xSYRK
33+
* Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran
34+
(where linking of both the LLVM libomp and GNU libgomp could lead to lockups or
35+
wrong results)
36+
* Fixed miscompilations by old gcc 4.6
37+
* Fixed misdetection of AVX2 capability in some Sandybridge cpus
38+
* Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD
39+
40+
ARM64:
41+
* Fixed segmentation faults in DYNAMIC_ARCH builds
42+
43+
MIPS:
44+
* Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA
45+
* Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV
46+
* Added handling of zero increments in the MSA kernels for SSWAP and DSWAP
47+
* Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only)
48+
49+
SPARC:
50+
* Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers
51+
252
====================================================================
353
Version 0.3.12
454
24-Oct-2020

Makefile.rule

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.12.dev
6+
VERSION = 0.3.13.dev
77

88
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
99
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

driver/others/dynamic_power.c

+2
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,10 @@ static gotoblas_t *get_coretype(void) {
5353
return &gotoblas_POWER10;
5454
#endif
5555
/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
56+
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
5657
if (__builtin_cpu_is("power10"))
5758
return &gotoblas_POWER9;
59+
#endif
5860
return NULL;
5961
}
6062

kernel/power/ccopy_microk_power10.c

+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
/***************************************************************************
2+
Copyright (c) 2020, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define HAVE_KERNEL 1
29+
30+
static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
31+
{
32+
__asm__
33+
(
34+
"lxvp 32, 0(%2) \n\t"
35+
"lxvp 34, 32(%2) \n\t"
36+
"lxvp 36, 64(%2) \n\t"
37+
"lxvp 38, 96(%2) \n\t"
38+
"lxvp 40, 128(%2) \n\t"
39+
"lxvp 42, 160(%2) \n\t"
40+
"lxvp 44, 192(%2) \n\t"
41+
"lxvp 46, 224(%2) \n\t"
42+
43+
"addi %2, %2, 256 \n\t"
44+
"addic. %1, %1, -32 \n\t"
45+
"ble two%= \n\t"
46+
47+
".align 5 \n"
48+
"one%=: \n\t"
49+
50+
"stxv 33, 0(%3) \n\t"
51+
"stxv 32, 16(%3) \n\t"
52+
"stxv 35, 32(%3) \n\t"
53+
"stxv 34, 48(%3) \n\t"
54+
"stxv 37, 64(%3) \n\t"
55+
"stxv 36, 80(%3) \n\t"
56+
"stxv 39, 96(%3) \n\t"
57+
"stxv 38, 112(%3) \n\t"
58+
"lxvp 32, 0(%2) \n\t"
59+
"lxvp 34, 32(%2) \n\t"
60+
"lxvp 36, 64(%2) \n\t"
61+
"lxvp 38, 96(%2) \n\t"
62+
63+
"stxv 41, 128(%3) \n\t"
64+
"stxv 40, 144(%3) \n\t"
65+
"stxv 43, 160(%3) \n\t"
66+
"stxv 42, 176(%3) \n\t"
67+
"stxv 45, 192(%3) \n\t"
68+
"stxv 44, 208(%3) \n\t"
69+
"stxv 47, 224(%3) \n\t"
70+
"stxv 46, 240(%3) \n\t"
71+
"lxvp 40, 128(%2) \n\t"
72+
"lxvp 42, 160(%2) \n\t"
73+
"lxvp 44, 192(%2) \n\t"
74+
"lxvp 46, 224(%2) \n\t"
75+
76+
77+
"addi %3, %3, 256 \n\t"
78+
"addi %2, %2, 256 \n\t"
79+
80+
"addic. %1, %1, -32 \n\t"
81+
"bgt one%= \n"
82+
83+
"two%=: \n\t"
84+
85+
"stxv 33, 0(%3) \n\t"
86+
"stxv 32, 16(%3) \n\t"
87+
"stxv 35, 32(%3) \n\t"
88+
"stxv 34, 48(%3) \n\t"
89+
"stxv 37, 64(%3) \n\t"
90+
"stxv 36, 80(%3) \n\t"
91+
"stxv 39, 96(%3) \n\t"
92+
"stxv 38, 112(%3) \n\t"
93+
"stxv 41, 128(%3) \n\t"
94+
"stxv 40, 144(%3) \n\t"
95+
"stxv 43, 160(%3) \n\t"
96+
"stxv 42, 176(%3) \n\t"
97+
"stxv 45, 192(%3) \n\t"
98+
"stxv 44, 208(%3) \n\t"
99+
"stxv 47, 224(%3) \n\t"
100+
"stxv 46, 240(%3) \n\t"
101+
102+
"#n=%1 x=%4=%2 y=%0=%3"
103+
:
104+
"=m" (*y),
105+
"+r" (n), // 1
106+
"+b" (x), // 2
107+
"+b" (y) // 3
108+
:
109+
"m" (*x)
110+
:
111+
"cr0",
112+
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
113+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
114+
);
115+
}

kernel/power/ccopy_power10.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828
#include "common.h"
2929

3030
#if defined(__VEC__) || defined(__ALTIVEC__)
31-
#include "copy_microk_power10.c"
31+
#include "ccopy_microk_power10.c"
3232
#endif
3333

3434
#ifndef HAVE_KERNEL
@@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
8686
if ( (inc_x == 1) && (inc_y == 1 ))
8787
{
8888

89-
BLASLONG n1 = n & -64;
89+
BLASLONG n1 = n & -32;
9090
if ( n1 > 0 )
9191
{
9292
copy_kernel(n1, x, y);

kernel/power/copy_microk_power10.c

+13-12
Original file line numberDiff line numberDiff line change
@@ -62,38 +62,39 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
6262
"one%=: \n\t"
6363

6464
"stxvp 32, 0(%3) \n\t"
65-
"lxvp 32, 0(%2) \n\t"
6665
"stxvp 34, 32(%3) \n\t"
67-
"lxvp 34, 32(%2) \n\t"
6866
"stxvp 36, 64(%3) \n\t"
69-
"lxvp 36, 64(%2) \n\t"
7067
"stxvp 38, 96(%3) \n\t"
68+
"lxvp 32, 0(%2) \n\t"
69+
"lxvp 34, 32(%2) \n\t"
70+
"lxvp 36, 64(%2) \n\t"
7171
"lxvp 38, 96(%2) \n\t"
7272

7373
"stxvp 40, 128(%3) \n\t"
74-
"lxvp 40, 128(%2) \n\t"
7574
"stxvp 42, 160(%3) \n\t"
76-
"lxvp 42, 160(%2) \n\t"
7775
"stxvp 44, 192(%3) \n\t"
78-
"lxvp 44, 192(%2) \n\t"
7976
"stxvp 46, 224(%3) \n\t"
77+
"lxvp 40, 128(%2) \n\t"
78+
"lxvp 42, 160(%2) \n\t"
79+
"lxvp 44, 192(%2) \n\t"
8080
"lxvp 46, 224(%2) \n\t"
8181

8282
"stxvp 48, 256(%3) \n\t"
83-
"lxvp 48, 256(%2) \n\t"
8483
"stxvp 50, 288(%3) \n\t"
85-
"lxvp 50, 288(%2) \n\t"
8684
"stxvp 52, 320(%3) \n\t"
87-
"lxvp 52, 320(%2) \n\t"
8885
"stxvp 54, 352(%3) \n\t"
86+
"lxvp 48, 256(%2) \n\t"
87+
"lxvp 50, 288(%2) \n\t"
88+
"lxvp 52, 320(%2) \n\t"
8989
"lxvp 54, 352(%2) \n\t"
90+
9091
"stxvp 56, 384(%3) \n\t"
91-
"lxvp 56, 384(%2) \n\t"
9292
"stxvp 58, 416(%3) \n\t"
93-
"lxvp 58, 416(%2) \n\t"
9493
"stxvp 60, 448(%3) \n\t"
95-
"lxvp 60, 448(%2) \n\t"
9694
"stxvp 62, 480(%3) \n\t"
95+
"lxvp 56, 384(%2) \n\t"
96+
"lxvp 58, 416(%2) \n\t"
97+
"lxvp 60, 448(%2) \n\t"
9798
"lxvp 62, 480(%2) \n\t"
9899

99100
"addi %3, %3, 512 \n\t"

kernel/power/dcopy_power10.c

+11-5
Original file line numberDiff line numberDiff line change
@@ -85,12 +85,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
8585

8686
if ( (inc_x == 1) && (inc_y == 1 ))
8787
{
88-
89-
BLASLONG n1 = n & -64;
90-
if ( n1 > 0 )
88+
if ( n >= 64 )
89+
{
90+
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
91+
for (i = 0; i < align; i++) {
92+
y[i] = x[i] ;
93+
}
94+
}
95+
BLASLONG n1 = (n-i) & -64;
96+
if ( n1 )
9197
{
92-
copy_kernel(n1, x, y);
93-
i=n1;
98+
copy_kernel(n1, &x[i], &y[i]);
99+
i += n1;
94100
}
95101

96102
while(i < n)

kernel/power/scopy_power10.c

+11-4
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
8686
if ( (inc_x == 1) && (inc_y == 1 ))
8787
{
8888

89-
BLASLONG n1 = n & -128;
90-
if ( n1 > 0 )
89+
if ( n >= 128 )
9190
{
92-
copy_kernel (n1, x, y);
93-
i=n1;
91+
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
92+
for (i = 0; i < align; i++) {
93+
y[i] = x[i] ;
94+
}
95+
}
96+
BLASLONG n1 = (n-i) & -128;
97+
if ( n1 )
98+
{
99+
copy_kernel(n1, &x[i], &y[i]);
100+
i += n1;
94101
}
95102

96103
while(i < n)

0 commit comments

Comments
 (0)