#!/usr/bin/env python3
# PYTHON_ARGCOMPLETE_OK
import argparse
import os
import subprocess
import argcomplete
import scipy.sparse as spa
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import networkx as nx
import numpy as np
from data_format import (
    Triag,
    Kernel,
    Collist,
    Tile,
    DiagInv,
    Empty,
    SynchBuffer,
    Mapping,
    CscTile,
)
from general import (
    escape,
    HARTS,
    eprint,
    wprint,
    bprint,
    dprint,
    GRAY_COLOR,
    color_palette,
    DEBUG,
    ndarrayToCH,
    ndarrayToC,
    list2array,
    list_flatten,
)
import general

np.random.seed(0)

CAT_CMD = "bat"
DEBUG = False


def workload_distribute_L(newLp, newLi, newLx):
    dtype_x = np.float64
    LpStart = []  # Offset into Lx.
    LpLenTmp = [list() for i in range(HARTS)]
    LxNewTmp = [list() for i in range(HARTS)]
    LiNewTmp = [list() for i in range(HARTS)]
    for i in range(len(newLp) - 2):
        tmp = int(newLp[i + 1] - newLp[i])
        jobs = tmp // HARTS
        leftover = tmp % HARTS
        # compute how many jobs each core has to do and append the corresponding
        # data to LxNew and LiNew.
        for core_id in range(HARTS):
            tmp = -1
            if leftover != 0 and leftover > core_id:
                tmp = jobs + 1
            else:
                tmp = jobs
            # LpStart.append( len(LxNew) if tmp > 0 else -1 )
            LpLenTmp[core_id].append(tmp)
            index = [t * HARTS + core_id for t in range(tmp)]
            halalu = []  # tmp for blocked Lx data
            for l in index:
                halalu.append(newLx[l + newLp[i]])
            LxNewTmp[core_id].extend(halalu)
            tmparr = [newLi[l + newLp[i]] for l in index]
            LiNewTmp[core_id].extend(tmparr)
    LpLen = []
    LxNew = []
    LiNew = []
    LpLenSum = []
    LiNewR = []
    for i in range(HARTS):
        LpLen.extend(LpLenTmp[i])
        LpLenSum.append(sum(LpLenTmp[i]))
        LxNew.extend(LxNewTmp[i])
        LiNew.extend(LiNewTmp[i])
        LiNewTmp[i].reverse()
        LiNewR.extend(LiNewTmp[i])
    LpStart = [0]
    for i in range(HARTS):
        LpStart.append(LpLenSum[i] + LpStart[i])
    print(
        f"Inflating Lp to LpLen increases the size from {len(newLp)} to {len(LpStart)+len(LpLen)}."
    )
    print(f"Average length in LpLen is {sum(LpLen)/len(LpLen)}")
    # convert to numpy arrays
    LpStart = list2array(LpStart, "LpStart", base=32)
    LpLenSum = list2array(LpLenSum, "LpLenSum", base=32)
    LpLen = list2array(LpLen, "LpLen", base=8)
    LiNew = list2array(LiNew, "LiNew", base=16)
    LiNewR = list2array(LiNewR, "LiNewR", base=16)
    LxNew = np.array(LxNew, dtype=dtype_x)
    return (LpStart, LpLen, LpLenSum, LxNew, LiNew, LiNewR)
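

# A minimal, hypothetical usage sketch (not part of the original flow and not
# called anywhere): feed a small strictly lower-triangular L in CSC form to the
# distributor. Each column's entries are dealt out round-robin, so hart h
# receives elements h, h + HARTS, h + 2*HARTS, ... of every column; LpLen then
# holds the per-column counts of every hart and LpStart the per-hart offsets
# into LxNew. Note the loop above runs over len(Lp) - 2 columns, i.e. with a
# standard (n+1)-entry Lp the (empty) last column of L is skipped.
def _example_workload_distribute_L():
    Lp = [0, 3, 5, 6, 6, 6]  # 5x5 L, unit diagonal not stored
    Li = [1, 2, 3, 2, 3, 3]  # col 0 -> rows {1,2,3}, col 1 -> {2,3}, col 2 -> {3}
    Lx = [0.5, 0.25, 0.125, 0.5, 0.25, 0.5]
    return workload_distribute_L(Lp, Li, Lx)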


class SubBlock:
    """Iterator class for subparts of the L matrix."""

    def __init__(self, Lp, Li, Lx, start, stop):
        assert stop > start
        assert start >= 0
        assert len(Lp) > stop
        self.start = start
        self.stop = stop
        self.Lx = Lx
        self.Li = Li
        self.Lp = Lp
        self.cc = start  # current column

    def __iter__(self):
        return self

    def __next__(self):
        if not (self.stop > self.cc):
            raise StopIteration
        start = self.Lp[self.cc]
        end = self.Lp[self.cc + 1]
        self.cc += 1

        def filter_fun(rowx):
            row, _ = rowx
            return row < self.stop

        coliter = filter(filter_fun, zip(self.Li[start:end], self.Lx[start:end]))
        return self.cc - 1, coliter


def sub_nnz(subiter):
    assert type(subiter) is SubBlock
    assert subiter.cc == subiter.start
    nnz = 0
    for col, coliter in subiter:
        nnz += sum(1 for _ in coliter)
    subiter.cc = subiter.start  # reset iterator (otherwise we would use it up!)
    return nnz


def sub_dense_size(subiter):
    dim = subiter.stop - subiter.start
    return dim * (dim - 1) / 2


def sub_dens(subiter):
    """Compute density of a triangular block in L."""
    density = sub_nnz(subiter) / sub_dense_size(subiter)
    return density
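

# A small sketch of the SubBlock helpers (hypothetical toy data): sub_dens
# relates the nnz inside the triangular block [start, stop) to its dense
# strictly-lower-triangular capacity dim * (dim - 1) / 2.
def _example_sub_dens():
    Lp = [0, 2, 3, 3, 3]  # column pointers of a 4x4 pattern
    Li = [1, 3, 2]        # col 0 -> rows {1, 3}, col 1 -> {2}
    Lx = [1.0, 2.0, 3.0]
    blk = SubBlock(Lp, Li, Lx, 0, 4)
    assert sub_nnz(blk) == 3
    assert sub_dense_size(blk) == 6  # 4 * 3 / 2
    assert sub_dens(blk) == 0.5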


def find_optimal_cuts(L, start, stop, depth=0, band=None, minsize=15):
    DENSITY_THRESHOLD = 0.95
    # 80% fill-in is acceptable if the triangle spans at most 1/12 of the matrix.
    # The fundamental idea: the smaller the triangle, the less we care about its
    # density, so only the density per relative size must stay below a threshold.
    DENSITY_SIZE_THRESHOLD = 0.8 / (1 / 12)
    Lp, Li, Lx, n = L.Lp, L.Li, L.Lx, L.n
    if band is None:
        band = (stop - start) / 3
    if not (stop > start + 1 + minsize):
        print(f"FOC: stopping at minsize {minsize} for {start}-{stop}")
        return []
    if depth == 10:
        wprint(f"FOC: stopping at max recursion depth for {start}-{stop}")
        return []
    subiter = SubBlock(Lp, Li, Lx, start, stop)
    density = sub_dens(subiter)
    relsize = (stop - start) / L.n
    if density > DENSITY_THRESHOLD:
        print(
            f"FOC: stopping at {start}-{stop} for absolute density threshold {DENSITY_THRESHOLD}"
        )
        return []
    if density / relsize > DENSITY_SIZE_THRESHOLD:
        print(
            f"FOC: stopping at {start}-{stop} for density per relsize {density/relsize}, threshold is {DENSITY_SIZE_THRESHOLD}"
        )
        return []

    # define the weight function based on the distance between two nodes
    def weight(a, b):
        assert b > a
        # entries further apart than the band do not constrain the cut
        if band is not None and b - a >= band:
            return 0
        return 1.0 / (b - a) ** 2

    # create a directed, weighted graph
    def subiter2weighted_graph(subiter, source, sink):
        G = nx.DiGraph()
        G.add_nodes_from(range(sink, source + 1))
        edges = []
        for col, coliter in subiter:
            for row, val in coliter:
                w = weight(col, row)
                if w != 0:
                    edges.append((row, col, w))
        G.add_weighted_edges_from(edges, weight="capacity")
        return G

    # create weighted graph
    source, sink = stop - 1, start
    G = subiter2weighted_graph(subiter, source, sink)
    # calculate min-cut
    cut_value, partition = nx.minimum_cut(
        G, source, sink, flow_func=nx.algorithms.flow.edmonds_karp
    )
    # print(f'- depth {depth}, start {start}, stop {stop}')
    # print(partition)
    # extract cuts
    cuts = set()
    part0, part1 = list(partition[0]), list(partition[1])
    part0.sort()
    part1.sort()
    for part in [part0, part1]:
        if part[0] != start:
            cuts.add(part[0])
        for i, p in enumerate(part[1:]):
            if part[i] + 1 != p:
                cuts.add(p)
    cutstmp = sorted(list(cuts))
    # from all the cuts choose the one closest to the middle
    diffmid = [abs(c - start - (stop - start) / 2) for c in cutstmp]
    cut = cutstmp[np.argmin(diffmid)]
    print(
        f"FOC: from min-cut-set {list(cutstmp)} selecting cut {cut} in between {start}-{stop}"
    )
    # check if the cut is worth it. We divide smat into 3 parts:
    ## ---------------
    ## | lup  |      |
    ## ---------------
    ## | rect | lsub |
    ## ---------------
    smat = SubBlock(Lp, Li, Lx, start, stop)
    smatnnz = sub_nnz(smat)
    smatfull = sub_dense_size(smat)
    iterlup = SubBlock(Lp, Li, Lx, start, cut)
    lupnnz = sub_nnz(iterlup)
    lupfull = sub_dense_size(iterlup)
    itersub = SubBlock(Lp, Li, Lx, cut, stop)
    lsubnnz = sub_nnz(itersub)
    lsubfull = sub_dense_size(itersub)
    rectnnz = smatnnz - lupnnz - lsubnnz
    rectfull = (cut - start) * (stop - cut)
    continue_cutting = False
    # EITHER the cut removes at least 5% of nnz() in smat
    if (rectfull - rectnnz) / smatfull > 0.05:
        continue_cutting = True
    # OR rect and (lsub or lup) is empty
    if rectnnz == 0 and (lupnnz == 0 or lsubnnz == 0):
        continue_cutting = True
    # UNLESS
    if not continue_cutting:
        wprint(
            f"FOC: stopping to dissect {start}-{stop} because of a useless cut at {cut}, even though the DENSITY_SIZE termination criterion is not reached."
        )
        return []
    # TODO: if a DAG-based cut does not help AND we still miss the DENSITY_SIZE
    # threshold, consider using a binary search,
    # or just cut in the middle (or better, maybe at the top 1/3)
    # recursively cut apart upper and lower
    cutsu = find_optimal_cuts(L, start, cut, depth=depth + 1)
    cutsl = find_optimal_cuts(L, cut, stop, depth=depth + 1)
    # merge all cuts
    cuts = [*cutsu, cut, *cutsl]
    return cuts


def verify_cuts(cuts, n):
    cuts.sort()
    assert cuts[0] == 0
    assert cuts[-1] == n
    for i in range(len(cuts) - 1):
        assert cuts[i] < cuts[i + 1]
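

# Illustration (hypothetical values): a valid cut list partitions [0, n] with
# strictly increasing boundaries that always include 0 and n itself.
def _example_verify_cuts():
    verify_cuts([0, 4, 9, 16], 16)  # fine: diagonal blocks 0-3, 4-8, 9-15
    # verify_cuts([4, 9, 16], 16)   # would fail: the leading 0 is missing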


def tile_L(L, cuts):
    """Cut apart L and return the matrix representation of tiles."""
    # verify data
    n = L.n
    verify_cuts(cuts, n)
    numcuts = len(cuts) - 1
    # tiles do not necessarily have to be ordered
    tiles = [[None for j in range(i + 1)] for i in range(numcuts)]

    def tile_from_index(row, col):
        """Get index of tile based on index of matrix."""
        r = 0
        c = 0
        while row >= cuts[r + 1]:
            r += 1
        while col >= cuts[c + 1]:
            c += 1
        # print(f'row,col {row},{col} are found in tile {r},{c}')
        return tiles[r][c]

    # compute cuts
    for i in range(numcuts):
        for j in range(i + 1):
            tiles[i][j] = Collist(cuts[i], cuts[i + 1] - 1, cuts[j], cuts[j + 1] - 1)
    # distribute data
    for col in range(n):
        for i in range(L.Lp[col], L.Lp[col + 1]):
            row = L.Li[i]
            tile = tile_from_index(row, col)
            tile.insert(row, col, L.Lx[i])
    return tiles
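

# A hypothetical end-to-end tiling sketch: wrap toy CSC arrays in a simple
# namespace (tile_L only needs the attributes n, Lp, Li, Lx) and cut the
# system in two. The result is a lower-triangular matrix of Collist tiles;
# with cuts [0, 2, 4], tiles[1][0] collects all entries with row >= 2, col < 2.
def _example_tile_L():
    from types import SimpleNamespace
    L = SimpleNamespace(n=4, Lp=[0, 2, 3, 3, 3], Li=[1, 3, 2], Lx=[1.0, 2.0, 3.0])
    return tile_L(L, [0, 2, 4])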


def assign_kernel_to_tile(tiles):
    print()
    bprint("\nAssigning Kernels to tiles:")
    numcuts = len(tiles)

    def collist_is_mapping(cl):
        assert isinstance(cl, Collist)
        rows = set()  # set keeping track of which rows are already occupied
        for col, (Li, Lx) in cl.items():
            if len(Li) > 1:  # check each column has only one element
                return False
            for el in Li:
                if el in rows:
                    return False
            # add all rows that are present in the current col to the rows set
            rows.update(Li)
        return True

    # decide on kernel for diagonal tiles
    for i in range(numcuts):
        triag = tiles[i][i]
        assert isinstance(triag, Tile)
        if triag.nnz() == 0:
            tiles[i][i] = Empty(0, 0, 0, 0)
        elif collist_is_mapping(triag):
            print(f"MAPPING: {triag}")
            raise NotImplementedError()
            tiles[i][i] = Mapping(triag)
        elif triag.density() > 0.8:
            print(f"DENSIFY: {triag}")
            tiles[i][i] = DiagInv(triag)
        # TODO: make non_dense, non_empty diag kernels a thing
        # elif triag.density() < 0.05:
        #     print(f"SPARSIFY: {triag}")
        #     raise NotImplementedError()
        else:
            eprint(
                f'Triag "{triag}" is neither sparse nor dense ({triag.density()*100:.1f}%). Inflating memory by inverting. Consider subcutting it.'
            )
            print(f"DENSIFY (ANYWAYS): {triag}")
            tiles[i][i] = DiagInv(triag)
            # raise NotImplementedError()
    # Decide on kernel for the rest.
    # Only assign mappings to rows where all non-diag tiles are mappings as well:
    # this is necessary for functional correctness (at least currently).
    MIN_ROW_SPAN_TO_AMORTIZE_MAPPINGS = 5
    all_mappings = {}
    for i in range(1, numcuts):
        mappings = []
        for j in range(i):
            rect = tiles[i][j]
            assert isinstance(rect, Collist)
            if rect.empty():
                continue
            is_mapping = collist_is_mapping(rect)
            is_worth_it = (rect.rowz - rect.rowa) > MIN_ROW_SPAN_TO_AMORTIZE_MAPPINGS
            if is_mapping and is_worth_it:
                mappings.append(j)
            else:
                if is_mapping and not is_worth_it:
                    print(
                        f"Ignoring Mapping {rect} due to {MIN_ROW_SPAN_TO_AMORTIZE_MAPPINGS=}."
                    )
                # revert the mappings list
                if len(mappings) != 0:
                    bprint(f"Ignoring Mapping in {mappings} due to {rect}.")
                mappings = []
                break
        for j in mappings:
            rect = tiles[i][j]
            print(f"MAPPING: {rect}")
            tiles[i][j] = Mapping(rect)
    if DEBUG:
        # DEBUG print tiles
        for til in tiles:
            for t in til:
                print(f"{t}: {t.nnz()} nnz")
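

# What collist_is_mapping above accepts (illustrative): a tile whose columns
# each hold at most one entry and whose row indices are all distinct, e.g.
#     cols:  0  1  2
#     rows:  3  5  4
# Such a tile presumably reduces to a one-FMA-per-column scatter, which is why
# it gets the dedicated Mapping kernel instead of the generic Collist one.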


def optimize_tiles(tiles, n, args):
    """
    Optimize and merge tiles.

    Do dependency-tree-based scheduling of tiles.
    Remove empty dependencies.
    Merge independent tiles on the same optimization level.

    Parameters:
        n: dimension of matrix
        tiles (list of lists): matrix of tiles

    Returns:
        tile_list (list): unordered, unstructured list of lists of tiles;
            each list corresponds to a synchronization level
    """
    # merge diagonal elements
    # TODO: DAG scheduling of subblocks
    numcuts = len(tiles)
    # Merge all Collists below each other.
    # This basically ensures that collists are merged and scheduled ASAP
    # when they are beneath each other; otherwise we want them scheduled ALAP.
    # remove empty tiles
    for c in range(numcuts - 1):
        tilecol = []
        for r in range(c + 1, numcuts):
            t = tiles[r][c]
            if t.empty():
                tiles[r][c] = None
                continue
            if isinstance(t, Collist):
                tilecol.append((t, r, c))
        if len(tilecol) == 0:
            continue
        for t, r, c in tilecol[1:]:
            dprint(f"merge {t} into {tilecol[0][0]}")
            tilecol[0][0].merge(t)
            tiles[r][c] = None
    # tighten bounds on collist tiles
    # -> reveals actual dependencies
    for til in tiles:
        for t in til:
            fun = getattr(t, "tighten_bound", None)
            if callable(fun):
                t.tighten_bound()
    # create tile_list by extracting tiles in order
    tile_list = []
    for til in tiles:
        for t in til:
            if t is not None and t.nnz() > 0:
                tile_list.append([t])
    # optimize using As Late As Possible scheduling
    if args.alap:
        tile_list = list_flatten(tile_list)
        # create dependency tree
        bprint("Building and optimizing dependency tree")
        dprint("==== Tiles ====")
        for t in tile_list:
            dprint(t)

        class Node:
            def __init__(self, tile):
                self.tile = tile
                self.child = set()
                self.parent = set()
                self.level = 0

            def add_dependency(self, other):
                if self is other:
                    return
                if self.is_child_of(other):
                    self.parent.add(other)
                if self.is_parent_of(other):
                    self.child.add(other)

            def is_parent_of(self, other):
                if self.tile.rowa > other.tile.colz:
                    return False
                if self.tile.rowz < other.tile.cola:
                    return False
                return True

            def is_child_of(self, other):
                return other.is_parent_of(self)

            def __repr__(self):
                ch = ""
                ph = ""
                for c in self.child:
                    ch += str(c.tile) + ", "
                for p in self.parent:
                    ph += str(p.tile) + ", "
                return f"Node{self.tile} {self.level} -> child:[{ch}] parent:[{ph}]"
                # return f'Node{self.tile} {self.level} '

        tree = [Node(t) for t in tile_list]
        # build dependency tree
        for s in tree:
            for o in tree:
                s.add_dependency(o)
        dprint("==== Tree ====")
        for t in tree:
            dprint(t)

        def reset_level(tree):
            for t in tree:
                t.level = 0

        # show levels of the DAG tree
        def asap_level(tree):
            def ASAP_level(node):
                maxlevel = node.level
                for c in node.child:
                    c.level = max(node.level + 1, c.level)
                    maxlevel = max(maxlevel, ASAP_level(c))
                return maxlevel

            reset_level(tree)
            maxlevel = 0
            for root in tree:
                if len(root.parent) == 0:
                    newmax = ASAP_level(root)
                    maxlevel = max(maxlevel, newmax)
            level_list = [[] for i in range(maxlevel + 1)]
            for node in tree:
                level_list[node.level].append(node)
            return level_list

        def alap_level(tree):
            def ALAP_level(node):
                minlevel = node.level
                for p in node.parent:
                    p.level = min(node.level - 1, p.level)
                    minlevel = min(minlevel, ALAP_level(p))
                return minlevel

            reset_level(tree)
            minlevel = 0
            for leave in tree:
                if len(leave.child) == 0:
                    minlevel = min(minlevel, ALAP_level(leave))
            level_list = [[] for i in range(-minlevel + 1)]
            for node in tree:
                level_list[-minlevel + node.level].append(node)
            return level_list

        def optimize_level(levels):
            for i, l in enumerate(levels):
                cl = None
                tmp = []
                for node in l:
                    if isinstance(node.tile, Collist):
                        if cl is None:
                            cl = node
                            continue
                        else:  # merge
                            dprint(
                                f"Merging on same DAG level: \n\t\t{cl}\n\t\t {node}"
                            )
                            cl.child.update(node.child)
                            cl.parent.update(node.parent)
                            cl.tile.merge(node.tile)
                            continue
                    tmp.append(node)
                if cl is not None:
                    tmp.append(cl)
                levels[i] = tmp
            return levels

        def print_levels(levels):
            for i, l in enumerate(levels):
                print(f"level {i:2}:")
                for j, el in enumerate(l):
                    print(f'{" "*5}| ', el)

        if DEBUG:
            print("\n\n ==== ASAP DAG tree levels ====")
            levels = asap_level(tree)
            assert len(tile_list) == len(list_flatten(levels))
            print_levels(levels)
            print("\n\n ==== ALAP DAG tree levels ====")
            levels = alap_level(tree)
            assert len(tile_list) == len(list_flatten(levels))
            print_levels(levels)
        print("\n\n ==== optimized DAG tree levels ====")
        levels = alap_level(tree)
        assert len(tile_list) == len(list_flatten(levels))
        levels = optimize_level(levels)
        print_levels(levels)
        tile_list = []
        for l in levels:
            for node in l:
                tile_list.append([node.tile])
    # determine firstbp/lastbp to remove the unused reduction range
    firstbp = n
    lastbp = 0  # exclusive
    for tl in tile_list:
        for t in tl:
            if t.REDUCES:
                firstbp = min(firstbp, t.rowa)
                lastbp = max(lastbp, t.rowz + 1)
    # assign reduction range from [self.rowa to self.reduce_stop)
    stop = lastbp
    for t in reversed(tile_list):
        assert len(t) == 1
        t = t[0]
        if t.REDUCES:
            t.reduce_stop = stop
            if stop <= t.rowa:
                wprint(f"{t} should be mergeable into another Collist.")
            stop = min(t.rowa, stop)
    return tile_list, firstbp, lastbp
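

# A standalone sketch of the ASAP/ALAP levelling used above (same idea,
# rephrased on a toy DAG with plain dicts; the code above instead recurses
# from roots/leaves and normalizes the negative ALAP levels). On the chain
# a -> b plus the independent node c, ASAP puts c on level 0 next to a,
# while ALAP pushes c down next to b, giving the per-level merging pass
# more tiles to pack together.
def _example_asap_vs_alap():
    children = {"a": ["b"], "b": [], "c": []}
    parents = {"a": [], "b": ["a"], "c": []}
    asap = {n: 0 for n in children}
    for n in ["a", "b", "c"]:  # topological order
        for c in children[n]:
            asap[c] = max(asap[c], asap[n] + 1)
    depth = max(asap.values()) + 1
    alap = {n: depth - 1 for n in children}
    for n in ["b", "a", "c"]:  # reverse topological order
        for p in parents[n]:
            alap[p] = min(alap[p], alap[n] - 1)
    assert asap == {"a": 0, "b": 1, "c": 0}
    assert alap == {"a": 0, "b": 1, "c": 1}
    return asap, alap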


def schedule_to_workers(tile_list):
    """Schedule triangles and rectangles onto processor cores.

    Parameters:
        tile_list (list(list)): A list of synchronization steps. Each step can
            contain multiple tiles in a list, to represent multiple
            parallelizable workloads.

    Returns:
        schedule_fe (list(tuple)): A list of synchronization steps. Each step
            is a tuple of WORKER elements. Each element defines the work to do
            for that specific processing core.
        schedule_bs (list(tuple)): Same, but for the Backward Substitution.
    """
    schedule_fe, schedule_bs = [], []
    for work in tile_list:
        if len(work) > 1:
            raise NotImplementedError(
                "Multiple tiles per synch step, i.e. inter-kernel workload balancing, is unimplemented"
            )
        tile = work[0]
        dprint(f"Scheduling {tile}")
        if tile.REDUCES:
            tile.schedule_reduction()
        # distribute work while balancing the load
        dist = tile.schedule()
        # add dummy synch steps if some kernels synch internally as well
        # FE
        for i in range(len(dist)):
            if dist[i].assigned_data() == 0:
                if not isinstance(dist[i], Collist):
                    dist[i] = Empty(0, 0, 0, 0)
        schedule_fe.append(tuple(dist))
        maxsnum = max([d.snum_fe() for d in dist])
        for i in range(maxsnum):
            buffer_dist = []
            for h in range(HARTS):
                if dist[h].snum_fe() > i:
                    buffer_dist.append(SynchBuffer(0, 0, 0, 0))
                else:
                    buffer_dist.append(Empty(0, 0, 0, 0))
            schedule_fe.append(tuple(buffer_dist))
        # BS
        # In FE we want to keep empty collist items: they are still of value.
        # In BS we do not need these, so purge dist of empty items:
        for i in range(len(dist)):
            if dist[i].assigned_data() == 0:
                dist[i] = Empty(0, 0, 0, 0)
        maxsnum = max([d.snum_bs() for d in dist])
        for i in range(maxsnum):
            buffer_dist = []
            for h in range(HARTS):
                if dist[h].snum_bs() > i:
                    buffer_dist.append(SynchBuffer(0, 0, 0, 0))
                else:
                    buffer_dist.append(Empty(0, 0, 0, 0))
            schedule_bs.append(tuple(buffer_dist))
        # add in reverse order because we reverse later
        schedule_bs.append(tuple(dist))
    # reverse
    schedule_bs.reverse()
    return schedule_fe, schedule_bs
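

# Shape of the result (illustrative, e.g. with HARTS == 8): every entry of
# schedule_fe/schedule_bs is one 8-tuple holding one work item per hart,
#     (Collist, Collist, ..., Empty)       <- synch step s
#     (SynchBuffer, Empty, ..., Empty)     <- dummy step padding step s
# The dummy SynchBuffer steps pad harts whose kernels synchronize internally,
# so that all harts always execute the same global number of synchronizations.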


def print_schedule(schedule, s_offset=0):
    # print('\n########## SCHEDULING ##########')
    for synch, step in enumerate(schedule):
        print(f"synch. step {synch+s_offset}:")
        for hart, work in enumerate(step):
            if work.assigned_data() == 0:
                print(f" H{hart} {work}: ")
            else:
                print(f" H{hart} {work}:\t {work.assigned_data()} assigned elements")
        print()


def codegenSolver(args, schedule_fe, schedule_bs, bp_sync):
    synchsteps_fe = len(schedule_fe)
    synchsteps_bs = len(schedule_bs)
    direc = f"{args.wd}/build/{args.test}"
    if not os.path.exists(direc):
        os.makedirs(direc)
    datafile = f"{direc}/scheduled_data.h"
    bprint(f"\nCode generation into {datafile}.")
    print(f"Synchronizing write access to bp at: {bp_sync}")
    # define space for the data that defines what kernel to call,
    # what arguments to pass and what data to include
    enum = [[] for i in range(HARTS + 1)]  # defines function call to kernel
    argstruct = [[] for i in range(HARTS + 1)]  # defines passed argument in function call
    codedata = {}  # defines static data

    def generate_enum_list(for_fe=True):
        for s, dist in enumerate(schedule_fe if for_fe else schedule_bs):
            assert len(dist) == HARTS
            for h, d in enumerate(dist):
                # call tiles codegen
                (kfe, kbs), (args_fe, args_bs), dat = d.codegen(s, h)
                if for_fe:
                    kernel, args = kfe, args_fe
                else:
                    kernel, args = kbs, args_bs
                if kernel is None:
                    continue
                # process data
                codedata.update(dat)
                # process function call
                enum[h].append(kernel.name)
                # process function arguments
                if args is not None:
                    argstruct[h].append("&" + args)
            # all other harts simply synchronize
            enum[-1].append(Kernel.SYNCH.name)

    # generate data according to FE schedule
    if args.case == "solve" or args.case == "lsolve":
        print("generating schedule for FE")
        generate_enum_list(for_fe=True)
    # process diag_inv_mult kernel
    if args.case == "solve":
        print("generating schedule for inverse diagonal multiplication")
        for h in range(HARTS):
            enum[h].append(Kernel.DIAG_INV_MULT.name)
        enum[-1].append(Kernel.SYNCH.name)
    # process BS
    if args.case == "solve" or args.case == "ltsolve":
        print("generating schedule for BS")
        generate_enum_list(for_fe=False)
    # dump data to file
    with open(datafile, "w") as f:
        # enumerate definition for Kernel
        enumdef = "enum Kernel {"
        for k in Kernel:
            enumdef += f"{k.name} = {k.value}, "
        enumdef += "};\n\n"
        f.write(enumdef)
        # dump static data
        for k, v in codedata.items():
            if isinstance(v, list):
                v = list2array(v, k)
                ndarrayToC(f, k, v, const=True)
            elif isinstance(v, np.ndarray):
                ndarrayToC(f, k, v, const=True)
            elif isinstance(v, str):
                v = v.split("=")
                assert len(v) == 2
                attr = '__attribute__((aligned(8),section(".tcdm")))'
                f.write(f"const {v[0]} {attr} ={v[1]}\n")
            else:
                raise NotImplementedError(f"Unknown how to convert {type(v)} to code")
        # merge argument data
        args_coff = []
        argstruct_joined = []
        for v in argstruct:
            args_coff.append(len(argstruct_joined))
            argstruct_joined.extend(v)
        args_coff.append(len(argstruct_joined))
        # dump argument data
        name = "argstruct_coreoffset"
        # TODO: select base for argstruct_joined: currently it is uint8_t
        ndarrayToC(f, name, list2array(args_coff, name), const=True)
        attr = '__attribute__((aligned(4),section(".tcdm")))'
        f.write(f"void * const argstruct_joined [] {attr} = " + "{\n")
        for h, l in enumerate(argstruct):
            f.write(f"// HART {h}\n")
            for s in l:
                f.write(f"(void *) {s},\n")
        f.write("};\n\n")
        # merge enum data
        enum_coff = []
        enum_joined = []
        for v in enum:
            enum_coff.append(len(enum_joined))
            enum_joined.extend(v)
        enum_coff.append(len(enum_joined))
        # dump enum data
        name = "enum_coreoffset"
        # TODO: select base for enum_coreoffset: currently it is uint8_t
        ndarrayToC(f, name, list2array(enum_coff, name), const=True)
        attr = '__attribute__((aligned(4),section(".tcdm")))'
        f.write(f"const enum Kernel enum_joined [] {attr} = " + "{\n")
        for h, l in enumerate(enum):
            f.write(f"// HART {h}\n")
            for s in l:
                f.write(f"{s},\n")
        f.write("};\n\n")
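

# Rough shape of the emitted scheduled_data.h (illustrative; the actual names
# and values come from the Kernel enum and the tiles' codegen):
#     enum Kernel {SYNCH = ..., DIAG_INV_MULT = ..., };
#     const ... argstruct_coreoffset [] = { ... };
#     void * const argstruct_joined [] __attribute__((...)) = { ... };
#     const ... enum_coreoffset [] = { ... };
#     const enum Kernel enum_joined [] __attribute__((...)) = { ... };
# The coreoffset arrays delimit per-hart segments: the entries
# enum_joined[enum_coreoffset[h] : enum_coreoffset[h+1]] are the kernel
# sequence of hart h, with its arguments in the matching argstruct slice.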


def writeWorkspaceToFile(args, linsys, firstbp=0, lastbp=None, permutation=None):
    if lastbp is None:
        lastbp = linsys.n
    global Kdata
    global Ldata
    global x_gold
    direc = f"{args.wd}/build/{args.test}"
    if not os.path.exists(direc):
        os.makedirs(direc)
    workh = f"{direc}/workspace.h"
    workc = f"{direc}/workspace.c"
    goldenh = f"{direc}/golden.h"
    bprint("\nCreating workspace")
    case = "solve"
    if args.lsolve:
        case = "lsolve"
    elif args.ltsolve:
        case = "ltsolve"
    print(
        f"Dumping to {workh} and {workc}.\nIncluding golden model for **{args.case}**"
    )
    try:
        fh = open(workh, "w")
        fc = open(workc, "w")
        # includes and defines
        if args.sssr:
            fh.write("#define SSSR \n")
        if args.parspl:
            fh.write("#define PARSPL \n")
            fh.write("#define PERMUTATE \n")
            fh.write(f"#define FIRST_BP ({firstbp})\n")
            fh.write(f"#define LAST_BP ({lastbp})\n")
        elif args.solve_csc:
            fh.write("#define SOLVE_CSC \n")
            permutation = None
        elif args.psolve_csc:
            fh.write("#define PSOLVE_CSC \n")
            permutation = None
        elif args.sssr_psolve_csc:
            fh.write("#define SSSR_PSOLVE_CSC \n")
            permutation = None
        fc.write('#include "workspace.h"\n')
        fh.write("#include <stdint.h>\n")
        fh.write('#include "types.h"\n\n')
        fh.write(f"#define {args.case.upper()}\n")
        fh.write(f"#define LINSYS_N ({linsys.n})\n")
        # create golden model: M @ x_golden = bp
        # Determine the M matrix depending on the verification case
        if args.ltsolve:
            M = Ldata.transpose()
        elif args.lsolve:
            M = Ldata
        else:
            # TODO: use K directly to avoid any conversion errors,
            # but since we still have LDLT as input data
            # and do not do the decomposition ourselves, it is safer to do this instead
            # M = spa.csc_matrix((linsys.Kx,linsys.Ki,linsys.Kp),shape=(linsys.n,linsys.n))
            M = Kdata
        if args.debug and linsys.n < 20:
            print("Matrix for golden model creation:")
            print(M.toarray())
        fc.write(f"// verification of {args.case}\n")
        # b
        b = M @ x_gold
        ndarrayToCH(fc, fh, "b", b)
        # golden
        ndarrayToCH(fc, fh, "XGOLD", x_gold, section="")
        ndarrayToCH(fc, fh, "XGOLD_INV", 1 / x_gold, section="")
        # CSC format data
        if args.solve_csc or args.psolve_csc:
            Lp = list2array(linsys.Lp, "Lp", base=32)
            Li = list2array(linsys.Li, "Li", base=16)
            Lx = np.array(linsys.Lx, dtype=np.float64)
            ndarrayToCH(fc, fh, "Lp", Lp)
            ndarrayToCH(fc, fh, "Li", Li)
            ndarrayToCH(fc, fh, "Lx", Lx)
        elif args.sssr_psolve_csc:
            lldd = workload_distribute_L(linsys.Lp, linsys.Li, linsys.Lx)
            (LpStart, LpLen, LpLenSum, LxNew, LiNew, LiNewR) = lldd
            ndarrayToCH(fc, fh, "LpStart", LpStart)
            ndarrayToCH(fc, fh, "LpLen", LpLen)
            ndarrayToCH(fc, fh, "LpLenSum", LpLenSum)
            ndarrayToCH(fc, fh, "LxNew", LxNew)
            ndarrayToCH(fc, fh, "LiNew", LiNew)
            ndarrayToCH(fc, fh, "LiNewR", LiNewR)
        Dinv = 1 / np.array(linsys.D)
        # Perm
        if args.sssr:
            # originally only used for permutation but now also used for scheduling
            # any vector operation (of length total) with SSSRs.
            def lenstart_schedule(total, name, lenmo=True):
                len_perm = []
                start_perm = []
                for h in range(HARTS):
                    l = (HARTS - 1 + total - h) // HARTS
                    if lenmo:  # if length minus one
                        start_perm.append(sum(len_perm) + h)
                        len_perm.append(l - 1)  # frep/sssr length uses length - 1
                    else:
                        start_perm.append(sum(len_perm))
                        len_perm.append(l)
                ndarrayToCH(
                    fc,
                    fh,
                    f"start_{name}",
                    list2array(start_perm, f"start_{name}", base=32),
                )
                ndarrayToCH(
                    fc, fh, f"len_{name}", list2array(len_perm, f"len_{name}", base=32)
                )

            lenstart_schedule(linsys.n, "perm")
        if permutation is not None:
            perm = np.array(permutation)
            ndarrayToCH(fc, fh, "Perm", perm)
            ndarrayToCH(fc, fh, "PermT", np.argsort(perm))
            # bp, bp_copy
            RANGE = 5
            exponent = np.random.random_sample(linsys.n) * (2 * RANGE) - RANGE
            bp = np.exp(exponent)
            # bp = b[permutation]
            ndarrayToCH(fc, fh, "bp", bp)
            # Dinv
            # permT = np.argsort(perm)
            Dinv = Dinv[perm]
            ndarrayToCH(fc, fh, "Dinv", Dinv)
        else:
            # Dinv
            ndarrayToCH(fc, fh, "Dinv", Dinv)
        # solution vector x
        # TODO: setting to random is only used for verification / testing
        RANGE = 5
        exponent = np.random.random_sample(linsys.n) * (2 * RANGE) - RANGE
        x = np.exp(exponent)
        ndarrayToCH(fc, fh, "x", x)
        exponent = np.random.random_sample(linsys.n) * (2 * RANGE) - RANGE
        bp_cp = np.exp(exponent)
        ndarrayToCH(fc, fh, "bp_cp", bp_cp)
        # temporary space for intermediate results before reduction
        bp_tmp = np.zeros(HARTS * (lastbp - firstbp))
        ndarrayToCH(fc, fh, "bp_tmp", bp_tmp)
    finally:
        fh.close()
        fc.close()
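

# The golden-model contract set up above (illustrative): the firmware solves
# M @ x = b for the dumped right-hand side b and must reproduce XGOLD, i.e.
#     x = np.linalg.solve(M.toarray(), b)  # equals x_gold up to rounding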


def interactive_plot_schedule(L, schedule, cuts):
    (fig, ax, sq_dict) = L.plot(uselx=False)
    # legend
    elems = []
    for h, c in zip(range(HARTS), color_palette):
        patch = Patch(facecolor=c, edgecolor=None, label=f"hart {h}")
        elems.append(patch)
    ax.legend(handles=elems, loc="upper right")

    # Fade color
    def fade_all(sq_dict):
        grey = (0.7, 0.7, 0.7)
        for r in sq_dict:
            for v in sq_dict[r].values():
                v.set_color(grey)

    # Color according to schedule, run loop interactively
    fade_all(sq_dict)
    plt.pause(0.001)
    patchlist = []
    prompt = escape.BOLD + f"Select Schedule from 0..{len(schedule)-1}: "