/***
 * Copyright 2014-2015 by Gabriel Parmer.  All rights reserved.
 *
 * Redistribution of this file is permitted under the GNU General
 * Public License v2.
 *
 * Authors: Qi Wang, interwq@gmail.com, Gabriel Parmer, gparmer@gwu.edu, 2015
 *
 * History:
 * - Started as parsec.c and parsec.h by Qi.
 */
/***
 * A Scalable Memory Reclamation (SMR) technique built on top of the
 * slab allocator for parsec (parallel sections).  It maintains a
 * freelist per slab, with memory items ordered by the Time Stamp
 * Counter (tsc) value taken when each node was freed.  Removal from
 * these queues is governed by the quiescence of the parallel threads
 * that were active at the time the memory was freed (which might be
 * some time in the past).  This code specifies the policy for when
 * memory flows between the quiescence queues and the slab memory.
 * Moving memory back to the slabs is important to enable us to
 * reclaim and migrate memory between cores (each slab is owned by a
 * core), thus there is some balancing to be done here.
 */
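/*
 * A hedged sketch of one object's lifecycle under this scheme (the flow
 * is a summary of the description above; the function names are the
 * ones declared or generated in this header):
 *
 *   ps_mem_free_<type>(buf)
 *     -> __ps_smr_free(): record the free-time tsc from
 *        ps_tsc_locality() in the item's header, then
 *        __ps_qsc_enqueue() the item onto the current core's
 *        quiescence list
 *   ...time passes; cores pass through ps_enter()/ps_exit()...
 *   __ps_smr_reclaim()
 *     -> items whose free-time tsc precedes the quiescence point of all
 *        cores are handed back to the slab layer (ffn), which can
 *        return them to their home core's slab for reuse
 */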
#ifndef PS_SMR_H
#define PS_SMR_H

#include <ps_global.h>
#include <ps_slab.h>

#ifndef PS_QLIST_BATCH
#define PS_QLIST_BATCH 128
#endif
struct ps_quiescence_timing {
        volatile ps_tsc_t time_in, time_out;
#ifdef OPTIMAL
        volatile ps_tsc_t last_known_quiescence;
        char __padding[PS_CACHE_PAD - 3*sizeof(ps_tsc_t)];
#else
        char __padding[PS_CACHE_PAD - 2*sizeof(ps_tsc_t)];
#endif
} PS_ALIGNED PS_PACKED;
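/*
 * Sizing note (a hedged example; the actual numbers depend on how
 * ps_global.h defines PS_CACHE_PAD and ps_tsc_t): __padding rounds the
 * packed structure up to PS_CACHE_PAD bytes so that each core's timing
 * entry sits in its own padded region and updates to time_in/time_out
 * do not false-share with other cores.  For instance, with an 8-byte
 * ps_tsc_t and a 128-byte PS_CACHE_PAD, the non-OPTIMAL padding is
 * 128 - 2*8 = 112 bytes.
 */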
struct __ps_other_core {
        ps_tsc_t time_in, time_out, time_updated;
};
struct ps_smr_percore {
        /* ps_quiescence_timing info of this CPU */
        struct ps_quiescence_timing timing;
        /* ps_tsc_t period, deadline; */
        /* #ifdef OPTIMAL */
        /* /\* ps_quiescence_timing info of other CPUs known by this CPU *\/ */
        /* struct __ps_other_core timing_others[PS_NUMCORES]; */
        /* /\* padding an additional cacheline for prefetching *\/ */
        /* char __padding[PS_CACHE_PAD - (((sizeof(struct __ps_other_core)*PS_NUMCORES)+sizeof(struct ps_quiescence_timing)+2*sizeof(ps_tsc_t)) % PS_CACHE_LINE)]; */
        /* #else */
        /* char __padding[PS_CACHE_PAD - (2*sizeof(ps_tsc_t) % PS_CACHE_LINE)]; */
        /* #endif */
} PS_ALIGNED PS_PACKED;

struct parsec {
        int refcnt;
        struct ps_smr_percore timing_info[PS_NUMCORES] PS_ALIGNED;
} PS_ALIGNED;
int  ps_quiesce_wait(struct parsec *p, ps_tsc_t tsc, ps_tsc_t *qsc);
int  ps_try_quiesce(struct parsec *p, ps_tsc_t tsc, ps_tsc_t *qsc);

void ps_init(struct parsec *ps);
void ps_init_period(struct parsec *ps, ps_tsc_t p);
struct parsec *ps_alloc(void);

void __ps_smr_reclaim(coreid_t curr, struct ps_qsc_list *ql, struct ps_smr_info *si, struct ps_mem *mem, ps_free_fn_t ffn);
void __ps_memptr_init(struct ps_mem *m, struct parsec *ps);
int  __ps_memptr_delete(struct ps_mem *m);
static inline void
__ps_smr_free(void *buf, struct ps_mem *mem, ps_free_fn_t ffn)
{
        struct ps_mheader  *m = __ps_mhead_get(buf);
        struct ps_smr_info *si;
        struct ps_qsc_list *ql;
        coreid_t curr_core, curr_numa;
        ps_tsc_t tsc;

        /* this is 85% of the cost of the function... */
        tsc = ps_tsc_locality(&curr_core, &curr_numa);

        si = &mem->percore[curr_core].smr_info;
        ql = &si->qsc_list;

        /*
         * Note: we currently enqueue remotely freed memory into the
         * qlist of the core the memory is freed on, later to be moved
         * to its native core by the remote free logic within the slab
         * allocator.  This might cause some cache coherency traffic
         * that we wouldn't otherwise have due to qlist operations
         * (i.e. writing to the ->next field within the header), but
         * has the large benefit that we don't have to complicate the
         * free-time ordering of memory chunks in the quiescence list.
         */
        __ps_mhead_setfree(m, tsc);
        __ps_qsc_enqueue(ql, m);
        si->qmemcnt++;
        if (si->qmemcnt > si->qmemmax) si->qmemmax = si->qmemcnt;
#ifdef OPTIMAL
        if (unlikely(si->qmemcnt >= si->qmemtarget)) __ps_smr_reclaim(curr_core, ql, si, mem, ffn);
#endif
}
extern __thread ps_tsc_t _ps_period, _ps_deadline;

static inline void
__ps_quiesce(struct ps_mem *mem, ps_free_fn_t ffn)
{
#ifdef OPTIMAL
        /* With OPTIMAL, reclamation is instead triggered from __ps_smr_free. */
        return;
#endif
        struct ps_smr_info    *si;
        struct ps_qsc_list    *ql;
        struct ps_smr_percore *ti;
        coreid_t curr, curr_numa;
        ps_tsc_t tsc;

        tsc = ps_tsc_locality(&curr, &curr_numa);
        si  = &mem->percore[curr].smr_info;
        ql  = &si->qsc_list;
        ti  = &si->ps->timing_info[curr];

        /* if (tsc >= ti->deadline) { */
        /* 	ti->deadline = tsc + ti->period; */
        if (tsc >= _ps_deadline) {
                _ps_deadline = tsc + _ps_period;
                __ps_smr_reclaim(curr, ql, si, mem, ffn);
        }

        return;
}
static inline void
ps_enter(struct parsec *parsec)
{
        coreid_t curr_cpu, curr_numa;
        ps_tsc_t curr_time;
        struct ps_quiescence_timing *timing;

        curr_time = ps_tsc_locality(&curr_cpu, &curr_numa);

        timing = &(parsec->timing_info[curr_cpu].timing);
        timing->time_in = curr_time;
        /*
         * The following is needed when we have coarse granularity
         * time-stamps (i.e. non-cycle granularity, which means we
         * could have same time-stamp for different events).
         */
        /* timing->time_out = curr_time - 1; */

        ps_mem_fence();

        return;
}
static inline void
ps_exit(struct parsec *parsec)
{
        int curr_cpu = ps_coreid();
        struct ps_quiescence_timing *timing;

        timing = &(parsec->timing_info[curr_cpu].timing);
        /*
         * Here we don't require a full memory barrier on x86 -- only
         * a compiler barrier is enough.
         */
        ps_cc_barrier();
        timing->time_out = timing->time_in + 1;

        return;
}
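/*
 * Example read-side section (a minimal sketch; struct node, shared_list,
 * and consume() are hypothetical): memory observed between ps_enter()
 * and ps_exit() is not handed back to the slabs until this section has
 * completed, so the traversal below can safely dereference nodes that
 * other cores concurrently free.
 *
 *   struct node *n;
 *
 *   ps_enter(ps);
 *   for (n = shared_list; n; n = n->next) consume(n);
 *   ps_exit(ps);
 */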
#define __PS_PARSLAB_CREATE_AFNS(name, objsz, allocsz, headoff, allocfn, freefn) \
PS_SLAB_CREATE_AFNS(name, objsz, allocsz, headoff, allocfn, freefn) \
static inline void \
__ps_parslab_free_tramp_##name(struct ps_mem *m, struct ps_slab *s, size_t sz, coreid_t c) \
{ (void)sz; ps_slabptr_free_coreid_##name(m, s, c); } \
static inline void * \
ps_memptr_alloc_##name(struct ps_mem *m) \
{ return ps_slabptr_alloc_##name(m); } \
static inline void * \
ps_mem_alloc_##name(void) \
{ return ps_slab_alloc_##name(); } \
static inline void \
ps_memptr_free_##name(struct ps_mem *m, void *buf) \
{ __ps_smr_free(buf, m, __ps_parslab_free_tramp_##name); } \
static inline void \
ps_mem_free_##name(void *buf) \
{ ps_memptr_free_##name(&__ps_mem_##name, buf); } \
static void \
ps_memptr_init_##name(struct ps_mem *m, struct parsec *ps) \
{ __ps_memptr_init(m, ps); } \
static inline void \
ps_mem_init_##name(struct parsec *ps) \
{ \
        ps_slabptr_init_##name(&__ps_mem_##name); \
        ps_memptr_init_##name(&__ps_mem_##name, ps); \
} \
static inline struct ps_mem * \
ps_memptr_create_##name(struct parsec *ps) \
{ \
        struct ps_mem *m = ps_slabptr_create_##name(); \
        if (!m) return NULL; \
        ps_memptr_init_##name(m, ps); \
        return m; \
} \
static inline int \
ps_memptr_delete_##name(struct ps_mem *m) \
{ \
        if (__ps_memptr_delete(m)) return -1; \
        ps_slabptr_delete_##name(m); \
        return 0; \
} \
static inline int \
ps_mem_delete_##name(void) \
{ return ps_memptr_delete_##name(&__ps_mem_##name); } \
static inline void \
ps_quiesce_##name(void) \
{ __ps_quiesce(&__ps_mem_##name, __ps_parslab_free_tramp_##name); }
#define PS_PARSLAB_CREATE_AFNS(name, objsz, allocsz, allocfn, freefn) \
        __PS_PARSLAB_CREATE_AFNS(name, objsz, allocsz, sizeof(struct ps_slab), allocfn, freefn)

#define PS_PARSLAB_CREATE(name, objsz, allocsz) \
        PS_PARSLAB_CREATE_AFNS(name, objsz, allocsz, ps_slab_defalloc, ps_slab_deffree)
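/*
 * Example instantiation (a hedged sketch, not part of this header): the
 * type "node", its layout, and the allocation size are illustrative
 * assumptions, and PS_PAGE_SIZE stands in for whatever per-slab
 * allocation size the client chooses.  PS_PARSLAB_CREATE generates the
 * ps_mem_*_node() and ps_quiesce_node() functions used in example().
 * ps_mem_free_node() only defers the memory; it is handed back to the
 * slab once all cores have quiesced past the free, e.g. from a later
 * ps_quiesce_node() call.
 *
 *   struct node { int key; struct node *next; };
 *   PS_PARSLAB_CREATE(node, sizeof(struct node), PS_PAGE_SIZE)
 *
 *   void
 *   example(struct parsec *ps)
 *   {
 *           struct node *n;
 *
 *           ps_init(ps);
 *           ps_mem_init_node(ps);
 *
 *           ps_enter(ps);
 *           n = ps_mem_alloc_node();
 *           ps_mem_free_node(n);
 *           ps_exit(ps);
 *
 *           ps_quiesce_node();
 *   }
 */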
#endif /* PS_SMR_H */