Skip to content

Commit 61acfb0

Browse files
authored
[BOLT] Add pre-aggregated trace support (#127125)
Traces are triplets of branch source, branch target, and fall-through end (the next branch). Traces simplify the differentiation of fall-throughs into local- and external-origin ones, which improves performance over a profile with undifferentiated fall-throughs by eliminating the profile discontinuity in call-to-continuation fall-throughs. This makes it possible to avoid converting the return profile into a call-to-continuation profile, which may introduce statistical biases. The existing format makes provisions for local- (F) and external- (f) origin fall-throughs, but the profile producer needs to know function boundaries. BOLT has that information readily available, so providing the origin branch of a fall-through is a functional replacement for the fall-through kind (f or F). This also has the effect of combining branches and fall-throughs into a single record. As traces subsume the other pre-aggregated profile kinds, BOLT may drop support for them soon. Users of the pre-aggregated profile format are advised to migrate to the trace format. Test Plan: Updated callcont-fallthru.s
1 parent 050933b commit 61acfb0

File tree

4 files changed

+91
-47
lines changed

4 files changed

+91
-47
lines changed

bolt/include/bolt/Profile/DataAggregator.h

+12-6
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ class DataAggregator : public DataReader {
9494

9595
/// Used for parsing specific pre-aggregated input files.
9696
struct AggregatedLBREntry {
97-
enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN };
97+
enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN, TRACE };
9898
Location From;
9999
Location To;
100100
uint64_t Count;
@@ -197,6 +197,10 @@ class DataAggregator : public DataReader {
197197

198198
BoltAddressTranslation *BAT{nullptr};
199199

200+
/// Whether pre-aggregated profile needs to convert branch profile into call
201+
/// to continuation fallthrough profile.
202+
bool NeedsConvertRetProfileToCallCont{false};
203+
200204
/// Update function execution profile with a recorded trace.
201205
/// A trace is a region of code executed between two LBR entries supplied in
202206
/// execution order.
@@ -268,8 +272,7 @@ class DataAggregator : public DataReader {
268272
uint64_t Mispreds);
269273

270274
/// Register a \p Branch.
271-
bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds,
272-
bool IsPreagg);
275+
bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
273276

274277
/// Register a trace between two LBR entries supplied in execution order.
275278
bool doTrace(const LBREntry &First, const LBREntry &Second,
@@ -298,7 +301,7 @@ class DataAggregator : public DataReader {
298301
ErrorOr<PerfMemSample> parseMemSample();
299302

300303
/// Parse pre-aggregated LBR samples created by an external tool
301-
ErrorOr<AggregatedLBREntry> parseAggregatedLBREntry();
304+
std::error_code parseAggregatedLBREntry();
302305

303306
/// Parse either buildid:offset or just offset, representing a location in the
304307
/// binary. Used exclusively for pre-aggregated LBR samples.
@@ -384,14 +387,15 @@ class DataAggregator : public DataReader {
384387
/// memory.
385388
///
386389
/// File format syntax:
387-
/// {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
388-
/// [<mispred_count>]
390+
/// {B|F|f|T} [<start_id>:]<start_offset> [<end_id>:]<end_offset> [<ft_end>]
391+
/// <count> [<mispred_count>]
389392
///
390393
/// B - indicates an aggregated branch
391394
/// F - an aggregated fall-through
392395
/// f - an aggregated fall-through with external origin - used to disambiguate
393396
/// between a return hitting a basic block head and a regular internal
394397
/// jump to the block
398+
/// T - an aggregated trace: branch with a fall-through (from, to, ft_end)
395399
///
396400
/// <start_id> - build id of the object containing the start address. We can
397401
/// skip it for the main binary and use "X" for an unknown object. This will
@@ -402,6 +406,8 @@ class DataAggregator : public DataReader {
402406
///
403407
/// <end_id>, <end_offset> - same for the end address.
404408
///
409+
/// <ft_end> - same for the fallthrough_end address.
410+
///
405411
/// <count> - total aggregated count of the branch, fall-through, or trace.
406412
///
407413
/// <mispred_count> - the number of times the branch was mispredicted.

bolt/lib/Profile/DataAggregator.cpp

+45-21
Original file line numberDiff line numberDiff line change
@@ -711,7 +711,7 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
711711
}
712712

713713
bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
714-
uint64_t Mispreds, bool IsPreagg) {
714+
uint64_t Mispreds) {
715715
// Returns whether \p Offset in \p Func contains a return instruction.
716716
auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) {
717717
auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
@@ -772,7 +772,8 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
772772
return false;
773773

774774
// Record call to continuation trace.
775-
if (IsPreagg && FromFunc != ToFunc && (IsReturn || IsCallCont)) {
775+
if (NeedsConvertRetProfileToCallCont && FromFunc != ToFunc &&
776+
(IsReturn || IsCallCont)) {
776777
LBREntry First{ToOrig - 1, ToOrig - 1, false};
777778
LBREntry Second{ToOrig, ToOrig, false};
778779
return doTrace(First, Second, Count);
@@ -1216,23 +1217,30 @@ ErrorOr<Location> DataAggregator::parseLocationOrOffset() {
12161217
return Location(true, BuildID.get(), Offset.get());
12171218
}
12181219

1219-
ErrorOr<DataAggregator::AggregatedLBREntry>
1220-
DataAggregator::parseAggregatedLBREntry() {
1220+
std::error_code DataAggregator::parseAggregatedLBREntry() {
12211221
while (checkAndConsumeFS()) {
12221222
}
12231223

12241224
ErrorOr<StringRef> TypeOrErr = parseString(FieldSeparator);
12251225
if (std::error_code EC = TypeOrErr.getError())
12261226
return EC;
1227+
// Pre-aggregated profile with branches and fallthroughs needs to convert
1228+
// return profile into call to continuation fall-through.
12271229
auto Type = AggregatedLBREntry::BRANCH;
12281230
if (TypeOrErr.get() == "B") {
1231+
NeedsConvertRetProfileToCallCont = true;
12291232
Type = AggregatedLBREntry::BRANCH;
12301233
} else if (TypeOrErr.get() == "F") {
1234+
NeedsConvertRetProfileToCallCont = true;
12311235
Type = AggregatedLBREntry::FT;
12321236
} else if (TypeOrErr.get() == "f") {
1237+
NeedsConvertRetProfileToCallCont = true;
12331238
Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
1239+
} else if (TypeOrErr.get() == "T") {
1240+
// Trace is expanded into B and [Ff]
1241+
Type = AggregatedLBREntry::TRACE;
12341242
} else {
1235-
reportError("expected B, F or f");
1243+
reportError("expected T, B, F or f");
12361244
return make_error_code(llvm::errc::io_error);
12371245
}
12381246

@@ -1248,6 +1256,15 @@ DataAggregator::parseAggregatedLBREntry() {
12481256
if (std::error_code EC = To.getError())
12491257
return EC;
12501258

1259+
ErrorOr<Location> TraceFtEnd = std::error_code();
1260+
if (Type == AggregatedLBREntry::TRACE) {
1261+
while (checkAndConsumeFS()) {
1262+
}
1263+
TraceFtEnd = parseLocationOrOffset();
1264+
if (std::error_code EC = TraceFtEnd.getError())
1265+
return EC;
1266+
}
1267+
12511268
while (checkAndConsumeFS()) {
12521269
}
12531270
ErrorOr<int64_t> Frequency =
@@ -1270,9 +1287,24 @@ DataAggregator::parseAggregatedLBREntry() {
12701287
return make_error_code(llvm::errc::io_error);
12711288
}
12721289

1273-
return AggregatedLBREntry{From.get(), To.get(),
1274-
static_cast<uint64_t>(Frequency.get()), Mispreds,
1275-
Type};
1290+
BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From->Offset);
1291+
BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To->Offset);
1292+
1293+
for (BinaryFunction *BF : {FromFunc, ToFunc})
1294+
if (BF)
1295+
BF->setHasProfileAvailable();
1296+
1297+
uint64_t Count = static_cast<uint64_t>(Frequency.get());
1298+
AggregatedLBREntry Entry{From.get(), To.get(), Count, Mispreds, Type};
1299+
AggregatedLBRs.emplace_back(Entry);
1300+
if (Type == AggregatedLBREntry::TRACE) {
1301+
auto FtType = (FromFunc == ToFunc) ? AggregatedLBREntry::FT
1302+
: AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
1303+
AggregatedLBREntry TraceFt{To.get(), TraceFtEnd.get(), Count, 0, FtType};
1304+
AggregatedLBRs.emplace_back(TraceFt);
1305+
}
1306+
1307+
return std::error_code();
12761308
}
12771309

12781310
bool DataAggregator::ignoreKernelInterrupt(LBREntry &LBR) const {
@@ -1585,8 +1617,7 @@ void DataAggregator::processBranchEvents() {
15851617
for (const auto &AggrLBR : BranchLBRs) {
15861618
const Trace &Loc = AggrLBR.first;
15871619
const TakenBranchInfo &Info = AggrLBR.second;
1588-
doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount,
1589-
/*IsPreagg*/ false);
1620+
doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount);
15901621
}
15911622
}
15921623

@@ -1722,18 +1753,10 @@ std::error_code DataAggregator::parsePreAggregatedLBRSamples() {
17221753
outs() << "PERF2BOLT: parsing pre-aggregated profile...\n";
17231754
NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events",
17241755
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
1725-
while (hasData()) {
1726-
ErrorOr<AggregatedLBREntry> AggrEntry = parseAggregatedLBREntry();
1727-
if (std::error_code EC = AggrEntry.getError())
1756+
while (hasData())
1757+
if (std::error_code EC = parseAggregatedLBREntry())
17281758
return EC;
17291759

1730-
for (const uint64_t Addr : {AggrEntry->From.Offset, AggrEntry->To.Offset})
1731-
if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr))
1732-
BF->setHasProfileAvailable();
1733-
1734-
AggregatedLBRs.emplace_back(std::move(AggrEntry.get()));
1735-
}
1736-
17371760
return std::error_code();
17381761
}
17391762

@@ -1746,8 +1769,9 @@ void DataAggregator::processPreAggregated() {
17461769
for (const AggregatedLBREntry &AggrEntry : AggregatedLBRs) {
17471770
switch (AggrEntry.EntryType) {
17481771
case AggregatedLBREntry::BRANCH:
1772+
case AggregatedLBREntry::TRACE:
17491773
doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count,
1750-
AggrEntry.Mispreds, /*IsPreagg*/ true);
1774+
AggrEntry.Mispreds);
17511775
break;
17521776
case AggregatedLBREntry::FT:
17531777
case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: {

bolt/test/X86/callcont-fallthru.s

+31-17
Original file line numberDiff line numberDiff line change
@@ -4,28 +4,43 @@
44
# RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
55
## Link against a DSO to ensure PLT entries.
66
# RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
7-
# RUN: link_fdata %s %t %t.pa1 PREAGG
7+
# RUN: link_fdata %s %t %t.pa1 PREAGG1
88
# RUN: link_fdata %s %t %t.pa2 PREAGG2
99
# RUN: link_fdata %s %t %t.pa3 PREAGG3
10-
# RUN: link_fdata %s %t %t.pa4 PREAGG4
10+
# RUN: link_fdata %s %t %t.pat PREAGGT1
11+
# RUN: link_fdata %s %t %t.pat2 PREAGGT2
1112

1213
## Check normal case: fallthrough is not LP or secondary entry.
13-
# RUN: llvm-strip --strip-unneeded %t -o %t.exe
14-
# RUN: llvm-bolt %t.exe --pa -p %t.pa1 -o %t.out \
14+
# RUN: llvm-strip --strip-unneeded %t -o %t.strip
15+
# RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh
16+
# RUN: llvm-bolt %t.strip --pa -p %t.pa1 -o %t.out \
1517
# RUN: --print-cfg --print-only=main | FileCheck %s
1618

1719
## Check that getFallthroughsInTrace correctly handles a trace starting at plt
1820
## call continuation
19-
# RUN: llvm-bolt %t.exe --pa -p %t.pa2 -o %t.out2 \
21+
# RUN: llvm-bolt %t.strip --pa -p %t.pa2 -o %t.out2 \
2022
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK2
2123

2224
## Check that we don't treat secondary entry points as call continuation sites.
2325
# RUN: llvm-bolt %t --pa -p %t.pa3 -o %t.out \
2426
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
2527

2628
## Check fallthrough to a landing pad case.
27-
# RUN: llvm-bolt %t.exe --pa -p %t.pa4 -o %t.out \
28-
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK4
29+
# RUN: llvm-bolt %t.strip --pa -p %t.pa3 -o %t.out \
30+
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
31+
32+
## Check pre-aggregated traces attach call continuation fallthrough count
33+
# RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \
34+
# RUN: --print-cfg --print-only=main | FileCheck %s
35+
36+
## Check pre-aggregated traces don't attach call continuation fallthrough count
37+
## to secondary entry point (unstripped)
38+
# RUN: llvm-bolt %t --pa -p %t.pat2 -o %t.out \
39+
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
40+
## Check pre-aggregated traces don't attach call continuation fallthrough count
41+
## to landing pad (stripped, LP)
42+
# RUN: llvm-bolt %t.strip --pa -p %t.pat2 -o %t.out \
43+
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
2944

3045
.globl foo
3146
.type foo, %function
@@ -51,8 +66,9 @@ main:
5166
movl %edi, -0x8(%rbp)
5267
movq %rsi, -0x10(%rbp)
5368
callq puts@PLT
54-
## Target is a call continuation
55-
# PREAGG: B X:0 #Ltmp1# 2 0
69+
## Target is an external-origin call continuation
70+
# PREAGG1: B X:0 #Ltmp1# 2 0
71+
# PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2
5672
# CHECK: callq puts@PLT
5773
# CHECK-NEXT: count: 2
5874

@@ -63,14 +79,16 @@ Ltmp1:
6379

6480
Ltmp4:
6581
cmpl $0x0, -0x14(%rbp)
82+
Ltmp4_br:
6683
je Ltmp0
6784
# CHECK2: je .Ltmp0
6885
# CHECK2-NEXT: count: 3
6986

7087
movl $0xa, -0x18(%rbp)
7188
callq foo
72-
## Target is a call continuation
73-
# PREAGG: B #Lfoo_ret# #Ltmp3# 1 0
89+
## Target is a binary-local call continuation
90+
# PREAGG1: B #Lfoo_ret# #Ltmp3# 1 0
91+
# PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
7492
# CHECK: callq foo
7593
# CHECK-NEXT: count: 1
7694

@@ -79,16 +97,12 @@ Ltmp4:
7997
# CHECK2: callq foo
8098
# CHECK2-NEXT: count: 3
8199

82-
## Target is a secondary entry point
100+
## Target is a secondary entry point (unstripped) or a landing pad (stripped)
83101
# PREAGG3: B X:0 #Ltmp3# 2 0
102+
# PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2
84103
# CHECK3: callq foo
85104
# CHECK3-NEXT: count: 0
86105

87-
## Target is a landing pad
88-
# PREAGG4: B X:0 #Ltmp3# 2 0
89-
# CHECK4: callq puts@PLT
90-
# CHECK4-NEXT: count: 0
91-
92106
Ltmp3:
93107
cmpl $0x0, -0x18(%rbp)
94108
Ltmp3_br:

bolt/test/link_fdata.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@
3434
fdata_pat = re.compile(r"([01].*) (?P<exec>\d+) (?P<mispred>\d+)")
3535

3636
# Pre-aggregated profile:
37-
# {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
38-
# [<mispred_count>]
39-
preagg_pat = re.compile(r"(?P<type>[BFf]) (?P<offsets_count>.*)")
37+
# {T|B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> [<ft_end>]
38+
# <count> [<mispred_count>]
39+
preagg_pat = re.compile(r"(?P<type>[TBFf]) (?P<offsets_count>.*)")
4040

4141
# No-LBR profile:
4242
# <is symbol?> <closest elf symbol or DSO name> <relative address> <count>

0 commit comments

Comments
 (0)