Testing: Correct perf parsing of C++ output

marshallward · adcroft · commit 6272bbcdc670 · 2024-05-15T14:16:38.000-04:00
This patch fixes errors in the parser of perf output.  Previously,
each record was assumed to be separated by spaces, but this failed for
more generic records (usually from C++) which included signatures (such
as `f(a, b)`) or templates (`f<a, b>`).  Nested constructs were also
possible.

This is fixed by introducing a simple tokenizer which extracts <, >, (, ),
and whitespace from the output, then rebuilds the records by combining any
whitespace which appears inside of delimiters.

This patch should hopefully resolve the CI errors in GitHub Actions.
diff --git a/.testing/tools/parse_perf.py b/.testing/tools/parse_perf.py
@@ -3,10 +3,20 @@
 import collections
 import json
 import os
+import re
 import shlex
 import subprocess
 import sys
 
+# Tokenizer for perf output lines; every matched token is returned verbatim.
+perf_scanner = re.Scanner([
+  (pattern, lambda scanner, token: token)
+  for pattern in (
+    r'<', r'>', r'\(', r'\)',  # single angle brackets and parentheses
+    r'[ \t]+',                 # run of spaces/tabs
+    r'[^<>() \t]+',            # run of any other characters
+  )])
+
 
 def main():
     desc = 'Parse perf.data and return in JSON format.'
@@ -58,15 +68,55 @@ def parse_perf_report(perf_data_path):
 
             # get per-symbol count
             else:
+                tokens, remainder = perf_scanner.scan(line)
+                if remainder:  # part of the line matched no scanner pattern
+                    print('Line could not be tokenized', file=sys.stderr)
+                    print(' line:', repr(line), file=sys.stderr)
+                    print(' tokens:', tokens, file=sys.stderr)
+                    print(' remainder:', remainder, file=sys.stderr)
+                    sys.exit(os.EX_DATAERR)  # abort: input data is malformed
+
+                # Construct records from tokens: whitespace separates records
+                # unless it is inside <...> or (...). (Bracket counting only.)
+                record = []
+                bracks = 0
+                parens = 0
+
+                for tok in tokens:
+                    rec = record[-1] if record else None
+
+                    inside_bracket = rec and (bracks > 0 or parens > 0)
+                    lead_rec = tok in '<(' and rec and not rec.isspace()
+                    tail_rec = not tok.isspace() and rec and rec[-1] in '>)'
+
+                    if inside_bracket or lead_rec or tail_rec:
+                        record[-1] += tok
+                    else:
+                        record.append(tok)
+
+                    # Adjust depths after placing tok, so an opener that
+                    # follows whitespace starts a new record.
+                    if tok == '<':
+                        bracks += 1
+                    elif tok == '>':
+                        bracks -= 1
+                    elif tok == '(':
+                        parens += 1
+                    elif tok == ')':
+                        parens -= 1
+
+                # Strip any whitespace tokens
+                record = [rec for rec in record if not rec.isspace()]
+
                 try:
-                    tokens = line.split()
-                    symbol = tokens[2]
-                    period = int(tokens[3])
-                except ValueError:
+                    symbol = record[2]
+                    period = int(record[3])
+                except (IndexError, ValueError):  # short record or bad int
                     print("parse_perf.py: Error extracting symbol count",
-                            file=sys.stderr)
+                          file=sys.stderr)
                     print("line:", repr(line), file=sys.stderr)
                     print("tokens:", tokens, file=sys.stderr)
+                    print("record:", record, file=sys.stderr)
                     raise
 
                 profile[event_name]['symbol'][symbol] = period