Skip to content

Commit 8c7e0e1

Browse files
committed
Auto merge of rust-lang#128200 - estebank:normalize-whitespace, r=pnkfelix
Change output normalization logic to be linear against size of output. Modify the rendered output normalization routine to scan each character *once* and construct a `String` to be printed out to the terminal *once*, instead of using `String::replace` in a loop multiple times. The output doesn't change, but the time spent preparing a diagnostic is now shorter (or rather, closer to what it was before rust-lang#127528).
2 parents c9687a9 + 51b5bb1 commit 8c7e0e1

File tree

1 file changed

+38
-30
lines changed

1 file changed

+38
-30
lines changed

compiler/rustc_errors/src/emitter.rs

+38-30
Original file line numberDiff line numberDiff line change
@@ -2564,22 +2564,13 @@ fn num_decimal_digits(num: usize) -> usize {
25642564

25652565
// We replace some characters so the CLI output is always consistent and underlines aligned.
25662566
// Keep the following list in sync with `rustc_span::char_width`.
2567+
// ATTENTION: keep lexicographically sorted so that the binary search will work
25672568
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
2568-
('\t', " "), // We do our own tab replacement
2569-
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
2570-
('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
2571-
('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
2572-
('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
2573-
('\u{202E}', "�"),
2574-
('\u{2066}', "�"),
2575-
('\u{2067}', "�"),
2576-
('\u{2068}', "�"),
2577-
('\u{202C}', "�"),
2578-
('\u{2069}', "�"),
2569+
// tidy-alphabetical-start
25792570
// In terminals without Unicode support the following will be garbled, but in *all* terminals
25802571
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
25812572
// support" gate.
2582-
('\u{0000}', "␀"),
2573+
('\0', "␀"),
25832574
('\u{0001}', "␁"),
25842575
('\u{0002}', "␂"),
25852576
('\u{0003}', "␃"),
@@ -2588,11 +2579,12 @@ const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
25882579
('\u{0006}', "␆"),
25892580
('\u{0007}', "␇"),
25902581
('\u{0008}', "␈"),
2591-
('\u{000B}', "␋"),
2592-
('\u{000C}', "␌"),
2593-
('\u{000D}', "␍"),
2594-
('\u{000E}', "␎"),
2595-
('\u{000F}', "␏"),
2582+
('\u{0009}', " "), // We do our own tab replacement
2583+
('\u{000b}', "␋"),
2584+
('\u{000c}', "␌"),
2585+
('\u{000d}', "␍"),
2586+
('\u{000e}', "␎"),
2587+
('\u{000f}', "␏"),
25962588
('\u{0010}', "␐"),
25972589
('\u{0011}', "␑"),
25982590
('\u{0012}', "␒"),
@@ -2603,21 +2595,37 @@ const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
26032595
('\u{0017}', "␗"),
26042596
('\u{0018}', "␘"),
26052597
('\u{0019}', "␙"),
2606-
('\u{001A}', "␚"),
2607-
('\u{001B}', "␛"),
2608-
('\u{001C}', "␜"),
2609-
('\u{001D}', "␝"),
2610-
('\u{001E}', "␞"),
2611-
('\u{001F}', "␟"),
2612-
('\u{007F}', "␡"),
2598+
('\u{001a}', "␚"),
2599+
('\u{001b}', "␛"),
2600+
('\u{001c}', "␜"),
2601+
('\u{001d}', "␝"),
2602+
('\u{001e}', "␞"),
2603+
('\u{001f}', "␟"),
2604+
('\u{007f}', "␡"),
2605+
('\u{200d}', ""), // Replace ZWJ for consistent terminal output of grapheme clusters.
2606+
('\u{202a}', "�"), // The following unicode text flow control characters are inconsistently
2607+
('\u{202b}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
2608+
('\u{202c}', "�"), // not corresponding to the visible source code, so we replace them always.
2609+
('\u{202d}', "�"),
2610+
('\u{202e}', "�"),
2611+
('\u{2066}', "�"),
2612+
('\u{2067}', "�"),
2613+
('\u{2068}', "�"),
2614+
('\u{2069}', "�"),
2615+
// tidy-alphabetical-end
26132616
];
26142617

2615-
fn normalize_whitespace(str: &str) -> String {
2616-
let mut s = str.to_string();
2617-
for (c, replacement) in OUTPUT_REPLACEMENTS {
2618-
s = s.replace(*c, replacement);
2619-
}
2620-
s
2618+
fn normalize_whitespace(s: &str) -> String {
2619+
// Scan the input string for a character in the ordered table above. If it's present, replace
2620+
// it with its alternative string (it can be more than 1 char!). Otherwise, retain the input
2621+
// char. At the end, allocate all chars into a string in one operation.
2622+
s.chars().fold(String::with_capacity(s.len()), |mut s, c| {
2623+
match OUTPUT_REPLACEMENTS.binary_search_by_key(&c, |(k, _)| *k) {
2624+
Ok(i) => s.push_str(OUTPUT_REPLACEMENTS[i].1),
2625+
_ => s.push(c),
2626+
}
2627+
s
2628+
})
26212629
}
26222630

26232631
fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) {

0 commit comments

Comments
 (0)