Auto merge of rust-lang#15744 - pvalletbo:15395/character-byte-literals-diagnose, r=Veykril

bors · bors · commit c3873616d3d4 · 2023-10-11T10:49:30.000Z
fix: add diagnostic messages for char and byte literal errors. This PR adds error messages for the different kinds of invalid byte and character literals. Fixes rust-lang#15395
diff --git a/crates/parser/src/lexed_str.rs b/crates/parser/src/lexed_str.rs
@@ -9,8 +9,11 @@
 //! include info about comments and whitespace.
 
 use rustc_dependencies::lexer as rustc_lexer;
+
 use std::ops;
 
+use rustc_lexer::unescape::{EscapeError, Mode};
+
 use crate::{
     SyntaxKind::{self, *},
     T,
@@ -254,13 +257,28 @@ impl<'a> Converter<'a> {
             rustc_lexer::LiteralKind::Char { terminated } => {
                 if !terminated {
                     err = "Missing trailing `'` symbol to terminate the character literal";
+                } else {
+                    let text = &self.res.text[self.offset + 1..][..len - 1];
+                    let i = text.rfind('\'').unwrap();
+                    let text = &text[..i];
+                    if let Err(e) = rustc_lexer::unescape::unescape_char(text) {
+                        err = error_to_diagnostic_message(e, Mode::Char);
+                    }
                 }
                 CHAR
             }
             rustc_lexer::LiteralKind::Byte { terminated } => {
                 if !terminated {
                     err = "Missing trailing `'` symbol to terminate the byte literal";
+                } else {
+                    let text = &self.res.text[self.offset + 2..][..len - 2];
+                    let i = text.rfind('\'').unwrap();
+                    let text = &text[..i];
+                    if let Err(e) = rustc_lexer::unescape::unescape_char(text) {
+                        err = error_to_diagnostic_message(e, Mode::Byte);
+                    }
                 }
+
                 BYTE
             }
             rustc_lexer::LiteralKind::Str { terminated } => {
@@ -305,3 +323,40 @@ impl<'a> Converter<'a> {
         self.push(syntax_kind, len, err);
     }
 }
+
+fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str {
+    match error {
+        EscapeError::ZeroChars => "empty character literal",
+        EscapeError::MoreThanOneChar => "character literal may only contain one codepoint",
+        EscapeError::LoneSlash => "",
+        EscapeError::InvalidEscape if mode == Mode::Byte || mode == Mode::ByteStr => {
+            "unknown byte escape"
+        }
+        EscapeError::InvalidEscape => "unknown character escape",
+        EscapeError::BareCarriageReturn => "",
+        EscapeError::BareCarriageReturnInRawString => "",
+        EscapeError::EscapeOnlyChar if mode == Mode::Byte => "byte constant must be escaped",
+        EscapeError::EscapeOnlyChar => "character constant must be escaped",
+        EscapeError::TooShortHexEscape => "numeric character escape is too short",
+        EscapeError::InvalidCharInHexEscape => "invalid character in numeric character escape",
+        EscapeError::OutOfRangeHexEscape => "out of range hex escape",
+        EscapeError::NoBraceInUnicodeEscape => "incorrect unicode escape sequence",
+        EscapeError::InvalidCharInUnicodeEscape => "invalid character in unicode escape",
+        EscapeError::EmptyUnicodeEscape => "empty unicode escape",
+        EscapeError::UnclosedUnicodeEscape => "unterminated unicode escape",
+        EscapeError::LeadingUnderscoreUnicodeEscape => "invalid start of unicode escape",
+        EscapeError::OverlongUnicodeEscape => "overlong unicode escape",
+        EscapeError::LoneSurrogateUnicodeEscape => "invalid unicode character escape",
+        EscapeError::OutOfRangeUnicodeEscape => "invalid unicode character escape",
+        EscapeError::UnicodeEscapeInByte => "unicode escape in byte string",
+        EscapeError::NonAsciiCharInByte if mode == Mode::Byte => {
+            "non-ASCII character in byte literal"
+        }
+        EscapeError::NonAsciiCharInByte if mode == Mode::ByteStr => {
+            "non-ASCII character in byte string literal"
+        }
+        EscapeError::NonAsciiCharInByte => "non-ASCII character in raw byte string literal",
+        EscapeError::UnskippedWhitespaceWarning => "",
+        EscapeError::MultipleSkippedLinesWarning => "",
+    }
+}
diff --git a/crates/parser/test_data/lexer/err/byte_char_literals.rast b/crates/parser/test_data/lexer/err/byte_char_literals.rast
@@ -0,0 +1,92 @@
+BYTE "b''" error: empty character literal
+WHITESPACE "\n"
+BYTE "b'\\'" error: Missing trailing `'` symbol to terminate the byte literal
+WHITESPACE "\n"
+BYTE "b'\n'" error: byte constant must be escaped
+WHITESPACE "\n"
+BYTE "b'spam'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\x0ff'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\\"a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\na'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\ra'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\ta'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\\\a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\'a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\0a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\u{0}x'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\u{1F63b}}'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\v'" error: unknown byte escape
+WHITESPACE "\n"
+BYTE "b'\\💩'" error: unknown byte escape
+WHITESPACE "\n"
+BYTE "b'\\●'" error: unknown byte escape
+WHITESPACE "\n"
+BYTE "b'\\\\\\r'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\x'" error: numeric character escape is too short
+WHITESPACE "\n"
+BYTE "b'\\x0'" error: numeric character escape is too short
+WHITESPACE "\n"
+BYTE "b'\\xf'" error: numeric character escape is too short
+WHITESPACE "\n"
+BYTE "b'\\xa'" error: numeric character escape is too short
+WHITESPACE "\n"
+BYTE "b'\\xx'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+BYTE "b'\\xы'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+BYTE "b'\\x🦀'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+BYTE "b'\\xtt'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+BYTE "b'\\xff'" error: out of range hex escape
+WHITESPACE "\n"
+BYTE "b'\\xFF'" error: out of range hex escape
+WHITESPACE "\n"
+BYTE "b'\\x80'" error: out of range hex escape
+WHITESPACE "\n"
+BYTE "b'\\u'" error: incorrect unicode escape sequence
+WHITESPACE "\n"
+BYTE "b'\\u[0123]'" error: incorrect unicode escape sequence
+WHITESPACE "\n"
+BYTE "b'\\u{0x}'" error: invalid character in unicode escape
+WHITESPACE "\n"
+BYTE "b'\\u{'" error: unterminated unicode escape
+WHITESPACE "\n"
+BYTE "b'\\u{0000'" error: unterminated unicode escape
+WHITESPACE "\n"
+BYTE "b'\\u{}'" error: empty unicode escape
+WHITESPACE "\n"
+BYTE "b'\\u{_0000}'" error: invalid start of unicode escape
+WHITESPACE "\n"
+BYTE "b'\\u{0000000}'" error: overlong unicode escape
+WHITESPACE "\n"
+BYTE "b'\\u{FFFFFF}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{ffffff}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{ffffff}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{DC00}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{DDDD}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{DFFF}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{D800}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{DAAA}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{DBFF}'" error: invalid unicode character escape
+WHITESPACE "\n"
diff --git a/crates/parser/test_data/lexer/err/byte_char_literals.rs b/crates/parser/test_data/lexer/err/byte_char_literals.rs
@@ -0,0 +1,47 @@
+b''
+b'\'
+b'
+'
+b'spam'
+b'\x0ff'
+b'\"a'
+b'\na'
+b'\ra'
+b'\ta'
+b'\\a'
+b'\'a'
+b'\0a'
+b'\u{0}x'
+b'\u{1F63b}}'
+b'\v'
+b'\💩'
+b'\●'
+b'\\\r'
+b'\x'
+b'\x0'
+b'\xf'
+b'\xa'
+b'\xx'
+b'\xы'
+b'\x🦀'
+b'\xtt'
+b'\xff'
+b'\xFF'
+b'\x80'
+b'\u'
+b'\u[0123]'
+b'\u{0x}'
+b'\u{'
+b'\u{0000'
+b'\u{}'
+b'\u{_0000}'
+b'\u{0000000}'
+b'\u{FFFFFF}'
+b'\u{ffffff}'
+b'\u{ffffff}'
+b'\u{DC00}'
+b'\u{DDDD}'
+b'\u{DFFF}'
+b'\u{D800}'
+b'\u{DAAA}'
+b'\u{DBFF}'
diff --git a/crates/parser/test_data/lexer/err/char_literals.rast b/crates/parser/test_data/lexer/err/char_literals.rast
@@ -0,0 +1,92 @@
+CHAR "'hello'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "''" error: empty character literal
+WHITESPACE "\n"
+CHAR "'\n'" error: character constant must be escaped
+WHITESPACE "\n"
+CHAR "'spam'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\x0ff'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\\"a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\na'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\ra'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\ta'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\\\a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\'a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\0a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\u{0}x'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\u{1F63b}}'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\v'" error: unknown character escape
+WHITESPACE "\n"
+CHAR "'\\💩'" error: unknown character escape
+WHITESPACE "\n"
+CHAR "'\\●'" error: unknown character escape
+WHITESPACE "\n"
+CHAR "'\\\\\\r'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\x'" error: numeric character escape is too short
+WHITESPACE "\n"
+CHAR "'\\x0'" error: numeric character escape is too short
+WHITESPACE "\n"
+CHAR "'\\xf'" error: numeric character escape is too short
+WHITESPACE "\n"
+CHAR "'\\xa'" error: numeric character escape is too short
+WHITESPACE "\n"
+CHAR "'\\xx'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+CHAR "'\\xы'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+CHAR "'\\x🦀'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+CHAR "'\\xtt'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+CHAR "'\\xff'" error: out of range hex escape
+WHITESPACE "\n"
+CHAR "'\\xFF'" error: out of range hex escape
+WHITESPACE "\n"
+CHAR "'\\x80'" error: out of range hex escape
+WHITESPACE "\n"
+CHAR "'\\u'" error: incorrect unicode escape sequence
+WHITESPACE "\n"
+CHAR "'\\u[0123]'" error: incorrect unicode escape sequence
+WHITESPACE "\n"
+CHAR "'\\u{0x}'" error: invalid character in unicode escape
+WHITESPACE "\n"
+CHAR "'\\u{'" error: unterminated unicode escape
+WHITESPACE "\n"
+CHAR "'\\u{0000'" error: unterminated unicode escape
+WHITESPACE "\n"
+CHAR "'\\u{}'" error: empty unicode escape
+WHITESPACE "\n"
+CHAR "'\\u{_0000}'" error: invalid start of unicode escape
+WHITESPACE "\n"
+CHAR "'\\u{0000000}'" error: overlong unicode escape
+WHITESPACE "\n"
+CHAR "'\\u{FFFFFF}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{ffffff}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{ffffff}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{DC00}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{DDDD}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{DFFF}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{D800}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{DAAA}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{DBFF}'" error: invalid unicode character escape
+WHITESPACE "\n"
diff --git a/crates/parser/test_data/lexer/err/char_literals.rs b/crates/parser/test_data/lexer/err/char_literals.rs
@@ -0,0 +1,47 @@
+'hello'
+''
+'
+'
+'spam'
+'\x0ff'
+'\"a'
+'\na'
+'\ra'
+'\ta'
+'\\a'
+'\'a'
+'\0a'
+'\u{0}x'
+'\u{1F63b}}'
+'\v'
+'\💩'
+'\●'
+'\\\r'
+'\x'
+'\x0'
+'\xf'
+'\xa'
+'\xx'
+'\xы'
+'\x🦀'
+'\xtt'
+'\xff'
+'\xFF'
+'\x80'
+'\u'
+'\u[0123]'
+'\u{0x}'
+'\u{'
+'\u{0000'
+'\u{}'
+'\u{_0000}'
+'\u{0000000}'
+'\u{FFFFFF}'
+'\u{ffffff}'
+'\u{ffffff}'
+'\u{DC00}'
+'\u{DDDD}'
+'\u{DFFF}'
+'\u{D800}'
+'\u{DAAA}'
+'\u{DBFF}'
diff --git a/crates/parser/test_data/lexer/ok/byte_strings.rast b/crates/parser/test_data/lexer/ok/byte_strings.rast
@@ -1,13 +1,9 @@
-BYTE "b''"
-WHITESPACE " "
 BYTE "b'x'"
 WHITESPACE " "
 BYTE_STRING "b\"foo\""
 WHITESPACE " "
 BYTE_STRING "br\"\""
 WHITESPACE "\n"
-BYTE "b''suf"
-WHITESPACE " "
 BYTE_STRING "b\"\"ix"
 WHITESPACE " "
 BYTE_STRING "br\"\"br"
@@ -17,6 +13,4 @@ WHITESPACE " "
 BYTE "b'\\\\'"
 WHITESPACE " "
 BYTE "b'\\''"
-WHITESPACE " "
-BYTE "b'hello'"
 WHITESPACE "\n"
diff --git a/crates/parser/test_data/lexer/ok/byte_strings.rs b/crates/parser/test_data/lexer/ok/byte_strings.rs
@@ -1,3 +1,3 @@
-b'' b'x' b"foo" br""
-b''suf b""ix br""br
-b'\n' b'\\' b'\'' b'hello'
+b'x' b"foo" br""
+b""ix br""br
+b'\n' b'\\' b'\''
diff --git a/crates/parser/test_data/lexer/ok/chars.rast b/crates/parser/test_data/lexer/ok/chars.rast
diff --git a/crates/parser/test_data/lexer/ok/chars.rs b/crates/parser/test_data/lexer/ok/chars.rs