Skip to content

Commit 0fb6e63

Browse files
committed
encode_utf8_raw is not always valid UTF-8; clarify comments
1 parent 9c627c3 commit 0fb6e63

File tree

2 files changed

+13
-8
lines changed

2 files changed

+13
-8
lines changed

src/libcore/char/methods.rs

+12-7
Original file line numberDiff line numberDiff line change
@@ -661,7 +661,8 @@ impl char {
661661
#[stable(feature = "unicode_encode_char", since = "1.15.0")]
662662
#[inline]
663663
pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
664-
encode_utf8_raw(self as u32, dst)
664+
// SAFETY: `char` is not a surrogate, so this is valid UTF-8.
665+
unsafe { from_utf8_unchecked_mut(encode_utf8_raw(self as u32, dst)) }
665666
}
666667

667668
/// Encodes this character as UTF-16 into the provided `u16` buffer,
@@ -1631,7 +1632,11 @@ fn len_utf8(code: u32) -> usize {
16311632
/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
16321633
/// and then returns the subslice of the buffer that contains the encoded character.
16331634
///
1634-
/// Unlike `char::encode_utf8`, this method can be called on codepoints in the surrogate range.
1635+
/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
1636+
/// (Creating a `char` in the surrogate range is UB.)
1637+
/// The result is valid [generalized UTF-8] but not necessarily valid UTF-8.
1638+
///
1639+
/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
16351640
///
16361641
/// # Panics
16371642
///
@@ -1640,7 +1645,7 @@ fn len_utf8(code: u32) -> usize {
16401645
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
16411646
#[doc(hidden)]
16421647
#[inline]
1643-
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut str {
1648+
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
16441649
let len = len_utf8(code);
16451650
match (len, &mut dst[..]) {
16461651
(1, [a, ..]) => {
@@ -1668,14 +1673,14 @@ pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut str {
16681673
dst.len(),
16691674
),
16701675
};
1671-
// SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
1672-
unsafe { from_utf8_unchecked_mut(&mut dst[..len]) }
1676+
&mut dst[..len]
16731677
}
16741678

16751679
/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
16761680
/// and then returns the subslice of the buffer that contains the encoded character.
16771681
///
1678-
/// Unlike `char::encode_utf16`, this method can be called on codepoints in the surrogate range.
1682+
/// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range.
1683+
/// (Creating a `char` in the surrogate range is UB.)
16791684
///
16801685
/// # Panics
16811686
///
@@ -1688,7 +1693,7 @@ pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
16881693
// SAFETY: each arm checks that `dst` has enough room to write into
16891694
unsafe {
16901695
if (code & 0xFFFF) == code && !dst.is_empty() {
1691-
// The BMP falls through (assuming non-surrogate, as it should)
1696+
// The BMP falls through
16921697
*dst.get_unchecked_mut(0) = code as u16;
16931698
slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
16941699
} else if dst.len() >= 2 {

src/libstd/sys_common/wtf8.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ impl Wtf8Buf {
202202
/// This does **not** include the WTF-8 concatenation check.
203203
fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
204204
let mut bytes = [0; 4];
205-
let bytes = char::encode_utf8_raw(code_point.value, &mut bytes).as_bytes();
205+
let bytes = char::encode_utf8_raw(code_point.value, &mut bytes);
206206
self.bytes.extend_from_slice(bytes)
207207
}
208208

0 commit comments

Comments
 (0)