@@ -661,7 +661,8 @@ impl char {
661
661
#[ stable( feature = "unicode_encode_char" , since = "1.15.0" ) ]
662
662
#[ inline]
663
663
pub fn encode_utf8 ( self , dst : & mut [ u8 ] ) -> & mut str {
664
- encode_utf8_raw ( self as u32 , dst)
664
+ // SAFETY: `char` is not a surrogate, so this is valid UTF-8.
665
+ unsafe { from_utf8_unchecked_mut ( encode_utf8_raw ( self as u32 , dst) ) }
665
666
}
666
667
667
668
/// Encodes this character as UTF-16 into the provided `u16` buffer,
@@ -1631,7 +1632,11 @@ fn len_utf8(code: u32) -> usize {
1631
1632
/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
1632
1633
/// and then returns the subslice of the buffer that contains the encoded character.
1633
1634
///
1634
- /// Unlike `char::encode_utf8`, this method can be called on codepoints in the surrogate range.
1635
+ /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
1636
+ /// (Creating a `char` in the surrogate range is UB.)
1637
+ /// For a codepoint in the surrogate range, the result is valid [generalized UTF-8] but not valid UTF-8.
1638
+ ///
1639
+ /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
1635
1640
///
1636
1641
/// # Panics
1637
1642
///
@@ -1640,7 +1645,7 @@ fn len_utf8(code: u32) -> usize {
1640
1645
#[ unstable( feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" ) ]
1641
1646
#[ doc( hidden) ]
1642
1647
#[ inline]
1643
- pub fn encode_utf8_raw ( code : u32 , dst : & mut [ u8 ] ) -> & mut str {
1648
+ pub fn encode_utf8_raw ( code : u32 , dst : & mut [ u8 ] ) -> & mut [ u8 ] {
1644
1649
let len = len_utf8 ( code) ;
1645
1650
match ( len, & mut dst[ ..] ) {
1646
1651
( 1 , [ a, ..] ) => {
@@ -1668,14 +1673,14 @@ pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut str {
1668
1673
dst. len( ) ,
1669
1674
) ,
1670
1675
} ;
1671
- // SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
1672
- unsafe { from_utf8_unchecked_mut ( & mut dst[ ..len] ) }
1676
+ & mut dst[ ..len]
1673
1677
}
1674
1678
1675
1679
/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
1676
1680
/// and then returns the subslice of the buffer that contains the encoded character.
1677
1681
///
1678
- /// Unlike `char::encode_utf16`, this method can be called on codepoints in the surrogate range.
1682
+ /// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range.
1683
+ /// (Creating a `char` in the surrogate range is UB.)
1679
1684
///
1680
1685
/// # Panics
1681
1686
///
@@ -1688,7 +1693,7 @@ pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
1688
1693
// SAFETY: each arm checks whether `dst` has enough elements to write into
1689
1694
unsafe {
1690
1695
if ( code & 0xFFFF ) == code && !dst. is_empty ( ) {
1691
- // The BMP falls through (assuming non-surrogate, as it should)
1696
+ // The BMP falls through
1692
1697
* dst. get_unchecked_mut ( 0 ) = code as u16 ;
1693
1698
slice:: from_raw_parts_mut ( dst. as_mut_ptr ( ) , 1 )
1694
1699
} else if dst. len ( ) >= 2 {
0 commit comments