Skip to content

Commit

Permalink
Allow access to OsStr bytes
Browse files Browse the repository at this point in the history
`OsStr` has historically kept its implementation details private out of
concern for locking us into a specific encoding on Windows.

This is an alternative to rust-lang#95290 which proposed specifying the encoding on Windows.  Instead, this
only specifies that for cross-platform code, `OsStr`'s encoding is a superset of UTF-8 and defines
rules for safely interacting with it

At minimum, this can greatly simplify the `os_str_bytes` crate and every
arg parser that interacts with `OsStr` directly (which is most of those
that support invalid UTF-8).
  • Loading branch information
epage committed Mar 28, 2023
1 parent 70e04bd commit 8d2beb5
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 9 deletions.
8 changes: 8 additions & 0 deletions library/std/src/ffi/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,14 @@
//! trait, which provides a [`from_wide`] method to convert a native Windows
//! string (without the terminating nul character) to an [`OsString`].
//!
//! ## On all platforms
//!
//! On all platforms, [`OsStr`] consists of a sequence of bytes that is encoded as a superset of
//! UTF-8; see [`OsString`] for more details on its encoding on different platforms.
//!
//! For limited, inexpensive conversions from and to bytes, see [`OsStr::as_os_str_bytes`] and
//! [`OsStr::from_os_str_bytes_unchecked`].
//!
//! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
//! [Unicode code point]: https://www.unicode.org/glossary/#code_point
//! [`env::set_var()`]: crate::env::set_var "env::set_var"
Expand Down
56 changes: 55 additions & 1 deletion library/std/src/ffi/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,45 @@ impl OsStr {
s.as_ref()
}

/// Converts a slice of bytes to an OS string slice without checking that the string contains
/// valid `OsStr`-encoded data.
///
/// See the [module's toplevel documentation about conversions][conversions] for safe,
/// cross-platform [conversions] from/to native representations.
///
/// # Safety
///
/// `OsStr`'s encoding is an unspecified superset of UTF-8 and callers must
/// pass in bytes that originated as a mixture of validated UTF-8 and bytes from
/// [`OsStr::as_os_str_bytes`] from within the same rust version built for the same target
/// platform. The bytes from `OsStr::as_os_str_bytes` may be split either
/// immediately before or immediately after some valid non-empty UTF-8 substring
///
/// # Example
///
/// ```
/// #![feature(os_str_bytes)]
///
/// use std::ffi::OsStr;
///
/// let os_str = OsStr::new("Mary had a little lamb");
/// let bytes = os_str.as_os_str_bytes();
/// let words = bytes.split(|b| *b == b' ');
/// let words: Vec<&OsStr> = words.map(|word| {
/// // SAFETY:
/// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
/// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
/// unsafe { OsStr::from_os_str_bytes_unchecked(word) }
/// }).collect();
/// ```
///
/// [conversions]: super#conversions
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub fn from_os_str_bytes_unchecked(bytes: &[u8]) -> &Self {
Self::from_inner(Slice::from_os_str_bytes_unchecked(bytes))
}

#[inline]
fn from_inner(inner: &Slice) -> &OsStr {
// SAFETY: OsStr is just a wrapper of Slice,
Expand Down Expand Up @@ -837,13 +876,28 @@ impl OsStr {
OsString { inner: Buf::from_box(boxed) }
}

/// Converts an OS string slice to a byte slice. To convert the byte slice back into an OS
/// string slice, use the [`OsStr::from_os_str_bytes_unchecked`] function.
///
/// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
/// be treated as opaque and only comparable within the same rust version built for the same
/// target platform. See [`OsString`] for more encoding details and [`std::ffi`] for
/// platform-specific, specified conversions.
///
/// [`std::ffi`]: crate::ffi
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub fn as_os_str_bytes(&self) -> &[u8] {
self.inner.as_os_str_bytes()
}

/// Gets the underlying byte representation.
///
/// Note: it is *crucial* that this API is not externally public, to avoid
/// revealing the internal, platform-specific encodings.
#[inline]
pub(crate) fn bytes(&self) -> &[u8] {
unsafe { &*(&self.inner as *const _ as *const [u8]) }
self.as_os_str_bytes()
}

/// Converts this string to its ASCII lower case equivalent in-place.
Expand Down
9 changes: 7 additions & 2 deletions library/std/src/sys/unix/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,13 +193,18 @@ impl Buf {

impl Slice {
#[inline]
fn from_u8_slice(s: &[u8]) -> &Slice {
pub fn as_os_str_bytes(&self) -> &[u8] {
&self.inner
}

#[inline]
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
unsafe { mem::transmute(s) }
}

#[inline]
pub fn from_str(s: &str) -> &Slice {
Slice::from_u8_slice(s.as_bytes())
unsafe { Slice::from_os_str_bytes_unchecked(s.as_bytes()) }
}

pub fn to_str(&self) -> Option<&str> {
Expand Down
9 changes: 4 additions & 5 deletions library/std/src/sys/unix/os_str/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use super::*;

#[test]
fn slice_debug_output() {
let input = Slice::from_u8_slice(b"\xF0hello,\tworld");
let input = unsafe { Slice::from_os_str_bytes_unchecked(b"\xF0hello,\tworld") };
let expected = r#""\xF0hello,\tworld""#;
let output = format!("{input:?}");

Expand All @@ -11,8 +11,7 @@ fn slice_debug_output() {

#[test]
fn display() {
assert_eq!(
"Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
);
assert_eq!("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", unsafe {
Slice::from_os_str_bytes_unchecked(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string()
},);
}
10 changes: 10 additions & 0 deletions library/std/src/sys/windows/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,16 @@ impl Buf {
}

impl Slice {
#[inline]
pub fn as_os_str_bytes(&self) -> &[u8] {
self.inner.as_bytes()
}

#[inline]
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
mem::transmute(Wtf8::from_bytes_unchecked(s))
}

#[inline]
pub fn from_str(s: &str) -> &Slice {
unsafe { mem::transmute(Wtf8::from_str(s)) }
Expand Down
8 changes: 7 additions & 1 deletion library/std/src/sys_common/wtf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,7 @@ impl Wtf8 {
/// Since the byte slice is not checked for valid WTF-8, this functions is
/// marked unsafe.
#[inline]
unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
mem::transmute(value)
}

Expand Down Expand Up @@ -614,6 +614,12 @@ impl Wtf8 {
Wtf8CodePoints { bytes: self.bytes.iter() }
}

/// Access raw bytes of WTF-8 data
#[inline]
pub fn as_bytes(&self) -> &[u8] {
&self.bytes
}

/// Tries to convert the string to UTF-8 and return a `&str` slice.
///
/// Returns `None` if the string contains surrogates.
Expand Down

0 comments on commit 8d2beb5

Please sign in to comment.