Skip to content

Commit c5369eb

Browse files
committed
Add ffi::OsString and OsStr
Per [RFC 517](rust-lang/rfcs#575), this commit introduces platform-native strings. The API is essentially as described in the RFC. The WTF-8 implementation is adapted from @SimonSapin's [implementation](https://github.com/SimonSapin/rust-wtf8). To make this work, some encoding and decoding functionality in `libcore` is now exported in a "raw" fashion reusable for WTF-8. These exports are *not* reexported in `std`, nor are they stable.
1 parent d8d5e4d commit c5369eb

File tree

12 files changed

+1850
-92
lines changed

12 files changed

+1850
-92
lines changed

src/libcore/char.rs

+58-38
Original file line numberDiff line numberDiff line change
@@ -258,49 +258,69 @@ impl CharExt for char {
258258
#[inline]
259259
#[unstable = "pending decision about Iterator/Writer/Reader"]
260260
fn encode_utf8(self, dst: &mut [u8]) -> Option<uint> {
261-
// Marked #[inline] to allow llvm optimizing it away
262-
let code = self as u32;
263-
if code < MAX_ONE_B && dst.len() >= 1 {
264-
dst[0] = code as u8;
265-
Some(1)
266-
} else if code < MAX_TWO_B && dst.len() >= 2 {
267-
dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
268-
dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
269-
Some(2)
270-
} else if code < MAX_THREE_B && dst.len() >= 3 {
271-
dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
272-
dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
273-
dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
274-
Some(3)
275-
} else if dst.len() >= 4 {
276-
dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
277-
dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
278-
dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
279-
dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
280-
Some(4)
281-
} else {
282-
None
283-
}
261+
encode_utf8_raw(self as u32, dst)
284262
}
285263

286264
#[inline]
287265
#[unstable = "pending decision about Iterator/Writer/Reader"]
288266
fn encode_utf16(self, dst: &mut [u16]) -> Option<uint> {
289-
// Marked #[inline] to allow llvm optimizing it away
290-
let mut ch = self as u32;
291-
if (ch & 0xFFFF_u32) == ch && dst.len() >= 1 {
292-
// The BMP falls through (assuming non-surrogate, as it should)
293-
dst[0] = ch as u16;
294-
Some(1)
295-
} else if dst.len() >= 2 {
296-
// Supplementary planes break into surrogates.
297-
ch -= 0x1_0000_u32;
298-
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
299-
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
300-
Some(2)
301-
} else {
302-
None
303-
}
267+
encode_utf16_raw(self as u32, dst)
268+
}
269+
}
270+
271+
/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
272+
/// and then returns the number of bytes written.
273+
///
274+
/// If the buffer is not large enough, nothing will be written into it
275+
/// and a `None` will be returned.
276+
#[inline]
277+
#[unstable]
278+
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<uint> {
279+
// Marked #[inline] to allow llvm optimizing it away
280+
if code < MAX_ONE_B && dst.len() >= 1 {
281+
dst[0] = code as u8;
282+
Some(1)
283+
} else if code < MAX_TWO_B && dst.len() >= 2 {
284+
dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
285+
dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
286+
Some(2)
287+
} else if code < MAX_THREE_B && dst.len() >= 3 {
288+
dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
289+
dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
290+
dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
291+
Some(3)
292+
} else if dst.len() >= 4 {
293+
dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
294+
dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
295+
dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
296+
dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
297+
Some(4)
298+
} else {
299+
None
300+
}
301+
}
302+
303+
/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
304+
/// and then returns the number of `u16`s written.
305+
///
306+
/// If the buffer is not large enough, nothing will be written into it
307+
/// and a `None` will be returned.
308+
#[inline]
309+
#[unstable]
310+
pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<uint> {
311+
// Marked #[inline] to allow llvm optimizing it away
312+
if (ch & 0xFFFF_u32) == ch && dst.len() >= 1 {
313+
// The BMP falls through (assuming non-surrogate, as it should)
314+
dst[0] = ch as u16;
315+
Some(1)
316+
} else if dst.len() >= 2 {
317+
// Supplementary planes break into surrogates.
318+
ch -= 0x1_0000_u32;
319+
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
320+
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
321+
Some(2)
322+
} else {
323+
None
304324
}
305325
}
306326

src/libcore/str/mod.rs

+67-49
Original file line numberDiff line numberDiff line change
@@ -305,43 +305,52 @@ fn unwrap_or_0(opt: Option<&u8>) -> u8 {
305305
}
306306
}
307307

308+
/// Reads the next code point out of a byte iterator (assuming a
309+
/// UTF-8-like encoding).
310+
#[unstable]
311+
pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<u32> {
312+
// Decode UTF-8
313+
let x = match bytes.next() {
314+
None => return None,
315+
Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32),
316+
Some(&next_byte) => next_byte,
317+
};
318+
319+
// Multibyte case follows
320+
// Decode from a byte combination out of: [[[x y] z] w]
321+
// NOTE: Performance is sensitive to the exact formulation here
322+
let init = utf8_first_byte!(x, 2);
323+
let y = unwrap_or_0(bytes.next());
324+
let mut ch = utf8_acc_cont_byte!(init, y);
325+
if x >= 0xE0 {
326+
// [[x y z] w] case
327+
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
328+
let z = unwrap_or_0(bytes.next());
329+
let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
330+
ch = init << 12 | y_z;
331+
if x >= 0xF0 {
332+
// [x y z w] case
333+
// use only the lower 3 bits of `init`
334+
let w = unwrap_or_0(bytes.next());
335+
ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
336+
}
337+
}
338+
339+
Some(ch)
340+
}
341+
308342
#[stable]
309343
impl<'a> Iterator for Chars<'a> {
310344
type Item = char;
311345

312346
#[inline]
313347
fn next(&mut self) -> Option<char> {
314-
// Decode UTF-8, using the valid UTF-8 invariant
315-
let x = match self.iter.next() {
316-
None => return None,
317-
Some(&next_byte) if next_byte < 128 => return Some(next_byte as char),
318-
Some(&next_byte) => next_byte,
319-
};
320-
321-
// Multibyte case follows
322-
// Decode from a byte combination out of: [[[x y] z] w]
323-
// NOTE: Performance is sensitive to the exact formulation here
324-
let init = utf8_first_byte!(x, 2);
325-
let y = unwrap_or_0(self.iter.next());
326-
let mut ch = utf8_acc_cont_byte!(init, y);
327-
if x >= 0xE0 {
328-
// [[x y z] w] case
329-
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
330-
let z = unwrap_or_0(self.iter.next());
331-
let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
332-
ch = init << 12 | y_z;
333-
if x >= 0xF0 {
334-
// [x y z w] case
335-
// use only the lower 3 bits of `init`
336-
let w = unwrap_or_0(self.iter.next());
337-
ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
348+
next_code_point(&mut self.iter).map(|ch| {
349+
// str invariant says `ch` is a valid Unicode Scalar Value
350+
unsafe {
351+
mem::transmute(ch)
338352
}
339-
}
340-
341-
// str invariant says `ch` is a valid Unicode Scalar Value
342-
unsafe {
343-
Some(mem::transmute(ch))
344-
}
353+
})
345354
}
346355

347356
#[inline]
@@ -1517,25 +1526,8 @@ impl StrExt for str {
15171526

15181527
#[inline]
15191528
fn char_range_at(&self, i: uint) -> CharRange {
1520-
if self.as_bytes()[i] < 128u8 {
1521-
return CharRange {ch: self.as_bytes()[i] as char, next: i + 1 };
1522-
}
1523-
1524-
// Multibyte case is a fn to allow char_range_at to inline cleanly
1525-
fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
1526-
let mut val = s.as_bytes()[i] as u32;
1527-
let w = UTF8_CHAR_WIDTH[val as uint] as uint;
1528-
assert!((w != 0));
1529-
1530-
val = utf8_first_byte!(val, w);
1531-
val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 1]);
1532-
if w > 2 { val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 2]); }
1533-
if w > 3 { val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 3]); }
1534-
1535-
return CharRange {ch: unsafe { mem::transmute(val) }, next: i + w};
1536-
}
1537-
1538-
return multibyte_char_range_at(self, i);
1529+
let (c, n) = char_range_at_raw(self.as_bytes(), i);
1530+
CharRange { ch: unsafe { mem::transmute(c) }, next: n }
15391531
}
15401532

15411533
#[inline]
@@ -1653,6 +1645,32 @@ impl StrExt for str {
16531645
fn parse<T: FromStr>(&self) -> Option<T> { FromStr::from_str(self) }
16541646
}
16551647

1648+
/// Pluck a code point out of a UTF-8-like byte slice and return the
1649+
/// index of the next code point.
1650+
#[inline]
1651+
#[unstable]
1652+
pub fn char_range_at_raw(bytes: &[u8], i: uint) -> (u32, usize) {
1653+
if bytes[i] < 128u8 {
1654+
return (bytes[i] as u32, i + 1);
1655+
}
1656+
1657+
// Multibyte case is a fn to allow char_range_at to inline cleanly
1658+
fn multibyte_char_range_at(bytes: &[u8], i: uint) -> (u32, usize) {
1659+
let mut val = bytes[i] as u32;
1660+
let w = UTF8_CHAR_WIDTH[val as uint] as uint;
1661+
assert!((w != 0));
1662+
1663+
val = utf8_first_byte!(val, w);
1664+
val = utf8_acc_cont_byte!(val, bytes[i + 1]);
1665+
if w > 2 { val = utf8_acc_cont_byte!(val, bytes[i + 2]); }
1666+
if w > 3 { val = utf8_acc_cont_byte!(val, bytes[i + 3]); }
1667+
1668+
return (val, i + w);
1669+
}
1670+
1671+
multibyte_char_range_at(bytes, i)
1672+
}
1673+
16561674
#[stable]
16571675
impl<'a> Default for &'a str {
16581676
#[stable]

src/libstd/ffi/mod.rs

+5
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,9 @@ pub use self::c_str::CString;
1717
pub use self::c_str::c_str_to_bytes;
1818
pub use self::c_str::c_str_to_bytes_with_nul;
1919

20+
pub use self::os_str::OsString;
21+
pub use self::os_str::OsStr;
22+
pub use self::os_str::AsOsStr;
23+
2024
mod c_str;
25+
mod os_str;

0 commit comments

Comments
 (0)