@@ -305,43 +305,52 @@ fn unwrap_or_0(opt: Option<&u8>) -> u8 {
305
305
}
306
306
}
307
307
308
+ /// Reads the next code point out of a byte iterator (assuming a
309
+ /// UTF-8-like encoding).
310
+ #[ unstable]
311
+ pub fn next_code_point ( bytes : & mut slice:: Iter < u8 > ) -> Option < u32 > {
312
+ // Decode UTF-8
313
+ let x = match bytes. next ( ) {
314
+ None => return None ,
315
+ Some ( & next_byte) if next_byte < 128 => return Some ( next_byte as u32 ) ,
316
+ Some ( & next_byte) => next_byte,
317
+ } ;
318
+
319
+ // Multibyte case follows
320
+ // Decode from a byte combination out of: [[[x y] z] w]
321
+ // NOTE: Performance is sensitive to the exact formulation here
322
+ let init = utf8_first_byte ! ( x, 2 ) ;
323
+ let y = unwrap_or_0 ( bytes. next ( ) ) ;
324
+ let mut ch = utf8_acc_cont_byte ! ( init, y) ;
325
+ if x >= 0xE0 {
326
+ // [[x y z] w] case
327
+ // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
328
+ let z = unwrap_or_0 ( bytes. next ( ) ) ;
329
+ let y_z = utf8_acc_cont_byte ! ( ( y & CONT_MASK ) as u32 , z) ;
330
+ ch = init << 12 | y_z;
331
+ if x >= 0xF0 {
332
+ // [x y z w] case
333
+ // use only the lower 3 bits of `init`
334
+ let w = unwrap_or_0 ( bytes. next ( ) ) ;
335
+ ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ! ( y_z, w) ;
336
+ }
337
+ }
338
+
339
+ Some ( ch)
340
+ }
341
+
308
342
#[ stable]
309
343
impl < ' a > Iterator for Chars < ' a > {
310
344
type Item = char ;
311
345
312
346
#[ inline]
313
347
fn next ( & mut self ) -> Option < char > {
314
- // Decode UTF-8, using the valid UTF-8 invariant
315
- let x = match self . iter . next ( ) {
316
- None => return None ,
317
- Some ( & next_byte) if next_byte < 128 => return Some ( next_byte as char ) ,
318
- Some ( & next_byte) => next_byte,
319
- } ;
320
-
321
- // Multibyte case follows
322
- // Decode from a byte combination out of: [[[x y] z] w]
323
- // NOTE: Performance is sensitive to the exact formulation here
324
- let init = utf8_first_byte ! ( x, 2 ) ;
325
- let y = unwrap_or_0 ( self . iter . next ( ) ) ;
326
- let mut ch = utf8_acc_cont_byte ! ( init, y) ;
327
- if x >= 0xE0 {
328
- // [[x y z] w] case
329
- // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
330
- let z = unwrap_or_0 ( self . iter . next ( ) ) ;
331
- let y_z = utf8_acc_cont_byte ! ( ( y & CONT_MASK ) as u32 , z) ;
332
- ch = init << 12 | y_z;
333
- if x >= 0xF0 {
334
- // [x y z w] case
335
- // use only the lower 3 bits of `init`
336
- let w = unwrap_or_0 ( self . iter . next ( ) ) ;
337
- ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ! ( y_z, w) ;
348
+ next_code_point ( & mut self . iter ) . map ( |ch| {
349
+ // str invariant says `ch` is a valid Unicode Scalar Value
350
+ unsafe {
351
+ mem:: transmute ( ch)
338
352
}
339
- }
340
-
341
- // str invariant says `ch` is a valid Unicode Scalar Value
342
- unsafe {
343
- Some ( mem:: transmute ( ch) )
344
- }
353
+ } )
345
354
}
346
355
347
356
#[ inline]
@@ -1517,25 +1526,8 @@ impl StrExt for str {
1517
1526
1518
1527
#[ inline]
1519
1528
fn char_range_at ( & self , i : uint ) -> CharRange {
1520
- if self . as_bytes ( ) [ i] < 128u8 {
1521
- return CharRange { ch : self . as_bytes ( ) [ i] as char , next : i + 1 } ;
1522
- }
1523
-
1524
- // Multibyte case is a fn to allow char_range_at to inline cleanly
1525
- fn multibyte_char_range_at ( s : & str , i : uint ) -> CharRange {
1526
- let mut val = s. as_bytes ( ) [ i] as u32 ;
1527
- let w = UTF8_CHAR_WIDTH [ val as uint ] as uint ;
1528
- assert ! ( ( w != 0 ) ) ;
1529
-
1530
- val = utf8_first_byte ! ( val, w) ;
1531
- val = utf8_acc_cont_byte ! ( val, s. as_bytes( ) [ i + 1 ] ) ;
1532
- if w > 2 { val = utf8_acc_cont_byte ! ( val, s. as_bytes( ) [ i + 2 ] ) ; }
1533
- if w > 3 { val = utf8_acc_cont_byte ! ( val, s. as_bytes( ) [ i + 3 ] ) ; }
1534
-
1535
- return CharRange { ch : unsafe { mem:: transmute ( val) } , next : i + w} ;
1536
- }
1537
-
1538
- return multibyte_char_range_at ( self , i) ;
1529
+ let ( c, n) = char_range_at_raw ( self . as_bytes ( ) , i) ;
1530
+ CharRange { ch : unsafe { mem:: transmute ( c) } , next : n }
1539
1531
}
1540
1532
1541
1533
#[ inline]
@@ -1653,6 +1645,32 @@ impl StrExt for str {
1653
1645
fn parse < T : FromStr > ( & self ) -> Option < T > { FromStr :: from_str ( self ) }
1654
1646
}
1655
1647
1648
+ /// Pluck a code point out of a UTF-8-like byte slice and return the
1649
+ /// index of the next code point.
1650
+ #[ inline]
1651
+ #[ unstable]
1652
+ pub fn char_range_at_raw ( bytes : & [ u8 ] , i : uint ) -> ( u32 , usize ) {
1653
+ if bytes[ i] < 128u8 {
1654
+ return ( bytes[ i] as u32 , i + 1 ) ;
1655
+ }
1656
+
1657
+ // Multibyte case is a fn to allow char_range_at to inline cleanly
1658
+ fn multibyte_char_range_at ( bytes : & [ u8 ] , i : uint ) -> ( u32 , usize ) {
1659
+ let mut val = bytes[ i] as u32 ;
1660
+ let w = UTF8_CHAR_WIDTH [ val as uint ] as uint ;
1661
+ assert ! ( ( w != 0 ) ) ;
1662
+
1663
+ val = utf8_first_byte ! ( val, w) ;
1664
+ val = utf8_acc_cont_byte ! ( val, bytes[ i + 1 ] ) ;
1665
+ if w > 2 { val = utf8_acc_cont_byte ! ( val, bytes[ i + 2 ] ) ; }
1666
+ if w > 3 { val = utf8_acc_cont_byte ! ( val, bytes[ i + 3 ] ) ; }
1667
+
1668
+ return ( val, i + w) ;
1669
+ }
1670
+
1671
+ multibyte_char_range_at ( bytes, i)
1672
+ }
1673
+
1656
1674
#[ stable]
1657
1675
impl < ' a > Default for & ' a str {
1658
1676
#[ stable]
0 commit comments