rust-lang · wesleywiser · Oct 5, 2020 · Oct 1, 2020 · Oct 1, 2020 · Oct 1, 2020
diff --git a/analyzeme/src/stringtable.rs b/analyzeme/src/stringtable.rs
@@ -1,12 +1,16 @@
 //! See module-level documentation `measureme::stringtable`.
 
-use measureme::file_header::{
-    strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA,
-    FILE_MAGIC_STRINGTABLE_INDEX,
+use measureme::stringtable::{METADATA_STRING_ID, TERMINATOR};
+use measureme::{
+    file_header::{
+        strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA,
+        FILE_MAGIC_STRINGTABLE_INDEX,
+    },
+    stringtable::STRING_REF_ENCODED_SIZE,
+    stringtable::STRING_REF_TAG,
 };
-use measureme::stringtable::{METADATA_STRING_ID, STRING_ID_MASK, TERMINATOR};
 use measureme::{Addr, StringId};
-use memchr::memchr;
+use memchr::{memchr, memchr2};
 use rustc_hash::FxHashMap;
 use std::borrow::Cow;
 use std::convert::TryInto;
@@ -30,6 +34,10 @@ pub struct StringRef<'st> {
 // be resolved.
 const UNKNOWN_STRING: &str = "<unknown>";
 
+// This is the text we emit when we encounter string data that does not have a
+// proper terminator.
+const INVALID_STRING: &str = "<invalid>";
+
 impl<'st> StringRef<'st> {
     /// Expands the StringRef into an actual string. This method will
     /// avoid allocating a `String` if it can instead return a `&str` pointing
@@ -55,9 +63,8 @@ impl<'st> StringRef<'st> {
 
         // Check if this is a string containing a single StringId component
         let first_byte = self.table.string_data[pos];
-        const STRING_ID_SIZE: usize = std::mem::size_of::<StringId>();
-        if terminator_pos == pos + STRING_ID_SIZE && is_utf8_continuation_byte(first_byte) {
-            let id = decode_string_id_from_data(&self.table.string_data[pos..pos + STRING_ID_SIZE]);
+        if first_byte == STRING_REF_TAG && terminator_pos == pos + STRING_REF_ENCODED_SIZE {
+            let id = decode_string_ref_from_data(&self.table.string_data[pos..]);
             return StringRef {
                 id,
                 table: self.table,
@@ -97,19 +104,28 @@ impl<'st> StringRef<'st> {
 
             if byte == TERMINATOR {
                 return;
-            } else if is_utf8_continuation_byte(byte) {
+            } else if byte == STRING_REF_TAG {
                 let string_ref = StringRef {
-                    id: decode_string_id_from_data(&self.table.string_data[pos..pos + 4]),
+                    id: decode_string_ref_from_data(&self.table.string_data[pos..]),
                     table: self.table,
                 };
 
                 string_ref.write_to_string(output);
 
-                pos += 4;
+                pos += STRING_REF_ENCODED_SIZE;
             } else {
-                while let Some((c, len)) = decode_utf8_char(&self.table.string_data[pos..]) {
-                    output.push(c);
+                // This is a literal UTF-8 string value. Find its end by looking
+                // for either of the two possible terminator bytes.
+                let remaining_data = &self.table.string_data[pos..];
+                if let Some(len) = memchr2(0xFF, 0xFE, remaining_data) {
+                    let value = String::from_utf8_lossy(&remaining_data[..len]);
+                    output.push_str(&value);
                     pos += len;
+                } else {
+                    // The grammar does not allow unterminated raw strings. We
+                    // have to stop decoding.
+                    output.push_str(INVALID_STRING);
+                    return;
                 }
             }
         }
@@ -129,71 +145,17 @@ impl<'st> StringRef<'st> {
     }
 }
 
-fn is_utf8_continuation_byte(byte: u8) -> bool {
-    // See module-level documentation for more information on the encoding.
-    const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
-    const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;
-    (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE
-}
-
 // String IDs in the table data are encoded in big endian format, while string
 // IDs in the index are encoded in little endian format. Don't mix the two up.
-fn decode_string_id_from_data(bytes: &[u8]) -> StringId {
-    let id = u32::from_be_bytes(bytes[0..4].try_into().unwrap());
-    // Mask off the `0b10` prefix
-    StringId::new(id & STRING_ID_MASK)
-}
-
-// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
-// Returns the decoded `char` and its size in bytes if it succeeds.
-// Returns `None` if `bytes` does not start with a valid UTF-8 codepoint.
-// See https://en.wikipedia.org/wiki/UTF-8 for in-depth information on the
-// encoding.
-fn decode_utf8_char(bytes: &[u8]) -> Option<(char, usize)> {
-    use std::convert::TryFrom;
-    let first_byte = bytes[0] as u32;
-    let (codepoint, len) = if (first_byte & 0b1000_0000) == 0 {
-        // The highest bit is zero, so this is a single-byte char
-        (first_byte, 1)
-    } else if (first_byte & 0b1110_0000) == 0b1100_0000 {
-        // This is a two byte character
-        let bits0 = first_byte & 0b0001_1111;
-        let bits1 = (bytes[1] & 0b0011_1111) as u32;
-
-        (bits0 << 6 | bits1, 2)
-    } else if (first_byte & 0b1111_0000) == 0b1110_0000 {
-        // This is a three byte character
-        let bits0 = first_byte & 0b0000_1111;
-        let bits1 = (bytes[1] & 0b0011_1111) as u32;
-        let bits2 = (bytes[2] & 0b0011_1111) as u32;
-
-        ((bits0 << 12) | (bits1 << 6) | bits2, 3)
-    } else if (first_byte & 0b1111_1000) == 0b1111_0000 {
-        // This is a four byte character
-        let bits0 = first_byte & 0b0000_0111;
-        let bits1 = (bytes[1] & 0b0011_1111) as u32;
-        let bits2 = (bytes[2] & 0b0011_1111) as u32;
-        let bits3 = (bytes[3] & 0b0011_1111) as u32;
-
-        ((bits0 << 18) | (bits1 << 12) | (bits2 << 6) | bits3, 4)
-    } else {
-        return None;
-    };
-
-    match char::try_from(codepoint) {
-        Ok(c) => {
-            debug_assert!({
-                let test_bytes = &mut [0u8; 8];
-                c.encode_utf8(test_bytes);
-                &test_bytes[..len] == &bytes[..len]
-            });
-
-            Some((c, len))
-        }
-        Err(e) => {
-            panic!("StringTable: Encountered invalid UTF8 char: {:?}", e);
-        }
-    }
+fn decode_string_ref_from_data(bytes: &[u8]) -> StringId {
+    // The code below assumes we use a 5-byte encoding for string
+    // refs, where the first byte is STRING_REF_TAG and the
+    // following 4 bytes are a little-endian u32 string ID value.
+    assert!(bytes[0] == STRING_REF_TAG);
+    assert!(STRING_REF_ENCODED_SIZE == 5);
+
+    let id = u32::from_le_bytes(bytes[1..5].try_into().unwrap());
+    StringId::new(id)
 }
 
 /// Read-only version of the string table
@@ -346,20 +308,4 @@ mod tests {
             assert_eq!(str_ref.to_string(), write_to);
         }
     }
-
-    #[test]
-    fn utf8_char_decoding() {
-        use std::convert::TryFrom;
-
-        // Let's just test all possible codepoints because there are not that
-        // many actually.
-        for codepoint in 0..=0x10FFFFu32 {
-            if let Ok(expected_char) = char::try_from(codepoint) {
-                let buffer = &mut [0; 4];
-                let expected_len = expected_char.encode_utf8(buffer).len();
-                let expected = Some((expected_char, expected_len));
-                assert_eq!(expected, decode_utf8_char(&buffer[..]));
-            }
-        }
-    }
 }
diff --git a/measureme/src/file_header.rs b/measureme/src/file_header.rs
@@ -5,7 +5,7 @@ use std::convert::TryInto;
 use std::error::Error;
 use std::path::Path;
 
-pub const CURRENT_FILE_FORMAT_VERSION: u32 = 6;
+pub const CURRENT_FILE_FORMAT_VERSION: u32 = 7;
 
 pub const FILE_MAGIC_TOP_LEVEL: &[u8; 4] = b"MMPD";
 pub const FILE_MAGIC_EVENT_STREAM: &[u8; 4] = b"MMES";

diff --git a/measureme/src/stringtable.rs b/measureme/src/stringtable.rs
@@ -15,27 +15,24 @@
 //! The byte-level encoding of component lists uses the structure of UTF-8 in
 //! order to save space:
 //!
-//! - A valid UTF-8 codepoint never starts with the bits `10` as this bit
-//!   prefix is reserved for bytes in the middle of a UTF-8 codepoint byte
-//!   sequence. We make use of this fact by letting all string ID components
-//!   start with this `10` prefix. Thus when we parse the contents of a value
-//!   we know to stop if the start byte of the next codepoint has this prefix.
+//! - A valid UTF-8 codepoint never starts with the byte `0xFE`. We make use
+//!   of this fact by letting all string ID components start with this `0xFE`
+//!   prefix. Thus when we parse the contents of a value we know to stop if
+//!   we encounter this byte.
 //!
-//! - A valid UTF-8 string cannot contain the `0xFF` byte and since string IDs
-//!   start with `10` as described above, they also cannot start with a `0xFF`
-//!   byte. Thus we can safely use `0xFF` as our component list terminator.
+//! - A valid UTF-8 string cannot contain the `0xFF` byte. Thus we can safely
+//!   use `0xFF` as our component list terminator.
 //!
 //! The sample composite string ["abc", ID(42), "def", TERMINATOR] would thus be
 //! encoded as:
 //!
 //! ```ignore
-//!     ['a', 'b' , 'c', 128, 0, 0, 42, 'd', 'e', 'f', 255]
-//!                      ^^^^^^^^^^^^^                 ^^^
-//!              string ID 42 with 0b10 prefix        terminator (0xFF)
+//!     ['a', 'b' , 'c', 254, 42, 0, 0, 0, 'd', 'e', 'f', 255]
+//!                      ^^^^^^^^^^^^^^^^                 ^^^
+//!                 string ID with 0xFE prefix      terminator (0xFF)
 //! ```
 //!
-//! As you can see string IDs are encoded in big endian format so that highest
-//! order bits show up in the first byte we encounter.
+//! As you can see string IDs are encoded in little endian format.
 //!
 //! ----------------------------------------------------------------------------
 //!
@@ -58,10 +55,10 @@
 //! > [0 .. MAX_VIRTUAL_STRING_ID, METADATA_STRING_ID, .. ]
 //!
 //! From `0` to `MAX_VIRTUAL_STRING_ID` are the allowed values for virtual strings.
-//! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`) which is used
-//! internally by `measureme` to record additional metadata about the profiling session.
-//! After `METADATA_STRING_ID` are all other `StringId` values.
-//!
+//! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`)
+//! which is used internally by `measureme` to record additional metadata about
+//! the profiling session. After `METADATA_STRING_ID` are all other `StringId`
+//! values.
 
 use crate::file_header::{
     write_file_header, FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX,
@@ -84,7 +81,6 @@ impl StringId {
 
     #[inline]
     pub fn new(id: u32) -> StringId {
-        assert!(id <= MAX_STRING_ID);
         StringId(id)
     }
 
@@ -106,23 +102,20 @@ impl StringId {
 
     #[inline]
     pub fn from_addr(addr: Addr) -> StringId {
-        let id = addr.0 + FIRST_REGULAR_STRING_ID;
+        let id = addr.0.checked_add(FIRST_REGULAR_STRING_ID).unwrap();
         StringId::new(id)
     }
 
     #[inline]
     pub fn to_addr(self) -> Addr {
-        assert!(self.0 >= FIRST_REGULAR_STRING_ID);
-        Addr(self.0 - FIRST_REGULAR_STRING_ID)
+        Addr(self.0.checked_sub(FIRST_REGULAR_STRING_ID).unwrap())
     }
 }
 
 // See module-level documentation for more information on the encoding.
 pub const TERMINATOR: u8 = 0xFF;
-
-// All 1s except for the two highest bits.
-pub const MAX_STRING_ID: u32 = 0x3FFF_FFFF;
-pub const STRING_ID_MASK: u32 = 0x3FFF_FFFF;
+pub const STRING_REF_TAG: u8 = 0xFE;
+pub const STRING_REF_ENCODED_SIZE: usize = 5;
 
 /// The maximum id value a virtual string may be.
 const MAX_USER_VIRTUAL_STRING_ID: u32 = 100_000_000;
@@ -175,7 +168,7 @@ impl<'s> StringComponent<'s> {
     fn serialized_size(&self) -> usize {
         match *self {
             StringComponent::Value(s) => s.len(),
-            StringComponent::Ref(_) => 4,
+            StringComponent::Ref(_) => STRING_REF_ENCODED_SIZE,
         }
     }
 
@@ -187,11 +180,14 @@ impl<'s> StringComponent<'s> {
                 &mut bytes[s.len()..]
             }
             StringComponent::Ref(string_id) => {
-                assert!(string_id.0 == string_id.0 & STRING_ID_MASK);
-                let tagged = string_id.0 | (1u32 << 31);
-
-                &mut bytes[0..4].copy_from_slice(&tagged.to_be_bytes());
-                &mut bytes[4..]
+                // The code below assumes we use a 5-byte encoding for string
+                // refs, where the first byte is STRING_REF_TAG and the
+                // following 4 bytes are a little-endian u32 string ID value.
+                assert!(STRING_REF_ENCODED_SIZE == 5);
+
+                bytes[0] = STRING_REF_TAG;
+                &mut bytes[1..5].copy_from_slice(&string_id.0.to_le_bytes());
+                &mut bytes[5..]
             }
         }
     }