From 8febb721d5284555d9db5797bd3b07e5c9f7b466 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Mei=C3=9Fner?= <AlexanderMeissner@gmx.net>
Date: Fri, 2 Jul 2021 05:35:56 +0200
Subject: [PATCH] strtab: Parse Strtab only once on Creation (#275)

* Parse Strtab immediately when created,
instead of on demand when entries are requested.
* Renames from_slice_unsafe => from_slice_unparsed
and updates comments.
---
 src/archive/mod.rs |   8 +--
 src/elf/dynamic.rs |   2 +-
 src/elf/mod.rs     |   6 +-
 src/pe/symbol.rs   |   7 +-
 src/strtab.rs      | 176 +++++++++++++++++++++++++++++++--------------
 tests/elf.rs       |   2 +-
 6 files changed, 132 insertions(+), 69 deletions(-)

diff --git a/src/archive/mod.rs b/src/archive/mod.rs
index 04e8037e..bfe09ccf 100644
--- a/src/archive/mod.rs
+++ b/src/archive/mod.rs
@@ -282,8 +282,8 @@ impl<'a> Index<'a> {
             let string_offset: u32 = buffer.pread_with(i * 8 + 4, scroll::LE)?;
             let archive_member: u32 = buffer.pread_with(i * 8 + 8, scroll::LE)?;
 
-            let string = match strtab.get(string_offset as usize) {
-                Some(result) => result,
+            let string = match strtab.get_at(string_offset as usize) {
+                Some(result) => Ok(result),
                 None => Err(Error::Malformed(format!(
                     "{} entry {} has string offset {}, which is out of bounds",
                     BSD_SYMDEF_NAME, i, string_offset
@@ -353,8 +353,8 @@ impl<'a> NameIndex<'a> {
         let idx = name.trim_start_matches('/').trim_end();
         match usize::from_str_radix(idx, 10) {
             Ok(idx) => {
-                let name = match self.strtab.get(idx + 1) {
-                    Some(result) => result,
+                let name = match self.strtab.get_at(idx + 1) {
+                    Some(result) => Ok(result),
                     None => Err(Error::Malformed(format!(
                         "Name {} is out of range in archive NameIndex",
                         name
diff --git a/src/elf/dynamic.rs b/src/elf/dynamic.rs
index 5425fb9d..8de3b77e 100644
--- a/src/elf/dynamic.rs
+++ b/src/elf/dynamic.rs
@@ -453,7 +453,7 @@ if_alloc! {
             let mut needed = Vec::with_capacity(count);
             for dynamic in &self.dyns {
                 if dynamic.d_tag as u64 == DT_NEEDED {
-                    if let Some(Ok(lib)) = strtab.get(dynamic.d_val as usize) {
+                    if let Some(lib) = strtab.get_at(dynamic.d_val as usize) {
                         needed.push(lib)
                     } else {
                         warn!("Invalid DT_NEEDED {}", dynamic.d_val)
diff --git a/src/elf/mod.rs b/src/elf/mod.rs
index be86cf11..9101d95e 100644
--- a/src/elf/mod.rs
+++ b/src/elf/mod.rs
@@ -174,9 +174,7 @@ if_sylvan! {
                     continue;
                 }
 
-                if section_name.is_some() && !self.shdr_strtab
-                    .get(sect.sh_name)
-                    .map_or(false, |r| r.ok() == section_name) {
+                if section_name.is_some() && self.shdr_strtab.get_at(sect.sh_name) != section_name {
                     continue;
                 }
 
@@ -298,7 +296,7 @@ if_sylvan! {
 
                 if dyn_info.soname != 0 {
                     // FIXME: warn! here
-                    soname = match dynstrtab.get(dyn_info.soname) { Some(Ok(soname)) => Some(soname), _ => None };
+                    soname = dynstrtab.get_at(dyn_info.soname);
                 }
                 if dyn_info.needed_count > 0 {
                     libraries = dynamic.get_libraries(&dynstrtab);
diff --git a/src/pe/symbol.rs b/src/pe/symbol.rs
index 26fa794a..a4e36514 100644
--- a/src/pe/symbol.rs
+++ b/src/pe/symbol.rs
@@ -218,11 +218,8 @@ impl Symbol {
     /// a strtab entry.
     pub fn name<'a>(&'a self, strtab: &'a strtab::Strtab) -> error::Result<&'a str> {
         if let Some(offset) = self.name_offset() {
-            strtab.get(offset as usize).unwrap_or_else(|| {
-                Err(error::Error::Malformed(format!(
-                    "Invalid Symbol name offset {:#x}",
-                    offset
-                )))
+            strtab.get_at(offset as usize).ok_or_else(|| {
+                error::Error::Malformed(format!("Invalid Symbol name offset {:#x}", offset))
             })
         } else {
             Ok(self.name.pread(0)?)
diff --git a/src/strtab.rs b/src/strtab.rs
index b2e8c308..7d77e2b0 100644
--- a/src/strtab.rs
+++ b/src/strtab.rs
@@ -3,7 +3,6 @@
 
 use core::fmt;
 use core::ops::Index;
-use core::slice;
 use core::str;
 use scroll::{ctx, Pread};
 if_alloc! {
@@ -15,8 +14,10 @@ if_alloc! {
 /// member index). Constructed using [`parse`](#method.parse)
 /// with your choice of delimiter. Please be careful.
 pub struct Strtab<'a> {
-    bytes: &'a [u8],
     delim: ctx::StrCtx,
+    bytes: &'a [u8],
+    #[cfg(feature = "alloc")]
+    strings: Vec<(usize, &'a str)>,
 }
 
 #[inline(always)]
@@ -25,31 +26,34 @@ fn get_str(offset: usize, bytes: &[u8], delim: ctx::StrCtx) -> scroll::Result<&s
 }
 
 impl<'a> Strtab<'a> {
-    /// Construct a new strtab with `bytes` as the backing string table, using `delim` as the delimiter between entries
-    pub fn new(bytes: &'a [u8], delim: u8) -> Self {
-        Strtab {
+    /// Creates a `Strtab` directly without bounds check and without parsing it.
+    ///
+    /// This is potentially unsafe and should only be used if `feature = "alloc"` is disabled.
+    pub fn from_slice_unparsed(bytes: &'a [u8], offset: usize, len: usize, delim: u8) -> Self {
+        Self {
             delim: ctx::StrCtx::Delimiter(delim),
-            bytes,
+            bytes: &bytes[offset..offset + len],
+            #[cfg(feature = "alloc")]
+            strings: Vec::new(),
         }
     }
-    /// Construct a strtab from a `ptr`, and a `size`, using `delim` as the delimiter
-    /// # Safety
+    /// Gets a str reference from the backing bytes starting at byte `offset`.
     ///
-    /// This function creates a `Strtab` directly from a raw pointer and size
-    pub unsafe fn from_raw(ptr: *const u8, size: usize, delim: u8) -> Strtab<'a> {
-        Strtab {
-            delim: ctx::StrCtx::Delimiter(delim),
-            bytes: slice::from_raw_parts(ptr, size),
+    /// If the index is out of bounds, `None` is returned. Panics if bytes are invalid UTF-8.
+    /// Use this method if the `Strtab` was created using `from_slice_unparsed()`.
+    pub fn get_unsafe(&self, offset: usize) -> Option<&'a str> {
+        if offset >= self.bytes.len() {
+            None
+        } else {
+            Some(get_str(offset, self.bytes, self.delim).unwrap())
         }
     }
     #[cfg(feature = "alloc")]
-    /// Parses a strtab from `bytes` at `offset` with `len` size as the backing string table, using `delim` as the delimiter
-    pub fn parse(
-        bytes: &'a [u8],
-        offset: usize,
-        len: usize,
-        delim: u8,
-    ) -> error::Result<Strtab<'a>> {
+    /// Parses a `Strtab` from `bytes` at `offset` with `len` size as the backing string table, using `delim` as the delimiter.
+    ///
+    /// Errors if bytes are invalid UTF-8.
+    /// Requires `feature = "alloc"`
+    pub fn parse(bytes: &'a [u8], offset: usize, len: usize, delim: u8) -> error::Result<Self> {
         let (end, overflow) = offset.overflowing_add(len);
         if overflow || end > bytes.len() {
             return Err(error::Error::Malformed(format!(
@@ -60,28 +64,74 @@ impl<'a> Strtab<'a> {
                 overflow
             )));
         }
-        Ok(Strtab {
-            bytes: &bytes[offset..end],
-            delim: ctx::StrCtx::Delimiter(delim),
-        })
+        let mut result = Self::from_slice_unparsed(bytes, offset, len, delim);
+        let mut i = 0;
+        while i < result.bytes.len() {
+            let string = get_str(i, result.bytes, result.delim)?;
+            result.strings.push((i, string));
+            i += string.len() + 1;
+        }
+        Ok(result)
+    }
+    #[cfg(feature = "alloc")]
+    /// Parses a `Strtab` with `bytes` as the backing string table, using `delim` as the delimiter between entries.
+    ///
+    /// Requires `feature = "alloc"`
+    pub fn new(bytes: &'a [u8], delim: u8) -> error::Result<Self> {
+        Self::parse(bytes, 0, bytes.len(), delim)
     }
     #[cfg(feature = "alloc")]
-    /// Converts the string table to a vector, with the original `delim` used to separate the strings
+    /// Converts the string table to a vector of parsed strings.
+    ///
+    /// Requires `feature = "alloc"`
     pub fn to_vec(&self) -> error::Result<Vec<&'a str>> {
-        let len = self.bytes.len();
-        let mut strings = Vec::with_capacity(len);
-        let mut i = 0;
-        while i < len {
-            let string = self.get(i).unwrap()?;
-            i = i + string.len() + 1;
-            strings.push(string);
+        // Fallback in case `Strtab` was created using `from_slice_unparsed()`.
+        if self.strings.is_empty() {
+            let mut result = Vec::new();
+            let mut i = 0;
+            while i < self.bytes.len() {
+                let string = get_str(i, self.bytes, self.delim)?;
+                result.push(string);
+                i += string.len() + 1;
+            }
+            return Ok(result);
         }
-        Ok(strings)
+        Ok(self.strings.iter().map(|&(_key, value)| value).collect())
     }
-    /// Safely parses and gets a str reference from the backing bytes starting at byte `offset`.
+    #[cfg(feature = "alloc")]
+    /// Safely gets a str reference from the parsed table starting at byte `offset`.
+    ///
     /// If the index is out of bounds, `None` is returned.
     /// Requires `feature = "alloc"`
+    pub fn get_at(&self, offset: usize) -> Option<&'a str> {
+        match self
+            .strings
+            .binary_search_by_key(&offset, |&(key, _value)| key)
+        {
+            Ok(index) => Some(self.strings[index].1),
+            Err(index) => {
+                if index == 0 {
+                    return None;
+                }
+                let (string_begin_offset, entire_string) = self.strings[index - 1];
+                entire_string.get(offset - string_begin_offset..)
+            }
+        }
+    }
+    #[deprecated(since = "0.4.2", note = "Use from_slice_unparsed() instead")]
+    /// Construct a strtab from a `ptr`, and a `size`, using `delim` as the delimiter
+    ///
+    /// # Safety
+    /// This function creates a `Strtab` directly from a raw pointer and size
+    pub unsafe fn from_raw(ptr: *const u8, len: usize, delim: u8) -> Strtab<'a> {
+        Self::from_slice_unparsed(core::slice::from_raw_parts(ptr, len), 0, len, delim)
+    }
+    #[deprecated(since = "0.4.2", note = "Bad performance, use get_at() instead")]
     #[cfg(feature = "alloc")]
+    /// Parses a str reference from the parsed table starting at byte `offset`.
+    ///
+    /// If the index is out of bounds, `None` is returned.
+    /// Requires `feature = "alloc"`
     pub fn get(&self, offset: usize) -> Option<error::Result<&'a str>> {
         if offset >= self.bytes.len() {
             None
@@ -89,15 +139,6 @@ impl<'a> Strtab<'a> {
             Some(get_str(offset, self.bytes, self.delim).map_err(core::convert::Into::into))
         }
     }
-    /// Gets a str reference from the backing bytes starting at byte `offset`.
-    /// If the index is out of bounds, `None` is returned. Panics if bytes are invalid UTF-8.
-    pub fn get_unsafe(&self, offset: usize) -> Option<&'a str> {
-        if offset >= self.bytes.len() {
-            None
-        } else {
-            Some(get_str(offset, self.bytes, self.delim).unwrap())
-        }
-    }
 }
 
 impl<'a> fmt::Debug for Strtab<'a> {
@@ -110,10 +151,12 @@ impl<'a> fmt::Debug for Strtab<'a> {
 }
 
 impl<'a> Default for Strtab<'a> {
-    fn default() -> Strtab<'a> {
-        Strtab {
-            bytes: &[],
+    fn default() -> Self {
+        Self {
             delim: ctx::StrCtx::default(),
+            bytes: &[],
+            #[cfg(feature = "alloc")]
+            strings: Vec::new(),
         }
     }
 }
@@ -133,8 +176,7 @@ impl<'a> Index<usize> for Strtab<'a> {
 
 #[test]
 fn as_vec_no_final_null() {
-    let bytes = b"\0printf\0memmove\0busta";
-    let strtab = unsafe { Strtab::from_raw(bytes.as_ptr(), bytes.len(), 0x0) };
+    let strtab = Strtab::new(b"\0printf\0memmove\0busta", 0x0).unwrap();
     let vec = strtab.to_vec().unwrap();
     assert_eq!(vec.len(), 4);
     assert_eq!(vec, vec!["", "printf", "memmove", "busta"]);
@@ -142,8 +184,7 @@ fn as_vec_no_final_null() {
 
 #[test]
 fn as_vec_no_first_null_no_final_null() {
-    let bytes = b"printf\0memmove\0busta";
-    let strtab = unsafe { Strtab::from_raw(bytes.as_ptr(), bytes.len(), 0x0) };
+    let strtab = Strtab::new(b"printf\0memmove\0busta", 0x0).unwrap();
     let vec = strtab.to_vec().unwrap();
     assert_eq!(vec.len(), 3);
     assert_eq!(vec, vec!["printf", "memmove", "busta"]);
@@ -151,8 +192,7 @@ fn as_vec_no_first_null_no_final_null() {
 
 #[test]
 fn to_vec_final_null() {
-    let bytes = b"\0printf\0memmove\0busta\0";
-    let strtab = unsafe { Strtab::from_raw(bytes.as_ptr(), bytes.len(), 0x0) };
+    let strtab = Strtab::new(b"\0printf\0memmove\0busta\0", 0x0).unwrap();
     let vec = strtab.to_vec().unwrap();
     assert_eq!(vec.len(), 4);
     assert_eq!(vec, vec!["", "printf", "memmove", "busta"]);
@@ -160,9 +200,37 @@ fn to_vec_final_null() {
 
 #[test]
 fn to_vec_newline_delim() {
-    let bytes = b"\nprintf\nmemmove\nbusta\n";
-    let strtab = unsafe { Strtab::from_raw(bytes.as_ptr(), bytes.len(), b'\n') };
+    let strtab = Strtab::new(b"\nprintf\nmemmove\nbusta\n", b'\n').unwrap();
     let vec = strtab.to_vec().unwrap();
     assert_eq!(vec.len(), 4);
     assert_eq!(vec, vec!["", "printf", "memmove", "busta"]);
 }
+
+#[test]
+fn parse_utf8() {
+    assert!(match Strtab::new(&[0x80, 0x80], b'\n') {
+        Err(error::Error::Scroll(scroll::Error::BadInput {
+            size: 2,
+            msg: "invalid utf8",
+        })) => true,
+        _ => false,
+    });
+    assert!(match Strtab::new(&[0xC6, 0x92, 0x6F, 0x6F], b'\n') {
+        Ok(_) => true,
+        _ => false,
+    });
+}
+
+#[test]
+fn get_at_utf8() {
+    let strtab = Strtab::new("\nƒoo\nmemmove\n🅱️usta\n".as_bytes(), b'\n').unwrap();
+    assert_eq!(strtab.get_at(0), Some(""));
+    assert_eq!(strtab.get_at(5), Some(""));
+    assert_eq!(strtab.get_at(6), Some("memmove"));
+    assert_eq!(strtab.get_at(14), Some("\u{1f171}\u{fe0f}usta"));
+    assert_eq!(strtab.get_at(16), None);
+    assert_eq!(strtab.get_at(18), Some("\u{fe0f}usta"));
+    assert_eq!(strtab.get_at(21), Some("usta"));
+    assert_eq!(strtab.get_at(25), Some(""));
+    assert_eq!(strtab.get_at(26), None);
+}
diff --git a/tests/elf.rs b/tests/elf.rs
index f943efa7..3bd96f45 100644
--- a/tests/elf.rs
+++ b/tests/elf.rs
@@ -60,7 +60,7 @@ fn parse_text_section_size_lazy(base: &[u8]) -> Result<u64, &'static str> {
     )
     .map_err(|_| "parse string table error")?;
     for (_, section) in obj.section_headers.iter().enumerate() {
-        let section_name = strtab.get(section.sh_name).unwrap().unwrap();
+        let section_name = strtab.get_at(section.sh_name).unwrap();
         if section_name == ".text" {
             return Ok(section.sh_size);
         }