diff --git a/helix-stdx/src/lib.rs b/helix-stdx/src/lib.rs index d09df587a..93cc8304a 100644 --- a/helix-stdx/src/lib.rs +++ b/helix-stdx/src/lib.rs @@ -3,5 +3,6 @@ pub mod faccess; pub mod path; pub mod range; pub mod rope; +pub mod str; pub use range::Range; diff --git a/helix-stdx/src/str.rs b/helix-stdx/src/str.rs new file mode 100644 index 000000000..a9203f3b8 --- /dev/null +++ b/helix-stdx/src/str.rs @@ -0,0 +1,217 @@ +use std::{ + alloc, + borrow::Borrow, + fmt, hash, + mem::{size_of, ManuallyDrop}, + ptr::{self, NonNull}, + slice, str, +}; + +/// A very very small owned string type. +/// +/// This type is like a `Box<str>` but can only fit strings with a byte length smaller than 256. +/// On 64-bit machines this type stores up to 15 bytes inline (7 bytes on 32-bit machines). One +/// byte is used to store the length. For strings short enough to be stored inline, the remaining +/// 15 (or 7) bytes store the content inline. Otherwise the second `usize` of memory is a thin +/// pointer to the string content. +/// +/// Unlike `Box<str>` this type is not null-pointer optimized.
// Layout: `#[repr(C)]` keeps the fields in declaration order. `len` and `prefix` together fill
// exactly one `usize`, so `trailing` starts at a `usize`-aligned offset with no padding in
// between. That makes `prefix` and `trailing.suffix` one contiguous inline byte buffer.
#[repr(C)]
pub struct TinyBoxedStr {
    /// Byte length of the string; also discriminates which `trailing` variant is active.
    len: u8,
    /// The first `PREFIX_LEN` bytes of the string, zero padded.
    prefix: [u8; Self::PREFIX_LEN],
    /// `suffix` when `len <= INLINE_LEN`, otherwise `ptr`.
    trailing: TinyStrTrailing,
}

/// Second machine word of a [`TinyBoxedStr`].
#[repr(C)]
union TinyStrTrailing {
    /// Inline bytes following `prefix` (active when the string fits inline).
    suffix: [u8; TinyBoxedStr::SUFFIX_LEN],
    /// Thin pointer to a heap copy of the whole string (active when it does not fit inline).
    ptr: ManuallyDrop<NonNull<u8>>,
}

impl TinyBoxedStr {
    /// Bytes available in the first word after the one-byte length.
    const PREFIX_LEN: usize = size_of::<usize>() - size_of::<u8>();
    /// Bytes available in the second word.
    const SUFFIX_LEN: usize = size_of::<usize>();
    /// Longest string (in bytes) that is stored without a heap allocation.
    const INLINE_LEN: u8 = (Self::PREFIX_LEN + Self::SUFFIX_LEN) as u8;

    /// Longest string (in bytes) this type can hold.
    pub const MAX_LEN: usize = u8::MAX as usize;

    /// Returns the length of the string in bytes.
    #[inline]
    pub fn len(&self) -> usize {
        self.len as usize
    }

    /// Returns `true` if the string has a length of zero.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Returns the string content as a byte slice.
    pub fn as_bytes(&self) -> &[u8] {
        let data: *const u8 = if self.len <= Self::INLINE_LEN {
            // Derive the pointer from `self` (not from `&self.prefix`) so that it may be used
            // to read past `prefix` into `trailing.suffix`.
            let this = ptr::from_ref(self);
            // SAFETY: `this` was just created from a valid reference.
            unsafe { ptr::addr_of!((*this).prefix) }.cast::<u8>()
        } else {
            // SAFETY: `len > INLINE_LEN`, so `trailing` was initialized through `ptr`.
            unsafe { self.trailing.ptr }.as_ptr().cast_const()
        };
        // SAFETY: inline, `prefix` and `suffix` are contiguous and hold at least `len`
        // initialized bytes; on the heap, the allocation is exactly `len` bytes long.
        unsafe { slice::from_raw_parts(data, self.len()) }
    }

    /// Returns the string content as a `&str`.
    #[inline]
    pub fn as_str(&self) -> &str {
        // SAFETY: the bytes were copied verbatim from a `&str` and are never mutated.
        unsafe { str::from_utf8_unchecked(self.as_bytes()) }
    }

    /// Layout of a heap allocation holding `len` bytes.
    fn layout(len: usize) -> alloc::Layout {
        alloc::Layout::array::<u8>(len)
            .expect("a valid layout for an array")
            .pad_to_align()
    }

    /// Copies `source` into a fresh heap allocation and returns a pointer to it.
    ///
    /// Only called for strings longer than `INLINE_LEN`, so the layout is never zero-sized.
    fn copy_bytes(source: &[u8]) -> NonNull<u8> {
        debug_assert!(source.len() > Self::INLINE_LEN as usize);
        let layout = Self::layout(source.len());
        // SAFETY: `layout` has a non-zero size (see above).
        let raw = unsafe { alloc::alloc(layout) };
        let Some(dest) = NonNull::new(raw) else {
            alloc::handle_alloc_error(layout)
        };
        // SAFETY: `dest` is a fresh allocation of `source.len()` bytes, so it is valid for
        // writes of that size and cannot overlap `source`.
        unsafe {
            ptr::copy_nonoverlapping(source.as_ptr(), dest.as_ptr(), source.len());
        }
        dest
    }
}

/// Error returned when a string is longer than [`TinyBoxedStr::MAX_LEN`] bytes.
#[derive(Debug)]
pub struct TooLongError;

impl fmt::Display for TooLongError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `MAX_LEN` is `u8::MAX`, i.e. 255: a 256-byte string is already rejected.
        f.write_str("string was too long to be stored as a `TinyBoxedStr` (max 255 bytes)")
    }
}

impl std::error::Error for TooLongError {}

impl TryFrom<&str> for TinyBoxedStr {
    type Error = TooLongError;

    /// Copies `s` into a `TinyBoxedStr`, inline when it fits and on the heap otherwise.
    ///
    /// # Errors
    ///
    /// Returns [`TooLongError`] when `s` is longer than [`TinyBoxedStr::MAX_LEN`] bytes.
    fn try_from(s: &str) -> Result<Self, Self::Error> {
        let bytes = s.as_bytes();
        // The conversion fails exactly when the byte length exceeds `MAX_LEN` (`u8::MAX`).
        let len = u8::try_from(bytes.len()).map_err(|_| TooLongError)?;

        // The prefix is filled in both representations.
        let head = bytes.len().min(Self::PREFIX_LEN);
        let mut prefix = [0; Self::PREFIX_LEN];
        prefix[..head].copy_from_slice(&bytes[..head]);

        let trailing = if len <= Self::INLINE_LEN {
            let tail = &bytes[head..];
            let mut suffix = [0; Self::SUFFIX_LEN];
            suffix[..tail.len()].copy_from_slice(tail);
            TinyStrTrailing { suffix }
        } else {
            TinyStrTrailing {
                ptr: ManuallyDrop::new(Self::copy_bytes(bytes)),
            }
        };

        Ok(Self {
            len,
            prefix,
            trailing,
        })
    }
}

impl Drop for TinyBoxedStr {
    fn drop(&mut self) {
        // Inline strings own no heap memory.
        if self.len <= Self::INLINE_LEN {
            return;
        }
        // SAFETY: `len > INLINE_LEN`, so `trailing` was initialized through `ptr`.
        let heap = unsafe { self.trailing.ptr }.as_ptr();
        // SAFETY: `heap` came from `copy_bytes`, which allocated it with this same layout.
        unsafe { alloc::dealloc(heap, Self::layout(self.len())) }
    }
}

impl Clone for TinyBoxedStr {
    fn clone(&self) -> Self {
        let trailing = if self.len <= Self::INLINE_LEN {
            // SAFETY: `len <= INLINE_LEN`, so `trailing` was initialized through `suffix`.
            let suffix = unsafe { self.trailing.suffix };
            TinyStrTrailing { suffix }
        } else {
            // Heap strings get their own allocation so each value frees only its own memory.
            TinyStrTrailing {
                ptr: ManuallyDrop::new(Self::copy_bytes(self.as_bytes())),
            }
        };

        Self {
            len: self.len,
            prefix: self.prefix,
            trailing,
        }
    }
}

impl Default for TinyBoxedStr {
    /// Returns the empty string (stored inline).
    fn default() -> Self {
        Self {
            len: 0,
            prefix: [0; Self::PREFIX_LEN],
            trailing: TinyStrTrailing {
                suffix: [0; Self::SUFFIX_LEN],
            },
        }
    }
}

impl AsRef<str> for TinyBoxedStr {
    fn as_ref(&self) -> &str {
        self.as_str()
    }
}

impl Borrow<str> for TinyBoxedStr {
    fn borrow(&self) -> &str {
        self.as_str()
    }
}

// NOTE: this could be specialized to optimize the number of comparison operations. We could cast
// the first `usize` of memory together to do a single comparison (and same for the suffixes).
// This optimization would only matter if we compared these strings very frequently however.
+impl PartialEq for TinyBoxedStr { + fn eq(&self, other: &Self) -> bool { + self.as_str() == other.as_str() + } +} + +impl Eq for TinyBoxedStr {} + +impl PartialEq for TinyBoxedStr { + fn eq(&self, other: &str) -> bool { + self.as_str() == other + } +} + +impl hash::Hash for TinyBoxedStr { + fn hash(&self, state: &mut H) { + self.as_str().hash(state) + } +} + +impl fmt::Debug for TinyBoxedStr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.as_str().fmt(f) + } +} + +impl fmt::Display for TinyBoxedStr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.as_str().fmt(f) + } +} + +unsafe impl Send for TinyBoxedStr {} +unsafe impl Sync for TinyBoxedStr {} diff --git a/helix-view/src/handlers/word_index.rs b/helix-view/src/handlers/word_index.rs index f84e77e28..39bb387fc 100644 --- a/helix-view/src/handlers/word_index.rs +++ b/helix-view/src/handlers/word_index.rs @@ -123,8 +123,7 @@ const MIN_WORD_GRAPHEMES: usize = 3; /// Maximum word length allowed (in chars) const MAX_WORD_LEN: usize = 50; -// TODO: choose or create a suitable small string type. -type Word = String; +type Word = helix_stdx::str::TinyBoxedStr; #[derive(Debug, Default)] struct WordIndexInner { @@ -142,11 +141,16 @@ impl WordIndexInner { } fn insert(&mut self, word: RopeSlice) { + assert!(word.len_chars() <= MAX_WORD_LEN); + // The word must be shorter than `TinyBoxedStr::MAX_LEN` bytes because it is at most 50 + // characters and each character takes at most four bytes (so at most 200 bytes).
+ assert!(word.len_bytes() < Word::MAX_LEN); + let word: Cow = word.into(); if let Some(rc) = self.words.get_mut(word.as_ref()) { *rc = rc.saturating_add(1); } else { - self.words.insert(word.into_owned(), 1); + self.words.insert(word.as_ref().try_into().unwrap(), 1); } } @@ -172,7 +176,10 @@ impl WordIndex { let inner = self.inner.read(); let mut matches = fuzzy_match(pattern, inner.words(), false); matches.sort_unstable_by_key(|(_, score)| *score); - matches.into_iter().map(|(word, _)| word.clone()).collect() + matches + .into_iter() + .map(|(word, _)| word.to_string()) + .collect() } fn add_document(&self, text: &Rope) { @@ -409,7 +416,7 @@ mod tests { impl WordIndex { fn words(&self) -> HashSet { let inner = self.inner.read(); - inner.words().cloned().collect() + inner.words().map(|w| w.to_string()).collect() } }