mirror of
https://github.com/helix-editor/helix.git
synced 2025-04-02 02:17:44 +03:00
Add a very small owned string type for the word index
`TinyBoxedStr` is a small-string optimized replacement for `Box<str>` styled after <https://cedardb.com/blog/german_strings/>. A nearly identical type is used in helix-editor/spellbook for space savings.

This type is specialized for its use-case:

* strings are immutable after creation
* strings are very very small (less than 256 bytes long)

Because of these attributes we can use nearly the full size of the type for the inline representation and also keep a small total size. Many other small string crates in the wild are 3 `usize`s long (same as a regular `String`) to support mutability. This type is more like a `Box<str>` (2 `usize`s long).

Mostly I like this small string type though because it's very straightforward to implement. Other than a few functions that reach into `std::alloc` or `std::ptr`, the code is short and boring.
This commit is contained in:
parent
fafef15758
commit
b7186fef7e
3 changed files with 230 additions and 5 deletions
|
@ -3,5 +3,6 @@ pub mod faccess;
|
|||
pub mod path;
|
||||
pub mod range;
|
||||
pub mod rope;
|
||||
pub mod str;
|
||||
|
||||
pub use range::Range;
|
||||
|
|
217
helix-stdx/src/str.rs
Normal file
217
helix-stdx/src/str.rs
Normal file
|
@ -0,0 +1,217 @@
|
|||
use std::{
|
||||
alloc,
|
||||
borrow::Borrow,
|
||||
fmt, hash,
|
||||
mem::{size_of, ManuallyDrop},
|
||||
ptr::{self, NonNull},
|
||||
slice, str,
|
||||
};
|
||||
|
||||
/// A very very small owned string type.
|
||||
///
|
||||
/// This type is like a `Box<str>` but can only fit strings with a byte length smaller than 256.
|
||||
/// On 64-bit machines this type stores up to 15 bytes inline (7 bytes on 32-bit machines). One
|
||||
/// byte is used to store the length. For strings short enough to be stored inline, the remaining
|
||||
/// 15 (or 7) bytes store the content inline. Otherwise the second `usize` of memory is a thin
|
||||
/// pointer to the string content.
|
||||
///
|
||||
/// Unlike `Box<str>` this type is not null-pointer optimized.
|
||||
#[repr(C)]
|
||||
pub struct TinyBoxedStr {
|
||||
len: u8,
|
||||
prefix: [u8; Self::PREFIX_LEN],
|
||||
trailing: TinyStrTrailing,
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
union TinyStrTrailing {
|
||||
suffix: [u8; TinyBoxedStr::SUFFIX_LEN],
|
||||
ptr: ManuallyDrop<NonNull<u8>>,
|
||||
}
|
||||
|
||||
impl TinyBoxedStr {
|
||||
const PREFIX_LEN: usize = size_of::<usize>() - size_of::<u8>();
|
||||
const SUFFIX_LEN: usize = size_of::<usize>();
|
||||
const INLINE_LEN: u8 = (Self::PREFIX_LEN + Self::SUFFIX_LEN) as u8;
|
||||
|
||||
pub const MAX_LEN: usize = u8::MAX as usize;
|
||||
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
self.len as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len == 0
|
||||
}
|
||||
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
let ptr = if self.len <= Self::INLINE_LEN {
|
||||
let ptr = ptr::from_ref(self);
|
||||
unsafe { ptr::addr_of!((*ptr).prefix) }.cast()
|
||||
} else {
|
||||
unsafe { self.trailing.ptr }.as_ptr()
|
||||
};
|
||||
unsafe { slice::from_raw_parts(ptr, self.len()) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn as_str(&self) -> &str {
|
||||
unsafe { str::from_utf8_unchecked(self.as_bytes()) }
|
||||
}
|
||||
|
||||
fn layout(len: usize) -> alloc::Layout {
|
||||
alloc::Layout::array::<u8>(len)
|
||||
.expect("a valid layout for an array")
|
||||
.pad_to_align()
|
||||
}
|
||||
|
||||
fn copy_bytes(source: &[u8]) -> NonNull<u8> {
|
||||
let layout = Self::layout(source.len());
|
||||
let nullable = unsafe { alloc::alloc(layout) };
|
||||
let ptr = match NonNull::new(nullable) {
|
||||
Some(ptr) => ptr.cast(),
|
||||
None => alloc::handle_alloc_error(layout),
|
||||
};
|
||||
unsafe {
|
||||
ptr::copy_nonoverlapping(source.as_ptr(), ptr.as_ptr(), source.len());
|
||||
}
|
||||
ptr
|
||||
}
|
||||
}
|
||||
|
||||
/// Error returned when a string is too long to be stored as a `TinyBoxedStr`.
///
/// The length is stored in a single `u8`, so the longest representable string is
/// `u8::MAX` (255) bytes.
#[derive(Debug)]
pub struct TooLongError;

impl fmt::Display for TooLongError {
    /// Writes a fixed, human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The limit is `u8::MAX` bytes: a 255-byte string is accepted and a 256-byte string is
        // the first one rejected, so the message must say 255.
        f.write_str("string was too long to be stored as a `TinyBoxedStr` (max 255 bytes)")
    }
}

impl std::error::Error for TooLongError {}
|
||||
|
||||
impl TryFrom<&str> for TinyBoxedStr {
|
||||
type Error = TooLongError;
|
||||
|
||||
fn try_from(s: &str) -> Result<Self, Self::Error> {
|
||||
if s.len() > Self::MAX_LEN {
|
||||
return Err(TooLongError);
|
||||
}
|
||||
|
||||
let len = s.len() as u8;
|
||||
let bytes = s.as_bytes();
|
||||
let mut prefix = [0; Self::PREFIX_LEN];
|
||||
let trailing = if len <= Self::INLINE_LEN {
|
||||
let mut suffix = [0; Self::SUFFIX_LEN];
|
||||
if s.len() <= Self::PREFIX_LEN {
|
||||
prefix[..s.len()].copy_from_slice(bytes);
|
||||
} else {
|
||||
prefix.copy_from_slice(&bytes[..Self::PREFIX_LEN]);
|
||||
suffix[..s.len() - Self::PREFIX_LEN].copy_from_slice(&bytes[Self::PREFIX_LEN..]);
|
||||
}
|
||||
TinyStrTrailing { suffix }
|
||||
} else {
|
||||
prefix.copy_from_slice(&bytes[..Self::PREFIX_LEN]);
|
||||
let ptr = ManuallyDrop::new(Self::copy_bytes(bytes));
|
||||
TinyStrTrailing { ptr }
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
len,
|
||||
prefix,
|
||||
trailing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TinyBoxedStr {
|
||||
fn drop(&mut self) {
|
||||
if self.len > Self::INLINE_LEN {
|
||||
let ptr = unsafe { self.trailing.ptr }.as_ptr();
|
||||
let layout = Self::layout(self.len());
|
||||
unsafe { alloc::dealloc(ptr.cast(), layout) }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for TinyBoxedStr {
|
||||
fn clone(&self) -> Self {
|
||||
let trailing = if self.len <= Self::INLINE_LEN {
|
||||
let suffix = unsafe { self.trailing.suffix };
|
||||
TinyStrTrailing { suffix }
|
||||
} else {
|
||||
let ptr = ManuallyDrop::new(Self::copy_bytes(self.as_bytes()));
|
||||
TinyStrTrailing { ptr }
|
||||
};
|
||||
|
||||
Self {
|
||||
len: self.len,
|
||||
prefix: self.prefix,
|
||||
trailing,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TinyBoxedStr {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
len: 0,
|
||||
prefix: [0; Self::PREFIX_LEN],
|
||||
trailing: TinyStrTrailing {
|
||||
suffix: [0; Self::SUFFIX_LEN],
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<str> for TinyBoxedStr {
|
||||
fn as_ref(&self) -> &str {
|
||||
self.as_str()
|
||||
}
|
||||
}
|
||||
|
||||
impl Borrow<str> for TinyBoxedStr {
|
||||
fn borrow(&self) -> &str {
|
||||
self.as_str()
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: this could be specialized to optimize the number of comparison operations. We could cast
|
||||
// the first `usize` of memory together to do a single comparison (and same for the suffixes).
|
||||
// This optimization would only matter if we compared these strings very frequently however.
|
||||
impl PartialEq for TinyBoxedStr {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.as_str() == other.as_str()
|
||||
}
|
||||
}
|
||||
|
||||
// Equality compares the `as_str()` contents (see `PartialEq` above), so it is a full
// equivalence relation.
impl Eq for TinyBoxedStr {}
|
||||
|
||||
impl PartialEq<str> for TinyBoxedStr {
|
||||
fn eq(&self, other: &str) -> bool {
|
||||
self.as_str() == other
|
||||
}
|
||||
}
|
||||
|
||||
impl hash::Hash for TinyBoxedStr {
|
||||
fn hash<H: hash::Hasher>(&self, state: &mut H) {
|
||||
self.as_str().hash(state)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for TinyBoxedStr {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
self.as_str().fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for TinyBoxedStr {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
self.as_str().fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
// SAFETY: the only field that blocks the auto impl is the `NonNull<u8>` heap pointer. That
// allocation is created privately by `copy_bytes` for each value (including clones) and freed
// only in `Drop`, so moving the value to another thread moves sole ownership with it.
unsafe impl Send for TinyBoxedStr {}
|
||||
// SAFETY: the methods in this file that take `&self` only read the content; there is no interior
// mutability, so shared references can be used from several threads at once.
unsafe impl Sync for TinyBoxedStr {}
|
|
@ -123,8 +123,7 @@ const MIN_WORD_GRAPHEMES: usize = 3;
|
|||
/// Maximum word length allowed (in chars)
|
||||
const MAX_WORD_LEN: usize = 50;
|
||||
|
||||
// TODO: choose or create a suitable small string type.
|
||||
type Word = String;
|
||||
type Word = helix_stdx::str::TinyBoxedStr;
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct WordIndexInner {
|
||||
|
@ -142,11 +141,16 @@ impl WordIndexInner {
|
|||
}
|
||||
|
||||
fn insert(&mut self, word: RopeSlice) {
|
||||
assert!(word.len_chars() <= MAX_WORD_LEN);
|
||||
// The word must be shorter than `TinyBoxedStr::MAX_LEN` because it has at most 50
|
||||
// characters and characters take at most four bytes.
|
||||
assert!(word.len_bytes() < Word::MAX_LEN);
|
||||
|
||||
let word: Cow<str> = word.into();
|
||||
if let Some(rc) = self.words.get_mut(word.as_ref()) {
|
||||
*rc = rc.saturating_add(1);
|
||||
} else {
|
||||
self.words.insert(word.into_owned(), 1);
|
||||
self.words.insert(word.as_ref().try_into().unwrap(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -172,7 +176,10 @@ impl WordIndex {
|
|||
let inner = self.inner.read();
|
||||
let mut matches = fuzzy_match(pattern, inner.words(), false);
|
||||
matches.sort_unstable_by_key(|(_, score)| *score);
|
||||
matches.into_iter().map(|(word, _)| word.clone()).collect()
|
||||
matches
|
||||
.into_iter()
|
||||
.map(|(word, _)| word.to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn add_document(&self, text: &Rope) {
|
||||
|
@ -409,7 +416,7 @@ mod tests {
|
|||
impl WordIndex {
|
||||
fn words(&self) -> HashSet<String> {
|
||||
let inner = self.inner.read();
|
||||
inner.words().cloned().collect()
|
||||
inner.words().map(|w| w.to_string()).collect()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue