From 20151a55946d21e69e06742216dffd6912b17ecf Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Sun, 26 Jan 2025 20:58:27 -0500 Subject: [PATCH] Move rope grapheme iterators from core to stdx --- helix-core/src/doc_formatter.rs | 6 +- helix-core/src/graphemes.rs | 158 +--------------------------- helix-core/src/indent.rs | 4 +- helix-core/src/lib.rs | 1 - helix-core/src/position.rs | 12 ++- helix-core/src/selection.rs | 4 +- helix-stdx/src/rope.rs | 179 ++++++++++++++++++++++++++++++++ helix-term/src/commands.rs | 24 +++-- 8 files changed, 210 insertions(+), 178 deletions(-) diff --git a/helix-core/src/doc_formatter.rs b/helix-core/src/doc_formatter.rs index cbc884d94..d74709420 100644 --- a/helix-core/src/doc_formatter.rs +++ b/helix-core/src/doc_formatter.rs @@ -19,10 +19,12 @@ mod test; use unicode_segmentation::{Graphemes, UnicodeSegmentation}; +use helix_stdx::rope::{RopeGraphemes, RopeSliceExt}; + use crate::graphemes::{Grapheme, GraphemeStr}; use crate::syntax::Highlight; use crate::text_annotations::TextAnnotations; -use crate::{Position, RopeGraphemes, RopeSlice}; +use crate::{Position, RopeSlice}; /// TODO make Highlight a u32 to reduce the size of this enum to a single word. #[derive(Debug, Clone, Copy)] @@ -219,7 +221,7 @@ impl<'t> DocumentFormatter<'t> { text_fmt, annotations, visual_pos: Position { row: 0, col: 0 }, - graphemes: RopeGraphemes::new(text.slice(block_char_idx..)), + graphemes: text.slice(block_char_idx..).graphemes(), char_pos: block_char_idx, exhausted: false, indent_level: None, diff --git a/helix-core/src/graphemes.rs b/helix-core/src/graphemes.rs index 98dfa365f..e6adeee95 100644 --- a/helix-core/src/graphemes.rs +++ b/helix-core/src/graphemes.rs @@ -1,7 +1,7 @@ //! Utility functions to traverse the unicode graphemes of a `Rope`'s text contents. //! //! 
Based on -use ropey::{iter::Chunks, str_utils::byte_to_char_idx, RopeSlice}; +use ropey::{str_utils::byte_to_char_idx, RopeSlice}; use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; use unicode_width::UnicodeWidthStr; @@ -270,162 +270,6 @@ pub fn is_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> bool { } } -/// An iterator over the graphemes of a `RopeSlice`. -#[derive(Clone)] -pub struct RopeGraphemes<'a> { - text: RopeSlice<'a>, - chunks: Chunks<'a>, - cur_chunk: &'a str, - cur_chunk_start: usize, - cursor: GraphemeCursor, -} - -impl fmt::Debug for RopeGraphemes<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("RopeGraphemes") - .field("text", &self.text) - .field("chunks", &self.chunks) - .field("cur_chunk", &self.cur_chunk) - .field("cur_chunk_start", &self.cur_chunk_start) - // .field("cursor", &self.cursor) - .finish() - } -} - -impl RopeGraphemes<'_> { - #[must_use] - pub fn new(slice: RopeSlice) -> RopeGraphemes { - let mut chunks = slice.chunks(); - let first_chunk = chunks.next().unwrap_or(""); - RopeGraphemes { - text: slice, - chunks, - cur_chunk: first_chunk, - cur_chunk_start: 0, - cursor: GraphemeCursor::new(0, slice.len_bytes(), true), - } - } -} - -impl<'a> Iterator for RopeGraphemes<'a> { - type Item = RopeSlice<'a>; - - fn next(&mut self) -> Option<RopeSlice<'a>> { - let a = self.cursor.cur_cursor(); - let b; - loop { - match self - .cursor - .next_boundary(self.cur_chunk, self.cur_chunk_start) - { - Ok(None) => { - return None; - } - Ok(Some(n)) => { - b = n; - break; - } - Err(GraphemeIncomplete::NextChunk) => { - self.cur_chunk_start += self.cur_chunk.len(); - self.cur_chunk = self.chunks.next().unwrap_or(""); - } - Err(GraphemeIncomplete::PreContext(idx)) => { - let (chunk, byte_idx, _, _) = self.text.chunk_at_byte(idx.saturating_sub(1)); - self.cursor.provide_context(chunk, byte_idx); - } - _ => unreachable!(), - } - } - - if a < self.cur_chunk_start { - Some(self.text.byte_slice(a..b)) - } else { - 
let a2 = a - self.cur_chunk_start; - let b2 = b - self.cur_chunk_start; - Some((&self.cur_chunk[a2..b2]).into()) - } - } -} - -/// An iterator over the graphemes of a `RopeSlice` in reverse. -#[derive(Clone)] -pub struct RevRopeGraphemes<'a> { - text: RopeSlice<'a>, - chunks: Chunks<'a>, - cur_chunk: &'a str, - cur_chunk_start: usize, - cursor: GraphemeCursor, -} - -impl fmt::Debug for RevRopeGraphemes<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("RevRopeGraphemes") - .field("text", &self.text) - .field("chunks", &self.chunks) - .field("cur_chunk", &self.cur_chunk) - .field("cur_chunk_start", &self.cur_chunk_start) - // .field("cursor", &self.cursor) - .finish() - } -} - -impl RevRopeGraphemes<'_> { - #[must_use] - pub fn new(slice: RopeSlice) -> RevRopeGraphemes { - let (mut chunks, mut cur_chunk_start, _, _) = slice.chunks_at_byte(slice.len_bytes()); - chunks.reverse(); - let first_chunk = chunks.next().unwrap_or(""); - cur_chunk_start -= first_chunk.len(); - RevRopeGraphemes { - text: slice, - chunks, - cur_chunk: first_chunk, - cur_chunk_start, - cursor: GraphemeCursor::new(slice.len_bytes(), slice.len_bytes(), true), - } - } -} - -impl<'a> Iterator for RevRopeGraphemes<'a> { - type Item = RopeSlice<'a>; - - fn next(&mut self) -> Option<RopeSlice<'a>> { - let a = self.cursor.cur_cursor(); - let b; - loop { - match self - .cursor - .prev_boundary(self.cur_chunk, self.cur_chunk_start) - { - Ok(None) => { - return None; - } - Ok(Some(n)) => { - b = n; - break; - } - Err(GraphemeIncomplete::PrevChunk) => { - self.cur_chunk = self.chunks.next().unwrap_or(""); - self.cur_chunk_start -= self.cur_chunk.len(); - } - Err(GraphemeIncomplete::PreContext(idx)) => { - let (chunk, byte_idx, _, _) = self.text.chunk_at_byte(idx.saturating_sub(1)); - self.cursor.provide_context(chunk, byte_idx); - } - _ => unreachable!(), - } - } - - if a >= self.cur_chunk_start + self.cur_chunk.len() { - Some(self.text.byte_slice(b..a)) - } else { - let a2 = a - 
self.cur_chunk_start; - let b2 = b - self.cur_chunk_start; - Some((&self.cur_chunk[b2..a2]).into()) - } - } -} - /// A highly compressed Cow<'a, str> that holds /// atmost u31::MAX bytes and is readonly pub struct GraphemeStr<'a> { diff --git a/helix-core/src/indent.rs b/helix-core/src/indent.rs index 93eb6ce28..04ce9a28d 100644 --- a/helix-core/src/indent.rs +++ b/helix-core/src/indent.rs @@ -8,7 +8,7 @@ use crate::{ graphemes::{grapheme_width, tab_width_at}, syntax::{IndentationHeuristic, LanguageConfiguration, RopeProvider, Syntax}, tree_sitter::Node, - Position, Rope, RopeGraphemes, RopeSlice, Tendril, + Position, Rope, RopeSlice, Tendril, }; /// Enum representing indentation style. @@ -200,7 +200,7 @@ pub fn indent_level_for_line(line: RopeSlice, tab_width: usize, indent_width: us /// Create a string of tabs & spaces that has the same visual width as the given RopeSlice (independent of the tab width). fn whitespace_with_same_width(text: RopeSlice) -> String { let mut s = String::new(); - for grapheme in RopeGraphemes::new(text) { + for grapheme in text.graphemes() { if grapheme == "\t" { s.push('\t'); } else { diff --git a/helix-core/src/lib.rs b/helix-core/src/lib.rs index 2bf75f690..89c960ed5 100644 --- a/helix-core/src/lib.rs +++ b/helix-core/src/lib.rs @@ -54,7 +54,6 @@ pub type Tendril = SmartString; #[doc(inline)] pub use {regex, tree_sitter}; -pub use graphemes::RopeGraphemes; pub use position::{ char_idx_at_visual_offset, coords_at_pos, pos_at_coords, softwrapped_dimensions, visual_offset_from_anchor, visual_offset_from_block, Position, VisualOffsetError, diff --git a/helix-core/src/position.rs b/helix-core/src/position.rs index 1b3789110..cea0b6071 100644 --- a/helix-core/src/position.rs +++ b/helix-core/src/position.rs @@ -4,10 +4,12 @@ use std::{ ops::{Add, AddAssign, Sub, SubAssign}, }; +use helix_stdx::rope::RopeSliceExt; + use crate::{ chars::char_is_line_ending, doc_formatter::{DocumentFormatter, TextFormat}, - 
graphemes::{ensure_grapheme_boundary_prev, grapheme_width, RopeGraphemes}, + graphemes::{ensure_grapheme_boundary_prev, grapheme_width}, line_ending::line_end_char_index, text_annotations::TextAnnotations, RopeSlice, @@ -101,7 +103,7 @@ pub fn coords_at_pos(text: RopeSlice, pos: usize) -> Position { let line_start = text.line_to_char(line); let pos = ensure_grapheme_boundary_prev(text, pos); - let col = RopeGraphemes::new(text.slice(line_start..pos)).count(); + let col = text.slice(line_start..pos).graphemes().count(); Position::new(line, col) } @@ -126,7 +128,7 @@ pub fn visual_coords_at_pos(text: RopeSlice, pos: usize, tab_width: usize) -> Po let mut col = 0; - for grapheme in RopeGraphemes::new(text.slice(line_start..pos)) { + for grapheme in text.slice(line_start..pos).graphemes() { if grapheme == "\t" { col += tab_width - (col % tab_width); } else { @@ -275,7 +277,7 @@ pub fn pos_at_coords(text: RopeSlice, coords: Position, limit_before_line_ending }; let mut col_char_offset = 0; - for (i, g) in RopeGraphemes::new(text.slice(line_start..line_end)).enumerate() { + for (i, g) in text.slice(line_start..line_end).graphemes().enumerate() { if i == col { break; } @@ -306,7 +308,7 @@ pub fn pos_at_visual_coords(text: RopeSlice, coords: Position, tab_width: usize) let mut col_char_offset = 0; let mut cols_remaining = col; - for grapheme in RopeGraphemes::new(text.slice(line_start..line_end)) { + for grapheme in text.slice(line_start..line_end).graphemes() { let grapheme_width = if grapheme == "\t" { tab_width - ((col - cols_remaining) % tab_width) } else { diff --git a/helix-core/src/selection.rs b/helix-core/src/selection.rs index a134a06e9..1db2d619e 100644 --- a/helix-core/src/selection.rs +++ b/helix-core/src/selection.rs @@ -9,7 +9,7 @@ use crate::{ }, line_ending::get_line_ending, movement::Direction, - Assoc, ChangeSet, RopeGraphemes, RopeSlice, + Assoc, ChangeSet, RopeSlice, }; use helix_stdx::range::is_subset; use helix_stdx::rope::{self, RopeSliceExt}; @@ 
-379,7 +379,7 @@ impl Range { /// Returns true if this Range covers a single grapheme in the given text pub fn is_single_grapheme(&self, doc: RopeSlice) -> bool { - let mut graphemes = RopeGraphemes::new(doc.slice(self.from()..self.to())); + let mut graphemes = doc.slice(self.from()..self.to()).graphemes(); let first = graphemes.next(); let second = graphemes.next(); first.is_some() && second.is_none() diff --git a/helix-stdx/src/rope.rs b/helix-stdx/src/rope.rs index eac1450bf..9fc348f58 100644 --- a/helix-stdx/src/rope.rs +++ b/helix-stdx/src/rope.rs @@ -1,8 +1,10 @@ +use std::fmt; use std::ops::{Bound, RangeBounds}; pub use regex_cursor::engines::meta::{Builder as RegexBuilder, Regex}; pub use regex_cursor::regex_automata::util::syntax::Config; use regex_cursor::{Input as RegexInput, RopeyCursor}; +use ropey::iter::Chunks; use ropey::RopeSlice; use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; @@ -122,6 +124,33 @@ pub trait RopeSliceExt<'a>: Sized { /// ``` #[allow(clippy::wrong_self_convention)] fn is_grapheme_boundary(self, byte_idx: usize) -> bool; + /// Returns an iterator over the grapheme clusters in the slice. + /// + /// # Example + /// + /// ``` + /// # use ropey::RopeSlice; + /// # use helix_stdx::rope::RopeSliceExt; + /// let text = RopeSlice::from("😶‍🌫️🏴‍☠️🖼️"); + /// let graphemes: Vec<_> = text.graphemes().collect(); + /// assert_eq!(graphemes.as_slice(), &["😶‍🌫️", "🏴‍☠️", "🖼️"]); + /// ``` + fn graphemes(self) -> RopeGraphemes<'a>; + /// Returns an iterator over the grapheme clusters in the slice, reversed. + /// + /// The returned iterator starts at the end of the slice and ends at the beginning of the + /// slice. 
+ /// + /// # Example + /// + /// ``` + /// # use ropey::RopeSlice; + /// # use helix_stdx::rope::RopeSliceExt; + /// let text = RopeSlice::from("😶‍🌫️🏴‍☠️🖼️"); + /// let graphemes: Vec<_> = text.graphemes_rev().collect(); + /// assert_eq!(graphemes.as_slice(), &["🖼️", "🏴‍☠️", "😶‍🌫️"]); + /// ``` + fn graphemes_rev(self) -> RevRopeGraphemes<'a>; } impl<'a> RopeSliceExt<'a> for RopeSlice<'a> { @@ -305,6 +334,32 @@ impl<'a> RopeSliceExt<'a> for RopeSlice<'a> { } } } + + fn graphemes(self) -> RopeGraphemes<'a> { + let mut chunks = self.chunks(); + let first_chunk = chunks.next().unwrap_or(""); + RopeGraphemes { + text: self, + chunks, + cur_chunk: first_chunk, + cur_chunk_start: 0, + cursor: GraphemeCursor::new(0, self.len_bytes(), true), + } + } + + fn graphemes_rev(self) -> RevRopeGraphemes<'a> { + let (mut chunks, mut cur_chunk_start, _, _) = self.chunks_at_byte(self.len_bytes()); + chunks.reverse(); + let first_chunk = chunks.next().unwrap_or(""); + cur_chunk_start -= first_chunk.len(); + RevRopeGraphemes { + text: self, + chunks, + cur_chunk: first_chunk, + cur_chunk_start, + cursor: GraphemeCursor::new(self.len_bytes(), self.len_bytes(), true), + } + } } // copied from std @@ -314,6 +369,130 @@ const fn is_utf8_char_boundary(b: u8) -> bool { (b as i8) >= -0x40 } +/// An iterator over the graphemes of a `RopeSlice`. 
+#[derive(Clone)] +pub struct RopeGraphemes<'a> { + text: RopeSlice<'a>, + chunks: Chunks<'a>, + cur_chunk: &'a str, + cur_chunk_start: usize, + cursor: GraphemeCursor, +} + +impl fmt::Debug for RopeGraphemes<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("RopeGraphemes") + .field("text", &self.text) + .field("chunks", &self.chunks) + .field("cur_chunk", &self.cur_chunk) + .field("cur_chunk_start", &self.cur_chunk_start) + // .field("cursor", &self.cursor) + .finish() + } +} + +impl<'a> Iterator for RopeGraphemes<'a> { + type Item = RopeSlice<'a>; + + fn next(&mut self) -> Option<Self::Item> { + let a = self.cursor.cur_cursor(); + let b; + loop { + match self + .cursor + .next_boundary(self.cur_chunk, self.cur_chunk_start) + { + Ok(None) => { + return None; + } + Ok(Some(n)) => { + b = n; + break; + } + Err(GraphemeIncomplete::NextChunk) => { + self.cur_chunk_start += self.cur_chunk.len(); + self.cur_chunk = self.chunks.next().unwrap_or(""); + } + Err(GraphemeIncomplete::PreContext(idx)) => { + let (chunk, byte_idx, _, _) = self.text.chunk_at_byte(idx.saturating_sub(1)); + self.cursor.provide_context(chunk, byte_idx); + } + _ => unreachable!(), + } + } + + if a < self.cur_chunk_start { + Some(self.text.byte_slice(a..b)) + } else { + let a2 = a - self.cur_chunk_start; + let b2 = b - self.cur_chunk_start; + Some((&self.cur_chunk[a2..b2]).into()) + } + } +} + +/// An iterator over the graphemes of a `RopeSlice` in reverse. 
+#[derive(Clone)] +pub struct RevRopeGraphemes<'a> { + text: RopeSlice<'a>, + chunks: Chunks<'a>, + cur_chunk: &'a str, + cur_chunk_start: usize, + cursor: GraphemeCursor, +} + +impl fmt::Debug for RevRopeGraphemes<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("RevRopeGraphemes") + .field("text", &self.text) + .field("chunks", &self.chunks) + .field("cur_chunk", &self.cur_chunk) + .field("cur_chunk_start", &self.cur_chunk_start) + // .field("cursor", &self.cursor) + .finish() + } +} + +impl<'a> Iterator for RevRopeGraphemes<'a> { + type Item = RopeSlice<'a>; + + fn next(&mut self) -> Option<Self::Item> { + let a = self.cursor.cur_cursor(); + let b; + loop { + match self + .cursor + .prev_boundary(self.cur_chunk, self.cur_chunk_start) + { + Ok(None) => { + return None; + } + Ok(Some(n)) => { + b = n; + break; + } + Err(GraphemeIncomplete::PrevChunk) => { + self.cur_chunk = self.chunks.next().unwrap_or(""); + self.cur_chunk_start -= self.cur_chunk.len(); + } + Err(GraphemeIncomplete::PreContext(idx)) => { + let (chunk, byte_idx, _, _) = self.text.chunk_at_byte(idx.saturating_sub(1)); + self.cursor.provide_context(chunk, byte_idx); + } + _ => unreachable!(), + } + } + + if a >= self.cur_chunk_start + self.cur_chunk.len() { + Some(self.text.byte_slice(b..a)) + } else { + let a2 = a - self.cur_chunk_start; + let b2 = b - self.cur_chunk_start; + Some((&self.cur_chunk[b2..a2]).into()) + } + } +} + #[cfg(test)] mod tests { use ropey::RopeSlice; diff --git a/helix-term/src/commands.rs b/helix-term/src/commands.rs index 851f90353..2a511cbeb 100644 --- a/helix-term/src/commands.rs +++ b/helix-term/src/commands.rs @@ -20,7 +20,7 @@ use helix_core::{ comment, doc_formatter::TextFormat, encoding, find_workspace, - graphemes::{self, next_grapheme_boundary, RevRopeGraphemes}, + graphemes::{self, next_grapheme_boundary}, history::UndoKind, increment, indent::{self, IndentStyle}, @@ -35,8 +35,8 @@ use helix_core::{ text_annotations::{Overlay, 
TextAnnotations}, textobject, unicode::width::UnicodeWidthChar, - visual_offset_from_block, Deletion, LineEnding, Position, Range, Rope, RopeGraphemes, - RopeReader, RopeSlice, Selection, SmallVec, Syntax, Tendril, Transaction, + visual_offset_from_block, Deletion, LineEnding, Position, Range, Rope, RopeReader, RopeSlice, + Selection, SmallVec, Syntax, Tendril, Transaction, }; use helix_view::{ document::{FormatterError, Mode, SCRATCH_BUFFER_NAME}, @@ -1681,10 +1681,12 @@ fn replace(cx: &mut Context) { if let Some(ch) = ch { let transaction = Transaction::change_by_selection(doc.text(), selection, |range| { if !range.is_empty() { - let text: Tendril = - RopeGraphemes::new(doc.text().slice(range.from()..range.to())) - .map(|_g| ch) - .collect(); + let text: Tendril = doc + .text() + .slice(range.from()..range.to()) + .graphemes() + .map(|_g| ch) + .collect(); (range.from(), range.to(), Some(text)) } else { // No change. @@ -6574,7 +6576,9 @@ fn jump_to_word(cx: &mut Context, behaviour: Movement) { // madeup of word characters. The latter condition is needed because // move_next_word_end simply treats a sequence of characters from // the same char class as a word so `=<` would also count as a word. - let add_label = RevRopeGraphemes::new(text.slice(..cursor_fwd.head)) + let add_label = text + .slice(..cursor_fwd.head) + .graphemes_rev() .take(2) .take_while(|g| g.chars().all(char_is_word)) .count() @@ -6600,7 +6604,9 @@ fn jump_to_word(cx: &mut Context, behaviour: Movement) { // madeup of word characters. The latter condition is needed because // move_prev_word_start simply treats a sequence of characters from // the same char class as a word so `=<` would also count as a word. - let add_label = RopeGraphemes::new(text.slice(cursor_rev.head..)) + let add_label = text + .slice(cursor_rev.head..) + .graphemes() .take(2) .take_while(|g| g.chars().all(char_is_word)) .count()