mirror of
https://github.com/helix-editor/helix.git
synced 2025-04-04 19:37:54 +03:00
Merge 34531b8278
into 7ebf650029
This commit is contained in:
commit
57c5977086
2 changed files with 46 additions and 11 deletions
|
@ -2,11 +2,23 @@
|
||||||
|
|
||||||
use crate::LineEnding;
|
use crate::LineEnding;
|
||||||
|
|
||||||
|
#[derive(Debug, Eq, PartialEq)]
|
||||||
|
pub enum WordCategory {
|
||||||
|
Alphanumeric,
|
||||||
|
Superscript,
|
||||||
|
Subscript,
|
||||||
|
Braille,
|
||||||
|
Hiragana,
|
||||||
|
Katakana,
|
||||||
|
HangulSyllable,
|
||||||
|
CJKIdeograph,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Eq, PartialEq)]
|
#[derive(Debug, Eq, PartialEq)]
|
||||||
pub enum CharCategory {
|
pub enum CharCategory {
|
||||||
Whitespace,
|
Whitespace,
|
||||||
Eol,
|
Eol,
|
||||||
Word,
|
Word(WordCategory),
|
||||||
Punctuation,
|
Punctuation,
|
||||||
Unknown,
|
Unknown,
|
||||||
}
|
}
|
||||||
|
@ -17,8 +29,8 @@ pub fn categorize_char(ch: char) -> CharCategory {
|
||||||
CharCategory::Eol
|
CharCategory::Eol
|
||||||
} else if ch.is_whitespace() {
|
} else if ch.is_whitespace() {
|
||||||
CharCategory::Whitespace
|
CharCategory::Whitespace
|
||||||
} else if char_is_word(ch) {
|
} else if let Some(cat) = char_word_category(ch) {
|
||||||
CharCategory::Word
|
CharCategory::Word(cat)
|
||||||
} else if char_is_punctuation(ch) {
|
} else if char_is_punctuation(ch) {
|
||||||
CharCategory::Punctuation
|
CharCategory::Punctuation
|
||||||
} else {
|
} else {
|
||||||
|
@ -55,7 +67,7 @@ pub fn char_is_whitespace(ch: char) -> bool {
|
||||||
// En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
|
// En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
|
||||||
// Four-per-em Space, Six-per-em Space, Figure Space,
|
// Four-per-em Space, Six-per-em Space, Figure Space,
|
||||||
// Punctuation Space, Thin Space, Hair Space, Zero Width Space.
|
// Punctuation Space, Thin Space, Hair Space, Zero Width Space.
|
||||||
ch if ('\u{2000}' ..= '\u{200B}').contains(&ch) => true,
|
'\u{2000}' ..= '\u{200B}' => true,
|
||||||
|
|
||||||
_ => false,
|
_ => false,
|
||||||
}
|
}
|
||||||
|
@ -82,7 +94,31 @@ pub fn char_is_punctuation(ch: char) -> bool {
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn char_is_word(ch: char) -> bool {
|
pub fn char_is_word(ch: char) -> bool {
|
||||||
ch.is_alphanumeric() || ch == '_'
|
char_word_category(ch).is_some()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn char_word_category(ch: char) -> Option<WordCategory> {
|
||||||
|
use WordCategory::*;
|
||||||
|
|
||||||
|
// Different subcategories so e.g. おはよう世界 is not treated as one block
|
||||||
|
let level = match ch {
|
||||||
|
'\u{2070}'..='\u{207f}' => Superscript,
|
||||||
|
'\u{2080}'..='\u{2094}' => Subscript,
|
||||||
|
'\u{2800}'..='\u{28ff}' => Braille,
|
||||||
|
'\u{3040}'..='\u{309f}' => Hiragana,
|
||||||
|
'\u{30a0}'..='\u{30ff}' => Katakana,
|
||||||
|
'\u{ac00}'..='\u{d7a3}' => HangulSyllable,
|
||||||
|
|
||||||
|
'\u{3300}'..='\u{9fff}'
|
||||||
|
| '\u{f900}'..='\u{faff}'
|
||||||
|
| '\u{20000}'..='\u{2a6df}'
|
||||||
|
| '\u{2a700}'..='\u{2b73f}'
|
||||||
|
| '\u{2b740}'..='\u{2b81f}'
|
||||||
|
| '\u{2f800}'..='\u{2fa1f}' => CJKIdeograph,
|
||||||
|
ch if ch.is_alphanumeric() || ch == '_' => Alphanumeric,
|
||||||
|
_ => return None,
|
||||||
|
};
|
||||||
|
Some(level)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
@ -115,9 +151,8 @@ mod test {
|
||||||
}
|
}
|
||||||
|
|
||||||
for ch in WORD_TEST_CASE.chars() {
|
for ch in WORD_TEST_CASE.chars() {
|
||||||
assert_eq!(
|
assert!(
|
||||||
CharCategory::Word,
|
matches!(categorize_char(ch), CharCategory::Word(_)),
|
||||||
categorize_char(ch),
|
|
||||||
"Testing '{}', but got `{:?}` instead of `Category::Word`",
|
"Testing '{}', but got `{:?}` instead of `Category::Word`",
|
||||||
ch,
|
ch,
|
||||||
categorize_char(ch)
|
categorize_char(ch)
|
||||||
|
|
|
@ -495,8 +495,8 @@ fn is_word_boundary(a: char, b: char) -> bool {
|
||||||
|
|
||||||
fn is_long_word_boundary(a: char, b: char) -> bool {
|
fn is_long_word_boundary(a: char, b: char) -> bool {
|
||||||
match (categorize_char(a), categorize_char(b)) {
|
match (categorize_char(a), categorize_char(b)) {
|
||||||
(CharCategory::Word, CharCategory::Punctuation)
|
(CharCategory::Word(_), CharCategory::Punctuation)
|
||||||
| (CharCategory::Punctuation, CharCategory::Word) => false,
|
| (CharCategory::Punctuation, CharCategory::Word(_)) => false,
|
||||||
(a, b) if a != b => true,
|
(a, b) if a != b => true,
|
||||||
_ => false,
|
_ => false,
|
||||||
}
|
}
|
||||||
|
@ -504,7 +504,7 @@ fn is_long_word_boundary(a: char, b: char) -> bool {
|
||||||
|
|
||||||
fn is_sub_word_boundary(a: char, b: char, dir: Direction) -> bool {
|
fn is_sub_word_boundary(a: char, b: char, dir: Direction) -> bool {
|
||||||
match (categorize_char(a), categorize_char(b)) {
|
match (categorize_char(a), categorize_char(b)) {
|
||||||
(CharCategory::Word, CharCategory::Word) => {
|
(CharCategory::Word(_), CharCategory::Word(_)) => {
|
||||||
if (a == '_') != (b == '_') {
|
if (a == '_') != (b == '_') {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue