This commit is contained in:
Kirawi 2025-04-01 21:39:19 +09:00 committed by GitHub
commit 57c5977086
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 46 additions and 11 deletions

View file

@ -2,11 +2,23 @@
use crate::LineEnding; use crate::LineEnding;
/// Script-level sub-category of a word character.
///
/// Carried by `CharCategory::Word` so that adjacent word characters from
/// different scripts can be told apart when looking for word boundaries.
///
/// The enum is fieldless, so it is `Copy`: categories can be stored, passed
/// and compared by value without moving them.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum WordCategory {
    /// Alphanumeric characters and `_` that belong to none of the more
    /// specific categories below.
    Alphanumeric,
    /// Superscript characters.
    Superscript,
    /// Subscript characters.
    Subscript,
    /// Braille patterns.
    Braille,
    /// Japanese hiragana.
    Hiragana,
    /// Japanese katakana.
    Katakana,
    /// Precomposed Korean hangul syllables.
    HangulSyllable,
    /// CJK ideographs.
    CJKIdeograph,
}
#[derive(Debug, Eq, PartialEq)] #[derive(Debug, Eq, PartialEq)]
pub enum CharCategory { pub enum CharCategory {
Whitespace, Whitespace,
Eol, Eol,
Word, Word(WordCategory),
Punctuation, Punctuation,
Unknown, Unknown,
} }
@ -17,8 +29,8 @@ pub fn categorize_char(ch: char) -> CharCategory {
CharCategory::Eol CharCategory::Eol
} else if ch.is_whitespace() { } else if ch.is_whitespace() {
CharCategory::Whitespace CharCategory::Whitespace
} else if char_is_word(ch) { } else if let Some(cat) = char_word_category(ch) {
CharCategory::Word CharCategory::Word(cat)
} else if char_is_punctuation(ch) { } else if char_is_punctuation(ch) {
CharCategory::Punctuation CharCategory::Punctuation
} else { } else {
@ -55,7 +67,7 @@ pub fn char_is_whitespace(ch: char) -> bool {
// En Quad, Em Quad, En Space, Em Space, Three-per-em Space, // En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
// Four-per-em Space, Six-per-em Space, Figure Space, // Four-per-em Space, Six-per-em Space, Figure Space,
// Punctuation Space, Thin Space, Hair Space, Zero Width Space. // Punctuation Space, Thin Space, Hair Space, Zero Width Space.
ch if ('\u{2000}' ..= '\u{200B}').contains(&ch) => true, '\u{2000}' ..= '\u{200B}' => true,
_ => false, _ => false,
} }
@ -82,7 +94,31 @@ pub fn char_is_punctuation(ch: char) -> bool {
#[inline] #[inline]
pub fn char_is_word(ch: char) -> bool { pub fn char_is_word(ch: char) -> bool {
ch.is_alphanumeric() || ch == '_' char_word_category(ch).is_some()
}
pub fn char_word_category(ch: char) -> Option<WordCategory> {
use WordCategory::*;
// Different subcategories so e.g. おはよう世界 is not treated as one block
let level = match ch {
'\u{2070}'..='\u{207f}' => Superscript,
'\u{2080}'..='\u{2094}' => Subscript,
'\u{2800}'..='\u{28ff}' => Braille,
'\u{3040}'..='\u{309f}' => Hiragana,
'\u{30a0}'..='\u{30ff}' => Katakana,
'\u{ac00}'..='\u{d7a3}' => HangulSyllable,
'\u{3300}'..='\u{9fff}'
| '\u{f900}'..='\u{faff}'
| '\u{20000}'..='\u{2a6df}'
| '\u{2a700}'..='\u{2b73f}'
| '\u{2b740}'..='\u{2b81f}'
| '\u{2f800}'..='\u{2fa1f}' => CJKIdeograph,
ch if ch.is_alphanumeric() || ch == '_' => Alphanumeric,
_ => return None,
};
Some(level)
} }
#[cfg(test)] #[cfg(test)]
@ -115,9 +151,8 @@ mod test {
} }
for ch in WORD_TEST_CASE.chars() { for ch in WORD_TEST_CASE.chars() {
assert_eq!( assert!(
CharCategory::Word, matches!(categorize_char(ch), CharCategory::Word(_)),
categorize_char(ch),
"Testing '{}', but got `{:?}` instead of `Category::Word`", "Testing '{}', but got `{:?}` instead of `Category::Word`",
ch, ch,
categorize_char(ch) categorize_char(ch)

View file

@ -495,8 +495,8 @@ fn is_word_boundary(a: char, b: char) -> bool {
fn is_long_word_boundary(a: char, b: char) -> bool { fn is_long_word_boundary(a: char, b: char) -> bool {
match (categorize_char(a), categorize_char(b)) { match (categorize_char(a), categorize_char(b)) {
(CharCategory::Word, CharCategory::Punctuation) (CharCategory::Word(_), CharCategory::Punctuation)
| (CharCategory::Punctuation, CharCategory::Word) => false, | (CharCategory::Punctuation, CharCategory::Word(_)) => false,
(a, b) if a != b => true, (a, b) if a != b => true,
_ => false, _ => false,
} }
@ -504,7 +504,7 @@ fn is_long_word_boundary(a: char, b: char) -> bool {
fn is_sub_word_boundary(a: char, b: char, dir: Direction) -> bool { fn is_sub_word_boundary(a: char, b: char, dir: Direction) -> bool {
match (categorize_char(a), categorize_char(b)) { match (categorize_char(a), categorize_char(b)) {
(CharCategory::Word, CharCategory::Word) => { (CharCategory::Word(_), CharCategory::Word(_)) => {
if (a == '_') != (b == '_') { if (a == '_') != (b == '_') {
return true; return true;
} }