From ccc9773030eaf146dca07bd77d6b7d9dc3bdf1af Mon Sep 17 00:00:00 2001 From: Artemy Egorov Date: Sun, 11 Aug 2024 18:21:24 +0300 Subject: [PATCH] refactor(lexer): make functions for specific tokens --- examples/daleth_lexer.rs | 4 +- src/daleth/lexer/mod.rs | 230 +++++++++++++++++++++----------------- src/daleth/lexer/types.rs | 4 +- src/main.rs | 4 +- 4 files changed, 136 insertions(+), 106 deletions(-) diff --git a/examples/daleth_lexer.rs b/examples/daleth_lexer.rs index dfffa73..28a54ae 100644 --- a/examples/daleth_lexer.rs +++ b/examples/daleth_lexer.rs @@ -1,12 +1,12 @@ use ariadne::{Color, Label, Report, ReportKind, Source}; use chumsky::Parser; -use dalet::daleth::{format::format, lexer::lexer}; +use dalet::daleth::{format::format, lexer::full_lexer}; fn main() { let src_file = "daleth.dlth"; let src = include_str!("./daleth.dlth"); - let parsed = lexer().parse(src); + let parsed = full_lexer().parse(src); match parsed.into_result() { Ok(t) => { diff --git a/src/daleth/lexer/mod.rs b/src/daleth/lexer/mod.rs index 9df1d08..3d74435 100644 --- a/src/daleth/lexer/mod.rs +++ b/src/daleth/lexer/mod.rs @@ -4,7 +4,36 @@ pub mod types; pub fn lexer<'src>( ) -> impl Parser<'src, &'src str, Vec>>, extra::Err>> { - let tag = choice(( + let token = choice((symbol(), tag(), argument(), textual())); + + token + .padded() + .padded_by(comment()) + .map_with(|t, e| (t, e.span())) + .repeated() + .collect() +} + +pub fn full_lexer<'src>( +) -> impl Parser<'src, &'src str, Vec>>, extra::Err>> { + let token = choice(( + empty_line(), + comment(), + symbol(), + tag(), + argument(), + textual(), + )); + + token + .padded_by(text::whitespace().and_is(empty_line().not()).or_not()) + .map_with(|t, e| (t, e.span())) + .repeated() + .collect() +} + +fn tag<'src>() -> impl Parser<'src, &'src str, Token<'src>, extra::Err>> { + choice(( just("el").to(Token::El), just("h").to(Token::H), just("p").to(Token::P), @@ -39,108 +68,109 @@ pub fn lexer<'src>( just("pre").to(Token::Pre), just("meta").to(Token::Meta), ))) - .labelled("Tag"); + .labelled("Tag") +} - let symbol = choice(( +fn symbol<'src>() -> impl Parser<'src, &'src str, Token<'src>, extra::Err>> { + choice(( just("[[").to(Token::ElOpen).labelled("[["), just("]]").to(Token::ElClose).labelled("]]"), just("[").to(Token::LSquare).labelled("["), just("]").to(Token::RSquare).labelled("]"), - )); - - let argument = { - let arg_escape = just('\\') - .ignore_then(just('"')) - .labelled("Escape sequence for argument"); - - let number = text::int(10) - .from_str() - .unwrapped() - .map(Token::NumberArgument) - .labelled("Number argument"); - - let text_argument = none_of("\"\n\\") - .or(arg_escape) - .repeated() - .to_slice() - .delimited_by(just('"'), just('"')) - .map(Token::TextArgument) - .labelled("Text argument"); - - choice((number, text_argument)) - }; - - let textual = { - let escape = just('\\') - .ignore_then(just('}')) - .labelled("Multi-line escape sequence"); - - let text = none_of("\n") - .repeated() - .to_slice() - .padded_by(text::inline_whitespace()); - - let text_body = just(':') - .ignore_then(text) - .map(Token::TextBody) - .labelled("One line text body"); - - let text_tag = text - .then_ignore(just('\n')) - .map(Token::TextTag) - .labelled("Text tag"); - - let multiline_text_body = none_of("}\\") - .or(escape) - .repeated() - .to_slice() - .labelled("Body of multiline text"); - - let paragraph = multiline_text_body - .clone() - .delimited_by(just("{-"), just("}")) - .map(Token::Paragraph) - .labelled("Paragraph syntax"); - - let mltext = multiline_text_body - .clone() - .delimited_by(just('{'), just('}')) - .map(Token::MLText) - .labelled("Multiline text"); - - let mlmstext = { - let mlms_n = just("{~") - .ignore_then(text::int(10).from_str().unwrapped()) - .labelled("Minimum spaces number"); - - mlms_n - .then(multiline_text_body.clone()) - .then_ignore(just("}")) - .map(|(n, t)| Token::MLMSText(n, t)) - .labelled("Multi line text with min spaces") - }; - - let rmltext = multiline_text_body - .delimited_by(just("{#"), just('}')) - .map(Token::RMLText) - .labelled("Raw multiline text"); - - choice((paragraph, mlmstext, rmltext, mltext, text_body, text_tag)) - }; - - let comment = just('#') - .ignore_then(none_of("\n").repeated().to_slice()) - .map(Token::Comment); - - let empty_line = text::inline_whitespace() - .delimited_by(text::newline(), text::newline()) - .to(Token::EmptyLine); - - let token = choice((empty_line.clone(), comment, symbol, tag, argument, textual)); - - token - .padded_by(text::whitespace().and_is(empty_line.not()).or_not()) - .map_with(|t, e| (t, e.span())) - .repeated() - .collect() + )) +} + +fn argument<'src>() -> impl Parser<'src, &'src str, Token<'src>, extra::Err>> +{ + let arg_escape = just('\\') + .ignore_then(just('"')) + .labelled("Escape sequence for argument"); + + let number = text::int(10) + .from_str() + .unwrapped() + .map(Token::NumberArgument) + .labelled("Number argument"); + + let text_argument = none_of("\"\n\\") + .or(arg_escape) + .repeated() + .to_slice() + .delimited_by(just('"'), just('"')) + .map(Token::TextArgument) + .labelled("Text argument"); + + choice((number, text_argument)) +} + +fn textual<'src>() -> impl Parser<'src, &'src str, Token<'src>, extra::Err>> +{ + let escape = just('\\') + .ignore_then(just('}')) + .labelled("Multi-line escape sequence"); + + let text = none_of("\n") + .repeated() + .to_slice() + .padded_by(text::inline_whitespace()); + + let text_body = just(':') + .ignore_then(text) + .map(Token::TextBody) + .labelled("One line text body"); + + let text_tag = text + .then_ignore(just('\n')) + .map(Token::TextTag) + .labelled("Text tag"); + + let multiline_text_body = none_of("}\\") + .or(escape) + .repeated() + .to_slice() + .labelled("Body of multiline text"); + + let paragraph = multiline_text_body + .clone() + .delimited_by(just("{-"), just("}")) + .map(Token::Paragraph) + .labelled("Paragraph syntax"); + + let mltext = multiline_text_body + .clone() + .delimited_by(just('{'), just('}')) + .map(Token::MLText) + .labelled("Multiline text"); + + let mlmstext = { + let mlms_n = just("{~") + .ignore_then(text::int(10).from_str().unwrapped()) + .labelled("Minimum spaces number"); + + mlms_n + .then(multiline_text_body.clone()) + .then_ignore(just("}")) + .map(|(n, t)| Token::MLMSText(n, t)) + .labelled("Multi line text with min spaces") + }; + + let rmltext = multiline_text_body + .delimited_by(just("{#"), just('}')) + .map(Token::RMLText) + .labelled("Raw multiline text"); + + choice((paragraph, mlmstext, rmltext, mltext, text_body, text_tag)) +} + +fn comment<'src>() -> impl Parser<'src, &'src str, Token<'src>, extra::Err>> +{ + just('#') + .ignore_then(none_of("\n").repeated().to_slice()) + .map(Token::Comment) +} +fn empty_line<'src>( +) -> impl Parser<'src, &'src str, Token<'src>, extra::Err>> { + text::inline_whitespace() + .delimited_by(text::newline(), text::newline()) + .to(Token::EmptyLine) } diff --git a/src/daleth/lexer/types.rs b/src/daleth/lexer/types.rs index 53ef6a3..0aa9eb2 100644 --- a/src/daleth/lexer/types.rs +++ b/src/daleth/lexer/types.rs @@ -28,11 +28,11 @@ pub enum Token<'src> { /// Raw Multi line text RMLText(&'src str), - /// Special + // Special TextTag(&'src str), Paragraph(&'src str), - /// Special removed before parse + // Special for formatting, ignored for parse Comment(&'src str), EmptyLine, diff --git a/src/main.rs b/src/main.rs index 33268fc..aa343ec 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,7 +4,7 @@ use ariadne::{Color, Label, Report, ReportKind, Source}; use chumsky::Parser; use clap::Parser as ClapParser; use commands::{Cli, Commands::*}; -use dalet::daleth::{format::format, lexer::lexer}; +use dalet::daleth::{format::format, lexer::full_lexer}; use std::fs; fn main() { @@ -16,7 +16,7 @@ fn main() { let src_file = &path.to_string_lossy().to_string(); let src = fs::read_to_string(src_file).unwrap(); - let parsed = lexer().parse(&src); + let parsed = full_lexer().parse(&src); match parsed.into_result() { Ok(t) => {