From 626570149923706f913056e3c2d84d2765e39a71 Mon Sep 17 00:00:00 2001 From: Artemy Egorov Date: Fri, 2 Aug 2024 19:49:10 +0300 Subject: [PATCH] feat: gemtext parser, pre tag --- Cargo.lock | 138 +++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 4 +- src/abstractions.rs | 2 + src/daletl.rs | 1 + src/lib.rs | 3 + src/parsers/gemtext.rs | 68 ++++++++++++++++++++ src/parsers/mod.rs | 1 + tests/gemtext.gmi | 32 ++++++++++ tests/gemtext.rs | 10 +++ 9 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 src/parsers/gemtext.rs create mode 100644 src/parsers/mod.rs create mode 100644 tests/gemtext.gmi create mode 100644 tests/gemtext.rs diff --git a/Cargo.lock b/Cargo.lock index ebbe47e..5b6f371 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,33 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" + [[package]] name = "anstream" version = "0.6.15" @@ -94,6 +121,19 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chumsky" +version = "1.0.0-alpha.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7b80276986f86789dc56ca6542d53bba9cda3c66091ebbe7bd96fc1bdf20f1f" +dependencies = [ + "hashbrown", + "regex-automata", + "serde", + "stacker", + "unicode-ident", +] + [[package]] name = "clap" version = "4.5.13" @@ -154,6 +194,7 @@ name = "dalet" version = "1.0.0-pre4" dependencies = [ "bincode", + "chumsky", "clap", "flate2", "num_enum", @@ -184,6 +225,10 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "heck" @@ -267,6 +312,12 @@ dependencies = [ "syn", ] +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + [[package]] name = "paste" version = "1.0.15" @@ -297,6 +348,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "psm" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5787f7cda34e3033a72192c018bc5883100330f362ef279a8cbccfce8bb4e874" +dependencies = [ + "cc", +] + [[package]] name = "quote" version = "1.0.36" @@ -306,6 +366,23 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "regex-automata" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + [[package]] name = "rmp" version = "0.8.14" @@ -359,6 +436,19 @@ dependencies = [ "syn", ] +[[package]] +name = "stacker" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c886bd4480155fd3ef527d45e9ac8dd7118a898a46530b7b94c3e21866259fce" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "winapi", +] + [[package]] name = "strsim" version = "0.11.1" @@ -405,6 +495,34 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-sys" version = "0.52.0" @@ -487,6 +605,26 @@ dependencies = [ "memchr", ] +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zstd" version = "0.13.2" diff --git a/Cargo.toml b/Cargo.toml index 00f7017..9ee130a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ num_enum = "0.7.3" serde = { version = "1.0", features = ["derive"] } serde_repr = "0.1" zstd = "0.13.2" +chumsky = { version = "=1.0.0-alpha.7", features = ["label"], optional = true } [dev-dependencies] rmp-serde = { version = "1.3.0" } @@ -25,6 +26,7 @@ flate2 = "1.0" [features] -default = ["types", "daletpack"] +default = ["types", "daletpack", "parsers"] +parsers = ["dep:chumsky", "types"] types = [] daletpack = ["types"] diff --git a/src/abstractions.rs b/src/abstractions.rs index deacb19..703b025 100644 --- a/src/abstractions.rs +++ b/src/abstractions.rs @@ -36,6 +36,7 @@ pub enum Tag { Bl(NotNullBody, AlignArgument), Carousel(Vec), Code(String, TextOrNullArgument), + Pre(String), } pub trait ToDaletl { @@ -78,6 +79,7 @@ impl ToDaletlTag for Tag { Tag::Bl(b, a) => t_new(Tid::Bl, b.to_daletl_body(), a.to_daletl_argument()), Tag::Carousel(b) => t_new(Tid::Carousel, b.to_daletl_body(), NA), Tag::Code(s, a) => t_new(Tid::Code, s.to_daletl_body(), a.to_daletl_argument()), + Tag::Pre(s) => t_new(Tid::Pre, s.to_daletl_body(), NA), } } } diff --git a/src/daletl.rs b/src/daletl.rs index bde9dc8..2f4bac9 100644 --- a/src/daletl.rs +++ b/src/daletl.rs @@ -92,4 +92,5 @@ pub enum Tid { Bl, Carousel, Code, + Pre, } diff --git a/src/lib.rs b/src/lib.rs index ed28b94..e82dc0f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,3 +6,6 @@ pub mod abstractions; #[cfg(feature = "daletpack")] pub mod daletpack; + +#[cfg(feature = "parsers")] +pub mod parsers; diff --git a/src/parsers/gemtext.rs b/src/parsers/gemtext.rs new file mode 100644 index 0000000..2b57888 --- /dev/null +++ b/src/parsers/gemtext.rs @@ -0,0 +1,68 @@ +use crate::abstractions::{Body, HeadingLevel, NotNullBody, Tag}; + +#[derive(Debug)] +pub enum GemTextParseError { + InvalidLink, +} + +pub fn parse_gemtext(s: String) -> Result, GemTextParseError> { + let mut page: Vec = Vec::new(); + let mut preformatted = false; + let mut preformatted_text = String::new(); + + let mut before_is_ordered_list = false; + let mut ordered_list: Vec = Vec::new(); + + for line in s.lines() { + let mut line = line.trim().to_owned(); + + if before_is_ordered_list && !line.starts_with("* ") { + page.push(Tag::Ul(ordered_list.clone())); + before_is_ordered_list = false; + ordered_list.clear(); + } else if preformatted && !line.starts_with("```") { + preformatted_text.push_str(&line); + preformatted_text.push('\n'); + } else if line.starts_with("=>") { + let body = line.split_off(2); + let mut body = body.trim().splitn(2, " "); + + let url = body.next().ok_or(GemTextParseError::InvalidLink)?.trim(); + + match body.next() { + Some(label) => page.push(Tag::Link( + Body::Text(label.trim().to_owned()), + url.to_owned(), + )), + None => page.push(Tag::Link(Body::Null, url.to_owned())), + }; + } else if line.starts_with("# ") { + let body = line.split_off(2); + page.push(Tag::H(body.trim().to_owned(), HeadingLevel::One)); + } else if line.starts_with("## ") { + let body = line.split_off(3); + page.push(Tag::H(body.trim().to_owned(), HeadingLevel::Two)); + } else if line.starts_with("### ") { + let body = line.split_off(4); + page.push(Tag::H(body.trim().to_owned(), HeadingLevel::Three)); + } else if line.starts_with("* ") { + before_is_ordered_list = true; + let body = line.split_off(2); + ordered_list.push(Tag::El(NotNullBody::Text(body))); + } else if line.starts_with("> ") { + let body = line.split_off(2); + page.push(Tag::Bq(NotNullBody::Text(body))); + } else if line.starts_with("```") { + if preformatted { + page.push(Tag::Pre(preformatted_text.clone())); + preformatted_text.clear(); + } + + preformatted = !preformatted; + } else if !line.is_empty() { + page.push(Tag::P(NotNullBody::Text(line))); + } + } + + Ok(page) +} diff --git a/src/parsers/mod.rs b/src/parsers/mod.rs new file mode 100644 index 0000000..792e7c1 --- /dev/null +++ b/src/parsers/mod.rs @@ -0,0 +1 @@ +pub mod gemtext; diff --git a/tests/gemtext.gmi b/tests/gemtext.gmi new file mode 100644 index 0000000..8428b79 --- /dev/null +++ b/tests/gemtext.gmi @@ -0,0 +1,32 @@ +=> https://example.com A cool website +=> gopher://example.com An even cooler gopherhole +=> gemini://example.com A supremely cool Gemini capsule +=> sftp://example.com + +Hi + +=>https://example.com A cool website +=>gopher://example.com An even cooler gopherhole +=> gemini://example.com A supremely cool Gemini capsule +=> sftp://example.com + +# Heading + +## Sub-heading + +### Sub-sub-heading + +* Mercury +* Gemini +* Apollo + +> Gemtext supports blockquotes. The quoted content is written as a single long line, which begins with a single > character + +``` +preformatted +=> () +# false heading +text +``` + +This is paragraph diff --git a/tests/gemtext.rs b/tests/gemtext.rs new file mode 100644 index 0000000..27edf8b --- /dev/null +++ b/tests/gemtext.rs @@ -0,0 +1,10 @@ +use dalet::parsers::gemtext::parse_gemtext; + +#[test] +fn gem_text() { + let text = include_str!("./gemtext.gmi"); + + let parsed = parse_gemtext(text.to_owned()).unwrap(); + + println!("{:#?}", parsed); +}