feat: gemtext parser, pre tag

This commit is contained in:
Artemy Egorov 2024-08-02 19:49:10 +03:00
parent 856534c22f
commit 6265701499
9 changed files with 258 additions and 1 deletions

138
Cargo.lock generated
View file

@ -8,6 +8,33 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "ahash"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
dependencies = [
"cfg-if",
"once_cell",
"version_check",
"zerocopy",
]
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "allocator-api2"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f"
[[package]]
name = "anstream"
version = "0.6.15"
@ -94,6 +121,19 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chumsky"
version = "1.0.0-alpha.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7b80276986f86789dc56ca6542d53bba9cda3c66091ebbe7bd96fc1bdf20f1f"
dependencies = [
"hashbrown",
"regex-automata",
"serde",
"stacker",
"unicode-ident",
]
[[package]]
name = "clap"
version = "4.5.13"
@ -154,6 +194,7 @@ name = "dalet"
version = "1.0.0-pre4"
dependencies = [
"bincode",
"chumsky",
"clap",
"flate2",
"num_enum",
@ -184,6 +225,10 @@ name = "hashbrown"
version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
dependencies = [
"ahash",
"allocator-api2",
]
[[package]]
name = "heck"
@ -267,6 +312,12 @@ dependencies = [
"syn",
]
[[package]]
name = "once_cell"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "paste"
version = "1.0.15"
@ -297,6 +348,15 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "psm"
version = "0.1.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5787f7cda34e3033a72192c018bc5883100330f362ef279a8cbccfce8bb4e874"
dependencies = [
"cc",
]
[[package]]
name = "quote"
version = "1.0.36"
@ -306,6 +366,23 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "regex-automata"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
[[package]]
name = "rmp"
version = "0.8.14"
@ -359,6 +436,19 @@ dependencies = [
"syn",
]
[[package]]
name = "stacker"
version = "0.1.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c886bd4480155fd3ef527d45e9ac8dd7118a898a46530b7b94c3e21866259fce"
dependencies = [
"cc",
"cfg-if",
"libc",
"psm",
"winapi",
]
[[package]]
name = "strsim"
version = "0.11.1"
@ -405,6 +495,34 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "version_check"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.52.0"
@ -487,6 +605,26 @@ dependencies = [
"memchr",
]
[[package]]
name = "zerocopy"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "zstd"
version = "0.13.2"

View file

@ -17,6 +17,7 @@ num_enum = "0.7.3"
serde = { version = "1.0", features = ["derive"] }
serde_repr = "0.1"
zstd = "0.13.2"
chumsky = { version = "=1.0.0-alpha.7", features = ["label"], optional = true }
[dev-dependencies]
rmp-serde = { version = "1.3.0" }
@ -25,6 +26,7 @@ flate2 = "1.0"
[features]
default = ["types", "daletpack"]
default = ["types", "daletpack", "parsers"]
parsers = ["dep:chumsky", "types"]
types = []
daletpack = ["types"]

View file

@ -36,6 +36,7 @@ pub enum Tag {
Bl(NotNullBody, AlignArgument),
Carousel(Vec<Tag>),
Code(String, TextOrNullArgument),
Pre(String),
}
pub trait ToDaletl {
@ -78,6 +79,7 @@ impl ToDaletlTag for Tag {
Tag::Bl(b, a) => t_new(Tid::Bl, b.to_daletl_body(), a.to_daletl_argument()),
Tag::Carousel(b) => t_new(Tid::Carousel, b.to_daletl_body(), NA),
Tag::Code(s, a) => t_new(Tid::Code, s.to_daletl_body(), a.to_daletl_argument()),
Tag::Pre(s) => t_new(Tid::Pre, s.to_daletl_body(), NA),
}
}
}

View file

@ -92,4 +92,5 @@ pub enum Tid {
Bl,
Carousel,
Code,
Pre,
}

View file

@ -6,3 +6,6 @@ pub mod abstractions;
#[cfg(feature = "daletpack")]
pub mod daletpack;
#[cfg(feature = "parsers")]
pub mod parsers;

68
src/parsers/gemtext.rs Normal file
View file

@ -0,0 +1,68 @@
use crate::abstractions::{Body, HeadingLevel, NotNullBody, Tag};
/// Errors that can be reported while parsing a Gemtext document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GemTextParseError {
    /// Reported for a `=>` link line whose URL part is missing.
    InvalidLink,
}

impl std::fmt::Display for GemTextParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidLink => f.write_str("invalid gemtext link line"),
        }
    }
}

// Implementing `Error` lets callers propagate this type with `?` into
// `Box<dyn std::error::Error>` and similar error containers.
impl std::error::Error for GemTextParseError {}
/// Parses a Gemtext document into a flat list of [`Tag`]s.
///
/// Line types recognised (leading/trailing whitespace of a line is ignored
/// outside preformatted blocks):
///
/// - `=> URL [label]` becomes a [`Tag::Link`]; the label is optional.
/// - `# `, `## ` and `### ` become [`Tag::H`] of level one, two and three.
/// - A run of consecutive `* item` lines becomes one [`Tag::Ul`] of [`Tag::El`].
/// - `> text` becomes a [`Tag::Bq`].
/// - Lines between a pair of fence lines (lines starting with three
///   backticks) are collected verbatim into one [`Tag::Pre`].
/// - Any other non-empty line becomes a [`Tag::P`].
///
/// A list or preformatted block that is still open when the input ends is
/// emitted as if it had been closed.
///
/// # Errors
///
/// Returns [`GemTextParseError::InvalidLink`] when a `=>` line carries no URL.
pub fn parse_gemtext(s: String) -> Result<Vec<Tag>, GemTextParseError> {
    let mut page: Vec<Tag> = Vec::new();

    let mut in_preformatted = false;
    let mut preformatted_text = String::new();

    // Items of the bullet list currently being collected.
    let mut list_items: Vec<Tag> = Vec::new();

    for raw_line in s.lines() {
        let line = raw_line.trim();

        // Preformatted mode takes priority over every other rule: nothing
        // inside the block may be interpreted as markup.
        if in_preformatted {
            if line.starts_with("```") {
                page.push(Tag::Pre(std::mem::take(&mut preformatted_text)));
                in_preformatted = false;
            } else {
                // Keep the raw line so indentation inside the block survives.
                preformatted_text.push_str(raw_line);
                preformatted_text.push('\n');
            }
            continue;
        }

        if let Some(item) = line.strip_prefix("* ") {
            list_items.push(Tag::El(NotNullBody::Text(item.to_owned())));
            continue;
        }

        // Any non-item line terminates the current list. The line itself must
        // still be processed below rather than being dropped.
        flush_list(&mut page, &mut list_items);

        if let Some(rest) = line.strip_prefix("=>") {
            let rest = rest.trim();
            // URL and label may be separated by any whitespace (spaces or tabs).
            let (url, label) = match rest.split_once(char::is_whitespace) {
                Some((url, label)) => (url, Some(label.trim())),
                None => (rest, None),
            };
            if url.is_empty() {
                return Err(GemTextParseError::InvalidLink);
            }
            let body = match label {
                Some(label) if !label.is_empty() => Body::Text(label.to_owned()),
                _ => Body::Null,
            };
            page.push(Tag::Link(body, url.to_owned()));
        } else if let Some(body) = line.strip_prefix("# ") {
            page.push(Tag::H(body.trim().to_owned(), HeadingLevel::One));
        } else if let Some(body) = line.strip_prefix("## ") {
            page.push(Tag::H(body.trim().to_owned(), HeadingLevel::Two));
        } else if let Some(body) = line.strip_prefix("### ") {
            page.push(Tag::H(body.trim().to_owned(), HeadingLevel::Three));
        } else if let Some(body) = line.strip_prefix("> ") {
            page.push(Tag::Bq(NotNullBody::Text(body.to_owned())));
        } else if line.starts_with("```") {
            in_preformatted = true;
        } else if !line.is_empty() {
            page.push(Tag::P(NotNullBody::Text(line.to_owned())));
        }
    }

    // Emit whatever is still pending at end of input.
    flush_list(&mut page, &mut list_items);
    if in_preformatted {
        page.push(Tag::Pre(preformatted_text));
    }

    Ok(page)
}

/// Moves the collected list items (if any) into `page` as a single `Tag::Ul`.
fn flush_list(page: &mut Vec<Tag>, list_items: &mut Vec<Tag>) {
    if !list_items.is_empty() {
        page.push(Tag::Ul(std::mem::take(list_items)));
    }
}

1
src/parsers/mod.rs Normal file
View file

@ -0,0 +1 @@
pub mod gemtext;

32
tests/gemtext.gmi Normal file
View file

@ -0,0 +1,32 @@
=> https://example.com A cool website
=> gopher://example.com An even cooler gopherhole
=> gemini://example.com A supremely cool Gemini capsule
=> sftp://example.com
Hi
=>https://example.com A cool website
=>gopher://example.com An even cooler gopherhole
=> gemini://example.com A supremely cool Gemini capsule
=> sftp://example.com
# Heading
## Sub-heading
### Sub-sub-heading
* Mercury
* Gemini
* Apollo
> Gemtext supports blockquotes. The quoted content is written as a single long line, which begins with a single > character
```
preformatted
=> ()
# false heading
text
```
This is paragraph

10
tests/gemtext.rs Normal file
View file

@ -0,0 +1,10 @@
use dalet::parsers::gemtext::parse_gemtext;
/// Parses the bundled Gemtext fixture and checks that markup-looking lines
/// inside the preformatted block are kept verbatim instead of being parsed.
#[test]
fn gem_text() {
    let text = include_str!("./gemtext.gmi");
    let parsed = parse_gemtext(text.to_owned()).expect("fixture should parse");

    // Printed for manual inspection with `cargo test -- --nocapture`.
    println!("{:#?}", parsed);

    assert!(!parsed.is_empty(), "fixture should produce at least one tag");

    let preformatted = parsed
        .iter()
        .find_map(|tag| match tag {
            dalet::abstractions::Tag::Pre(text) => Some(text),
            _ => None,
        })
        .expect("fixture contains a preformatted block");
    assert!(
        preformatted.contains("# false heading"),
        "heading-like line inside the preformatted block must stay literal"
    );
}