fix: daletpack compresses better when aligned to bytes

This commit is contained in:
Artemy Egorov 2024-08-02 13:18:32 +03:00
parent f6a554e684
commit 04757e69ef
9 changed files with 306 additions and 116 deletions

169
libs/rust/Cargo.lock generated
View file

@ -3,25 +3,68 @@
version = 3
[[package]]
name = "bitvec"
version = "1.0.1"
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "autocfg"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
[[package]]
name = "bincode"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
dependencies = [
"funty",
"radium",
"tap",
"wyz",
"serde",
]
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "cc"
version = "1.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26a5c3fd7bfa1ce3897a3a3501d362b2d87b7f2583ebcb4a949ec25911025cbc"
dependencies = [
"jobserver",
"libc",
]
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "crc32fast"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
dependencies = [
"cfg-if",
]
[[package]]
name = "dalet"
version = "1.0.0-pre4"
dependencies = [
"bitvec",
"bincode",
"flate2",
"num_enum",
"rmp-serde",
"serde",
"serde_repr",
"zstd",
]
[[package]]
@ -31,10 +74,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "funty"
version = "2.0.0"
name = "flate2"
version = "1.0.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae"
dependencies = [
"crc32fast",
"miniz_oxide",
]
[[package]]
name = "hashbrown"
@ -52,12 +99,45 @@ dependencies = [
"hashbrown",
]
[[package]]
name = "jobserver"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0"
dependencies = [
"libc",
]
[[package]]
name = "libc"
version = "0.2.155"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "miniz_oxide"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
dependencies = [
"adler",
]
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "num_enum"
version = "0.7.3"
@ -79,6 +159,18 @@ dependencies = [
"syn",
]
[[package]]
name = "paste"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
[[package]]
name = "pkg-config"
version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
[[package]]
name = "proc-macro-crate"
version = "3.1.0"
@ -107,10 +199,26 @@ dependencies = [
]
[[package]]
name = "radium"
version = "0.7.0"
name = "rmp"
version = "0.8.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
checksum = "228ed7c16fa39782c3b3468e974aec2795e9089153cd08ee2e9aefb3613334c4"
dependencies = [
"byteorder",
"num-traits",
"paste",
]
[[package]]
name = "rmp-serde"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52e599a477cf9840e92f2cde9a7189e67b42c57532749bf90aea6ec10facd4db"
dependencies = [
"byteorder",
"rmp",
"serde",
]
[[package]]
name = "serde"
@ -154,12 +262,6 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "tap"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]]
name = "toml_datetime"
version = "0.6.8"
@ -193,10 +295,29 @@ dependencies = [
]
[[package]]
name = "wyz"
version = "0.5.1"
name = "zstd"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9"
dependencies = [
"tap",
"zstd-safe",
]
[[package]]
name = "zstd-safe"
version = "7.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa556e971e7b568dc775c136fc9de8c779b1c2fc3a63defaafadffdbd3181afa"
dependencies = [
"zstd-sys",
]
[[package]]
name = "zstd-sys"
version = "2.0.12+zstd.1.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13"
dependencies = [
"cc",
"pkg-config",
]

View file

@ -12,10 +12,16 @@ keywords = ["dalet"]
categories = ["compression", "compilers", "encoding"]
[dependencies]
bitvec = "1.0.1"
num_enum = "0.7.3"
serde = { version = "1.0", features = ["derive"] }
serde_repr = "0.1"
zstd = "0.13.2"
[dev-dependencies]
rmp-serde = { version = "1.3.0" }
bincode = { version = "1.3.3" }
flate2 = "1.0"
[features]
default = ["types", "daletpack"]

View file

@ -59,7 +59,7 @@ impl IsNull for Argument {
}
}
#[derive(Serialize_repr, Deserialize_repr, Debug, Clone, PartialEq, Eq, TryFromPrimitive)]
#[derive(Serialize_repr, Deserialize_repr, Debug, Clone, PartialEq, Eq, TryFromPrimitive, Copy)]
#[repr(u8)]
/// Tag Id
pub enum Tid {

View file

@ -1,126 +1,103 @@
use bitvec::{
bits,
order::Msb0,
prelude::{BitVec, Lsb0},
view::{AsBits, BitView},
};
use crate::daletl::{Argument, Body, IsNull, Tag};
use super::utils::*;
use super::DaletPackError;
use super::{DaletPackError, TypeId};
pub fn encode(root: Vec<Tag>) -> Result<Vec<u8>, DaletPackError> {
pub fn encode(root: &Vec<Tag>) -> Result<Vec<u8>, DaletPackError> {
if root.len() > 2usize.pow(32) {
return Err(DaletPackError::RootMaxSizeExceeded);
}
let mut bv: BitVec<u8, Msb0> = BitVec::new();
let mut bv: Vec<u8> = Vec::new();
for tag in root {
write_tag(&mut bv, tag)?;
}
bv.set_uninitialized(false);
Ok(bv.into_vec())
// Ok(zstd::bulk::compress(&bv, 200).map_err(|_| DaletPackError::ZstdCompressError)?)
Ok(bv)
}
fn write_int(bv: &mut BitVec<u8, Msb0>, n: u8) {
if n < 16 {
write_4bit(bv, 0);
write_4bit(bv, n);
} else {
write_4bit(bv, 1);
bv.extend_from_raw_slice(&[n]);
}
/// Appends a one-byte integer to the output as two bytes: a type tag followed by the value.
fn write_int(bv: &mut Vec<u8>, n: u8) {
// Type tag; the literal 1 is the same value as `TypeId::Int8`.
bv.push(1);
// The integer itself, stored verbatim in a single byte.
bv.push(n);
}
fn write_str(bv: &mut BitVec<u8, Msb0>, string: String) -> Result<(), DaletPackError> {
fn write_str(bv: &mut Vec<u8>, string: &String) -> Result<(), DaletPackError> {
let size = string.len();
if size > 2usize.pow(32) {
return Err(DaletPackError::StrMaxSizeExceeded);
}
if size <= 8 {
write_4bit(bv, 2);
write_3bit(bv, (size - 1) as u8);
} else if size <= 16 {
write_4bit(bv, 3);
write_4bit(bv, (size - 1) as u8);
} else if size <= 256 {
write_4bit(bv, 4);
bv.extend_from_raw_slice(&[(size - 1) as u8]);
if size <= 256 {
bv.push(TypeId::Str8 as u8);
bv.push((size - 1) as u8);
} else if size <= 65536 {
write_4bit(bv, 5);
bv.extend_from_bitslice(&((size - 1) as u16).view_bits::<Msb0>());
bv.push(TypeId::Str16 as u8);
bv.extend(((size - 1) as u16).to_be_bytes());
} else {
write_4bit(bv, 6);
bv.extend_from_bitslice(&((size - 1) as u32).view_bits::<Msb0>());
bv.push(TypeId::Str32 as u8);
bv.extend(((size - 1) as u32).to_be_bytes());
}
bv.extend_from_bitslice(&string.as_bits::<Msb0>());
bv.extend_from_slice(string.as_bytes());
Ok(())
}
fn write_array(bv: &mut BitVec<u8, Msb0>, arr: Vec<Tag>) -> Result<(), DaletPackError> {
fn write_array(bv: &mut Vec<u8>, arr: &Vec<Tag>) -> Result<(), DaletPackError> {
if arr.len() > 2usize.pow(32) {
return Err(DaletPackError::ArrMaxSizeExceeded);
}
write_4bit(bv, 7);
bv.push(TypeId::TagArray as u8);
for tag in arr {
write_tag(bv, tag)?;
}
bv.extend_from_bitslice(&bits![1, 0]);
bv.push(TypeId::TagArrayEnd as u8);
Ok(())
}
fn write_tag(bv: &mut BitVec<u8, Msb0>, tag: Tag) -> Result<(), DaletPackError> {
fn write_tag(bv: &mut Vec<u8>, tag: &Tag) -> Result<(), DaletPackError> {
if tag.body.is_null() && tag.argument.is_null() {
write_4bit(bv, 15);
write_tag_id(bv, tag.id as u8);
bv.push(TypeId::TagId as u8);
bv.push(tag.id as u8);
} else if tag.argument.is_null() {
write_4bit(bv, 13);
write_tag_id(bv, tag.id as u8);
write_tag_body(bv, tag.body)?;
bv.push(TypeId::TagIdBody as u8);
bv.push(tag.id as u8);
write_tag_body(bv, &tag.body)?;
} else if tag.body.is_null() {
write_4bit(bv, 14);
write_tag_id(bv, tag.id as u8);
write_tag_argument(bv, tag.argument)?;
bv.push(TypeId::TagIdArgument as u8);
bv.push(tag.id as u8);
write_tag_argument(bv, &tag.argument)?;
} else {
write_4bit(bv, 15);
write_tag_id(bv, tag.id as u8);
write_tag_body(bv, tag.body)?;
write_tag_argument(bv, tag.argument)?;
bv.push(TypeId::TagIdBodyArgument as u8);
bv.push(tag.id as u8);
write_tag_body(bv, &tag.body)?;
write_tag_argument(bv, &tag.argument)?;
}
Ok(())
}
fn write_tag_id(bv: &mut BitVec<u8, Msb0>, n: u8) {
bv.extend_from_bitslice(&n.view_bits::<Msb0>()[3..=7]);
}
fn write_tag_body(bv: &mut BitVec<u8, Msb0>, body: Body) -> Result<(), DaletPackError> {
fn write_tag_body(bv: &mut Vec<u8>, body: &Body) -> Result<(), DaletPackError> {
match body {
Body::Text(s) => write_str(bv, s)?,
Body::Tags(tags) => write_array(bv, tags)?,
Body::Null => unreachable!("This function cannot be called with this value"),
Body::Null => unreachable!("Tag cannot be called with this value"),
};
Ok(())
}
fn write_tag_argument(bv: &mut BitVec<u8, Msb0>, argument: Argument) -> Result<(), DaletPackError> {
fn write_tag_argument(bv: &mut Vec<u8>, argument: &Argument) -> Result<(), DaletPackError> {
match argument {
Argument::Text(s) => write_str(bv, s)?,
Argument::Number(n) => write_int(bv, n),
Argument::Null => unreachable!("This function cannot be called with this value"),
Argument::Number(n) => write_int(bv, *n),
Argument::Null => unreachable!("Tag cannot be called with this value"),
};
Ok(())

View file

@ -1,6 +1,24 @@
use num_enum::TryFromPrimitive;
/// Failures that Daletpack encoding can report.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum DaletPackError {
    /// A string is longer than the encoder's size limit.
    StrMaxSizeExceeded,
    /// A tag array holds more elements than the encoder's size limit.
    ArrMaxSizeExceeded,
    /// The root tag list holds more elements than the encoder's size limit.
    RootMaxSizeExceeded,
    /// NOTE(review): presumably signals a failed zstd compression; the only visible use is in
    /// commented-out code in the encoder — confirm before relying on it.
    ZstdCompressError,
}
/// One-byte type tags that prefix every value in the Daletpack byte stream.
///
/// Every discriminant is written out explicitly because these numbers are the wire format:
/// reordering or inserting variants must never silently renumber them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, TryFromPrimitive)]
#[repr(u8)]
pub enum TypeId {
    /// One-byte integer.
    Int8 = 1,
    /// String whose length field is a single byte.
    Str8 = 4,
    /// String whose length field is a big-endian `u16`.
    Str16 = 5,
    /// String whose length field is a big-endian `u32`.
    Str32 = 6,
    /// Opens a sequence of tags.
    TagArray = 7,
    /// Closes the sequence opened by `TagArray`.
    TagArrayEnd = 8,
    /// Tag that carries only its id.
    TagId = 9,
    /// Tag id followed by a body.
    TagIdBody = 10,
    /// Tag id followed by an argument.
    TagIdArgument = 11,
    /// Tag id followed by a body and then an argument.
    TagIdBodyArgument = 12,
}

View file

@ -1,9 +1,3 @@
use bitvec::{order::Msb0, prelude::BitVec, view::BitView};
pub fn write_3bit(bv: &mut BitVec<u8, Msb0>, n: u8) {
bv.extend_from_bitslice(&n.view_bits::<Msb0>()[5..=7]);
}
pub fn write_4bit(bv: &mut BitVec<u8, Msb0>, n: u8) {
bv.extend_from_bitslice(&n.view_bits::<Msb0>()[4..=7]);
/// Compresses `data` with zstd's bulk API at compression level 5.
///
/// # Errors
/// Propagates the `std::io::Error` returned by `zstd::bulk::compress`.
pub fn compress_zstd(data: &Vec<u8>) -> std::io::Result<Vec<u8>> {
    let level = 5;
    zstd::bulk::compress(data.as_slice(), level)
}

View file

@ -1,21 +1 @@
use std::fs;
use dalet::{
abstractions::{HeadingLevel, Tag, ToDaletl},
daletpack::*,
};
fn main() {
let dalet_page: Vec<Tag> = vec![Tag::H("I am heading".to_owned(), HeadingLevel::One)];
let data = encode(dalet_page.to_daletl()).unwrap();
println!("{:#?}", data);
println!("{}", data.len());
let bits: Vec<_> = data.iter().map(|n| format!("{:b}", n)).collect();
println!("{}", bits.join(""));
// 11010000100111011010010010010000110000101101101001000011010011001010110000101100101101001011011111001111111111
fs::write("./test.daletpack", data).unwrap();
}
fn main() {}

20
libs/rust/tests/bench.md Normal file
View file

@ -0,0 +1,20 @@
# Heading 1
## Heading 2
**Some bold and *italic* ~~text~~**
`Hello world`
- abc
- def
- defabc
- defdef
- xyz
Lorem ipsum [![](https://my-picture)](https://some-link) dolor sit amet consequetur adipiscing elit
|col1|col2|col3|
|:--:|----|---:|
|Never gonna|give you|up|
|Never gonna|let you|down|
|Never gonna|run around|and desert you|
|**abc**|![def](https://some-picture)|*xyz*|

74
libs/rust/tests/bench.rs Normal file
View file

@ -0,0 +1,74 @@
use dalet::{
abstractions::{HeadingLevel, Tag, ToDaletl},
daletpack::*,
};
use flate2::Compression;
use std::io::Write;
/// Times the evaluation of an expression and reports its size.
///
/// Evaluates `$func`, prints `$name`, the elapsed wall-clock time and the result's `.len()`
/// (labelled as bytes), then yields the result so it can be bound by the caller.
#[macro_export]
macro_rules! iprint {
    ($name:expr, $func:expr) => {{
        let timer = std::time::Instant::now();
        // The measured expression is evaluated exactly once, between the two clock reads.
        let output = $func;
        let took = timer.elapsed();
        println!("{} ({:#?}): {} bytes", $name, took, output.len());
        output
    }};
}
/// Compresses `data` into a raw DEFLATE stream at flate2's default compression level.
///
/// # Errors
/// Returns any I/O error reported by the encoder while writing or finishing the stream.
pub fn compress_deflate(data: &Vec<u8>) -> std::io::Result<Vec<u8>> {
    let mut encoder = flate2::write::DeflateEncoder::new(Vec::new(), Compression::default());
    // `Write::write` is allowed to consume only part of the buffer; `write_all` keeps
    // writing until every byte has been handed to the encoder, so input is never truncated.
    encoder.write_all(data)?;
    encoder.finish()
}
/// Compresses `data` into a zlib stream at flate2's default compression level.
///
/// # Errors
/// Returns any I/O error reported by the encoder while writing or finishing the stream.
pub fn compress_zlib(data: &Vec<u8>) -> std::io::Result<Vec<u8>> {
    let mut encoder = flate2::write::ZlibEncoder::new(Vec::new(), Compression::default());
    // `Write::write` is allowed to consume only part of the buffer; `write_all` keeps
    // writing until every byte has been handed to the encoder, so input is never truncated.
    encoder.write_all(data)?;
    encoder.finish()
}
/// Prints a size/time comparison of Daletpack, MessagePack and Bincode encodings of a
/// 500-heading page, first uncompressed and then after zstd, zlib and deflate compression.
///
/// The test only prints measurements; it fails only if an encoder or compressor errors.
#[test]
fn bench() {
    let mut page: Vec<Tag> = Vec::with_capacity(500);
    page.extend((0..500).map(|i| Tag::H(format!("{}. Heading", i), HeadingLevel::One)));

    let dalet_page = page.to_daletl();

    let daletpack = iprint!("Daletpack", encode(&dalet_page).unwrap());
    let messagepack = iprint!("Messagepack", rmp_serde::to_vec(&dalet_page).unwrap());
    let bincode = iprint!("Bincode", bincode::serialize(&dalet_page).unwrap());

    // Each compression pass below runs over the same three encodings, in the same order.
    let encodings = [
        ("Daletpack", &daletpack),
        ("Messagepack", &messagepack),
        ("Bincode", &bincode),
    ];

    println!();
    for &(label, bytes) in encodings.iter() {
        iprint!(
            format!("{} zstd", label),
            utils::compress_zstd(bytes).unwrap()
        );
    }

    println!();
    for &(label, bytes) in encodings.iter() {
        iprint!(format!("{} Zlib", label), compress_zlib(bytes).unwrap());
    }

    println!();
    for &(label, bytes) in encodings.iter() {
        iprint!(
            format!("{} deflate", label),
            compress_deflate(bytes).unwrap()
        );
    }
}