migrate grammar fetching/building code into helix-loader crate

This is a rather large refactor that moves most of the code for
loading, fetching, and building grammars into a new helix-loader
crate. This works well with the [[grammars]] syntax for
languages.toml defined earlier: we only have to depend on the types
for GrammarConfiguration in helix-loader and can leave all the
[[language]] entries for helix-core.
This commit is contained in:
Michael Davis 2022-02-16 07:57:20 -06:00 committed by Blaž Hrastnik
parent 08ee949dcb
commit 4fc991fdec
23 changed files with 419 additions and 374 deletions

23
helix-loader/Cargo.toml Normal file
View file

@ -0,0 +1,23 @@
[package]
name = "helix-loader"
version = "0.6.0"
description = "A post-modern text editor."
authors = ["Blaž Hrastnik <blaz@mxxn.io>"]
edition = "2021"
license = "MPL-2.0"
categories = ["editor"]
repository = "https://github.com/helix-editor/helix"
homepage = "https://helix-editor.com"
[dependencies]
anyhow = "1"
serde = { version = "1.0", features = ["derive"] }
toml = "0.5"
etcetera = "0.3"
tree-sitter = "0.20"
libloading = "0.7"
once_cell = "1.9"
# cloning/compiling tree-sitter grammars
cc = { version = "1" }
threadpool = { version = "1.0" }

6
helix-loader/build.rs Normal file
View file

@ -0,0 +1,6 @@
/// Build script: forwards the compilation target to the crate as the
/// `BUILD_TARGET` compile-time environment variable.
fn main() {
    // The build aborts (panics) if `TARGET` is not present in the environment.
    let target = std::env::var("TARGET").unwrap();
    println!("cargo:rustc-env=BUILD_TARGET={}", target);
}

388
helix-loader/src/grammar.rs Normal file
View file

@ -0,0 +1,388 @@
use anyhow::{anyhow, Context, Result};
use libloading::{Library, Symbol};
use serde::{Deserialize, Serialize};
use std::fs;
use std::time::SystemTime;
use std::{
collections::HashSet,
path::{Path, PathBuf},
process::Command,
sync::mpsc::channel,
};
use tree_sitter::Language;
// File extension given to compiled grammar libraries when building for a
// `unix` target.
#[cfg(unix)]
const DYLIB_EXTENSION: &str = "so";
// File extension given to compiled grammar libraries when building for a
// `windows` target.
#[cfg(windows)]
const DYLIB_EXTENSION: &str = "dll";
/// The parts of the (merged) `languages.toml` that grammar handling reads.
#[derive(Debug, Serialize, Deserialize)]
struct Configuration {
/// Optional `use-grammars` key that narrows down which grammars are used.
#[serde(rename = "use-grammars")]
pub grammar_selection: Option<GrammarSelection>,
/// Every grammar listed under the `grammar` key.
pub grammar: Vec<GrammarConfiguration>,
}
/// A user-chosen subset of the configured grammars, identified by grammar name.
///
/// NOTE(review): the enum is `untagged` and both variants wrap the same
/// `HashSet<String>` payload. With serde's untagged representation the first
/// variant that deserializes successfully wins, so `Except` looks unreachable
/// when deserializing — confirm, and consider distinct field names per variant.
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", untagged)]
pub enum GrammarSelection {
/// Keep only the grammars whose name is in the set.
Only(HashSet<String>),
/// Keep every grammar except those whose name is in the set.
Except(HashSet<String>),
}
/// One grammar entry from `languages.toml`. Unknown keys are rejected.
#[derive(Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct GrammarConfiguration {
/// Identifier of the grammar; stored under the `name` key in the config.
/// Also used as the directory name for fetched sources and as the file stem
/// of the compiled library.
#[serde(rename = "name")]
pub grammar_id: String,
/// Where the grammar's source code comes from.
pub source: GrammarSource,
}
/// Location of a grammar's source code.
///
/// The enum is `untagged`: the variant is picked from the keys that are
/// present (`path` versus `git`/`rev`).
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", untagged)]
pub enum GrammarSource {
/// Sources that already exist on the local filesystem; never fetched.
Local {
/// Directory containing the grammar (its `src` folder is compiled).
path: String,
},
/// Sources that are fetched from a git repository.
Git {
/// URL of the remote repository (config key `git`).
#[serde(rename = "git")]
remote: String,
/// Revision to fetch and check out (config key `rev`).
#[serde(rename = "rev")]
revision: String,
/// Optional directory inside the repository that holds the grammar.
subpath: Option<String>,
},
}
// Value of the `BUILD_TARGET` environment variable at compile time; passed
// to the C/C++ compiler configuration as both host and target.
const BUILD_TARGET: &str = env!("BUILD_TARGET");
// Name of the git remote used for every fetched grammar repository.
const REMOTE_NAME: &str = "origin";
/// Loads the compiled grammar `name` from the runtime directory's `grammars`
/// folder and returns the tree-sitter [`Language`] it exports.
///
/// The name is lowercased before use. The library is expected to export a
/// `tree_sitter_<name>` function, with `-` in the name replaced by `_`.
///
/// # Errors
///
/// Fails when the dynamic library cannot be opened or does not export the
/// expected symbol.
pub fn get_language(name: &str) -> Result<Language> {
    let grammar_name = name.to_ascii_lowercase();
    let language_fn_name = format!("tree_sitter_{}", grammar_name.replace('-', "_"));

    let mut library_path = crate::runtime_dir()
        .join("grammars")
        .join(&grammar_name);
    library_path.set_extension(DYLIB_EXTENSION);

    let library = unsafe { Library::new(&library_path) }
        .with_context(|| format!("Error opening dynamic library {library_path:?}"))?;

    let language = unsafe {
        let constructor: Symbol<unsafe extern "C" fn() -> Language> = library
            .get(language_fn_name.as_bytes())
            .with_context(|| format!("Failed to load symbol {language_fn_name}"))?;
        constructor()
    };

    // The handle is deliberately never dropped — presumably so the code
    // backing `language` stays loaded for the rest of the process.
    std::mem::forget(library);
    Ok(language)
}
/// Fetches the sources of every selected grammar, running the fetches in
/// parallel.
pub fn fetch_grammars() -> Result<()> {
    let grammars = get_grammar_configs()?;
    run_parallel(grammars, fetch_grammar, "fetch")
}
/// Compiles every selected grammar into a dynamic library, running the
/// builds in parallel.
pub fn build_grammars() -> Result<()> {
    let grammars = get_grammar_configs()?;
    run_parallel(grammars, build_grammar, "build")
}
// Returns the set of grammar configurations the user requests.
// Grammars are configured in the default and user `languages.toml` and are
// merged. The `grammar_selection` key of the config is then used to filter
// down all grammars into a subset of the user's choosing.
fn get_grammar_configs() -> Result<Vec<GrammarConfiguration>> {
let config: Configuration = crate::user_lang_config()
.context("Could not parse languages.toml")?
.try_into()?;
let grammars = match config.grammar_selection {
Some(GrammarSelection::Only(selections)) => config
.grammar
.into_iter()
.filter(|grammar| selections.contains(&grammar.grammar_id))
.collect(),
Some(GrammarSelection::Except(rejections)) => config
.grammar
.into_iter()
.filter(|grammar| !rejections.contains(&grammar.grammar_id))
.collect(),
None => config.grammar,
};
Ok(grammars)
}
// Runs `job` once per grammar on a thread pool and waits for all of them.
//
// `action` is a verb ("fetch", "build", ...) that is only used in the error
// message. Every job's result is sent back over a channel; after the pool has
// been joined, all failures are gathered and reported together, one per line.
// The order of the reported failures follows job completion order.
//
// NOTE(review): a job that panics never sends a result, so it does not show
// up as a failure here — confirm whether that should be surfaced.
fn run_parallel<F>(grammars: Vec<GrammarConfiguration>, job: F, action: &'static str) -> Result<()>
where
    F: Fn(GrammarConfiguration) -> Result<()> + std::marker::Send + 'static + Copy,
{
    let pool = threadpool::Builder::new().build();
    let (tx, rx) = channel();

    for grammar in grammars {
        let tx = tx.clone();
        pool.execute(move || {
            tx.send(job(grammar)).unwrap();
        });
    }
    pool.join();

    // All jobs have finished, so every result is already queued and a
    // non-blocking drain sees all of them.
    let failures: Vec<String> = rx
        .try_iter()
        .filter_map(|result| result.err())
        .map(|failure| failure.to_string())
        .collect();

    if failures.is_empty() {
        Ok(())
    } else {
        Err(anyhow!(
            "Failed to {} some grammar(s).\n{}",
            action,
            failures.join("\n")
        ))
    }
}
fn fetch_grammar(grammar: GrammarConfiguration) -> Result<()> {
if let GrammarSource::Git {
remote, revision, ..
} = grammar.source
{
let grammar_dir = crate::runtime_dir()
.join("grammars/sources")
.join(&grammar.grammar_id);
fs::create_dir_all(&grammar_dir).context(format!(
"Could not create grammar directory {:?}",
grammar_dir
))?;
// create the grammar dir contains a git directory
if !grammar_dir.join(".git").is_dir() {
git(&grammar_dir, ["init"])?;
}
// ensure the remote matches the configured remote
if get_remote_url(&grammar_dir).map_or(true, |s| s != remote) {
set_remote(&grammar_dir, &remote)?;
}
// ensure the revision matches the configured revision
if get_revision(&grammar_dir).map_or(true, |s| s != revision) {
// Fetch the exact revision from the remote.
// Supported by server-side git since v2.5.0 (July 2015),
// enabled by default on major git hosts.
git(&grammar_dir, ["fetch", REMOTE_NAME, &revision])?;
git(&grammar_dir, ["checkout", &revision])?;
println!(
"Grammar '{}' checked out at '{}'.",
grammar.grammar_id, revision
);
Ok(())
} else {
println!("Grammar '{}' is already up to date.", grammar.grammar_id);
Ok(())
}
} else {
println!("Skipping local grammar '{}'", grammar.grammar_id);
Ok(())
}
}
// Points the `REMOTE_NAME` remote of a repository at `remote_url`.
// Updating the URL is tried first; if that fails (for whatever reason) the
// remote is added instead.
fn set_remote(repository_dir: &Path, remote_url: &str) -> Result<String> {
    let updated = git(
        repository_dir,
        ["remote", "set-url", REMOTE_NAME, remote_url],
    );

    match updated {
        Ok(stdout) => Ok(stdout),
        Err(_) => git(repository_dir, ["remote", "add", REMOTE_NAME, remote_url]),
    }
}
// Returns the URL configured for the `REMOTE_NAME` remote, or `None` when
// the git command fails.
fn get_remote_url(repository_dir: &Path) -> Option<String> {
    let args = ["remote", "get-url", REMOTE_NAME];
    git(repository_dir, args).ok()
}
// Returns what `git rev-parse HEAD` prints for the repository, or `None`
// when the git command fails.
fn get_revision(repository_dir: &Path) -> Option<String> {
    let args = ["rev-parse", "HEAD"];
    git(repository_dir, args).ok()
}
// A wrapper around 'git' commands which returns stdout in success and a
// helpful error message showing the command, stdout, and stderr in error.
//
// `repository_dir` is used as the working directory of the command and `args`
// are passed to `git` unchanged. On success the returned stdout has trailing
// whitespace removed. Failing to launch `git` at all is returned as the
// underlying I/O error.
fn git<I, S>(repository_dir: &Path, args: I) -> Result<String>
where
    I: IntoIterator<Item = S>,
    S: AsRef<std::ffi::OsStr>,
{
    // Materialise the arguments so they can be handed to the command and
    // also echoed in the error message.
    let args: Vec<std::ffi::OsString> = args
        .into_iter()
        .map(|arg| arg.as_ref().to_os_string())
        .collect();

    let output = Command::new("git")
        .args(&args)
        .current_dir(repository_dir)
        .output()?;

    if output.status.success() {
        Ok(String::from_utf8_lossy(&output.stdout)
            .trim_end()
            .to_owned())
    } else {
        let command_line = args
            .iter()
            .map(|arg| arg.to_string_lossy())
            .collect::<Vec<_>>()
            .join(" ");
        Err(anyhow!(
            "Git command failed.\nCommand: git {}\nStdout: {}\nStderr: {}",
            command_line,
            String::from_utf8_lossy(&output.stdout),
            String::from_utf8_lossy(&output.stderr),
        ))
    }
}
// Locates the sources of `grammar` and compiles them into a dynamic library.
//
// Local grammars are built from their configured path; git grammars from
// `<runtime>/grammars/sources/<grammar_id>`, optionally descending into the
// configured `subpath`. In both cases the `src` directory below that location
// is what gets compiled.
//
// Fails when the source directory cannot be read or has no entries, which
// usually means the grammar has not been fetched yet.
fn build_grammar(grammar: GrammarConfiguration) -> Result<()> {
    let grammar_dir = match &grammar.source {
        GrammarSource::Local { path } => PathBuf::from(path),
        GrammarSource::Git { .. } => crate::runtime_dir()
            .join("grammars/sources")
            .join(&grammar.grammar_id),
    };

    let mut grammar_dir_entries = grammar_dir.read_dir().with_context(|| {
        format!("Failed to read directory {grammar_dir:?}. Did you use 'hx --fetch-grammars'?")
    })?;

    // Only emptiness matters, so looking at the first entry is enough.
    if grammar_dir_entries.next().is_none() {
        return Err(anyhow!(
            "Directory {grammar_dir:?} is empty. Did you use 'hx --fetch-grammars'?"
        ));
    }

    let path = match &grammar.source {
        GrammarSource::Git {
            subpath: Some(subpath),
            ..
        } => grammar_dir.join(subpath),
        _ => grammar_dir,
    }
    .join("src");

    build_tree_sitter_library(&path, grammar)
}
// Compiles the tree-sitter sources in `src_path` into
// `<runtime>/grammars/<grammar_id>.<DYLIB_EXTENSION>`.
//
// `src_path` must contain `parser.c`; a `scanner.c` or `scanner.cc` next to
// it is compiled in as well when present (`scanner.c` is preferred). Nothing
// is compiled when the existing library is not older than its sources.
//
// Fails when timestamps cannot be compared, when the compiler cannot be
// started, or when the compiler exits unsuccessfully (its stdout and stderr
// are included in the error).
fn build_tree_sitter_library(src_path: &Path, grammar: GrammarConfiguration) -> Result<()> {
    let header_path = src_path;
    let parser_path = src_path.join("parser.c");
    let scanner_path = ["scanner.c", "scanner.cc"]
        .iter()
        .map(|file_name| src_path.join(file_name))
        .find(|candidate| candidate.exists());

    let parser_lib_path = crate::runtime_dir().join("grammars");
    let mut library_path = parser_lib_path.join(&grammar.grammar_id);
    library_path.set_extension(DYLIB_EXTENSION);

    let recompile = needs_recompile(&library_path, &parser_path, &scanner_path)
        .context("Failed to compare source and binary timestamps")?;

    if !recompile {
        println!("Grammar '{}' is already built.", grammar.grammar_id);
        return Ok(());
    }

    println!("Building grammar '{}'", grammar.grammar_id);

    // `cc` is only used to locate the compiler and its environment; the
    // actual command line is assembled by hand below.
    let mut config = cc::Build::new();
    config
        .cpp(true)
        .opt_level(3)
        .cargo_metadata(false)
        .host(BUILD_TARGET)
        .target(BUILD_TARGET);
    let compiler = config.get_compiler();
    let mut command = Command::new(compiler.path());
    command.current_dir(src_path);
    for (key, value) in compiler.env() {
        command.env(key, value);
    }

    if cfg!(windows) {
        // Assemble "/out:<path>" as an OsString so that a library path which
        // is not valid UTF-8 does not cause a panic.
        let mut out_arg = std::ffi::OsString::from("/out:");
        out_arg.push(&library_path);

        command
            .args(&["/nologo", "/LD", "/I"])
            .arg(header_path)
            .arg("/Od")
            .arg("/utf-8");
        if let Some(scanner_path) = scanner_path.as_ref() {
            command.arg(scanner_path);
        }

        command.arg(parser_path).arg("/link").arg(out_arg);
    } else {
        command
            .arg("-shared")
            .arg("-fPIC")
            .arg("-fno-exceptions")
            .arg("-g")
            .arg("-I")
            .arg(header_path)
            .arg("-o")
            .arg(&library_path)
            .arg("-O3");
        if let Some(scanner_path) = scanner_path.as_ref() {
            // A C scanner is compiled as C99; any other scanner is handed to
            // the compiler without a language override.
            if scanner_path.extension() == Some("c".as_ref()) {
                command.arg("-xc").arg("-std=c99").arg(scanner_path);
            } else {
                command.arg(scanner_path);
            }
        }
        // The parser is always compiled as C.
        command.arg("-xc").arg(parser_path);
        if cfg!(all(unix, not(target_os = "macos"))) {
            command.arg("-Wl,-z,relro,-z,now");
        }
    }

    let output = command.output().context("Failed to execute C compiler")?;
    if !output.status.success() {
        return Err(anyhow!(
            "Parser compilation failed.\nStdout: {}\nStderr: {}",
            String::from_utf8_lossy(&output.stdout),
            String::from_utf8_lossy(&output.stderr)
        ));
    }

    Ok(())
}
// Decides whether the compiled library at `lib_path` is out of date.
//
// A rebuild is needed when the library does not exist, or when `parser.c` or
// the optional scanner source was modified after the library was. Sources are
// checked in that order and the first newer one settles the answer.
// Fails when a modification time that is needed cannot be read.
fn needs_recompile(
    lib_path: &Path,
    parser_c_path: &Path,
    scanner_path: &Option<PathBuf>,
) -> Result<bool> {
    if !lib_path.exists() {
        return Ok(true);
    }

    let lib_mtime = mtime(lib_path)?;

    let parser_is_newer = mtime(parser_c_path)? > lib_mtime;
    if parser_is_newer {
        return Ok(true);
    }

    match scanner_path {
        Some(scanner) => Ok(mtime(scanner)? > lib_mtime),
        None => Ok(false),
    }
}
// Returns the last-modification time recorded for `path`.
// Fails when the metadata cannot be read or carries no modification time.
fn mtime(path: &Path) -> Result<SystemTime> {
    let metadata = fs::metadata(path)?;
    let modified = metadata.modified()?;
    Ok(modified)
}
/// Reads `filename` from the `queries/<language>` directory below the
/// runtime directory and returns its contents as a string.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be read.
pub fn load_runtime_file(language: &str, filename: &str) -> Result<String, std::io::Error> {
    let mut path = crate::RUNTIME_DIR.join("queries");
    path.push(language);
    path.push(filename);
    std::fs::read_to_string(&path)
}

161
helix-loader/src/lib.rs Normal file
View file

@ -0,0 +1,161 @@
pub mod grammar;
use etcetera::base_strategy::{choose_base_strategy, BaseStrategy};
/// The runtime directory, initialised lazily from [`runtime_dir`].
pub static RUNTIME_DIR: once_cell::sync::Lazy<std::path::PathBuf> =
once_cell::sync::Lazy::new(runtime_dir);
/// Determines the directory that holds runtime files.
///
/// Candidates, in order of preference:
/// 1. the value of the `HELIX_RUNTIME` environment variable,
/// 2. `runtime` inside the config directory, if it exists,
/// 3. `runtime` next to the directory named by `CARGO_MANIFEST_DIR`,
/// 4. `runtime` next to the running executable.
///
/// # Panics
///
/// Panics when `CARGO_MANIFEST_DIR` has no parent directory, or when the
/// last fallback is reached and the executable's location (or its parent)
/// cannot be determined.
pub fn runtime_dir() -> std::path::PathBuf {
    const RT_DIR: &str = "runtime";

    if let Ok(dir) = std::env::var("HELIX_RUNTIME") {
        return std::path::PathBuf::from(dir);
    }

    let user_runtime = config_dir().join(RT_DIR);
    if user_runtime.exists() {
        return user_runtime;
    }

    if let Ok(manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
        // this is the directory of the crate being run by cargo, we need the
        // workspace path so we take the parent
        let crate_dir = std::path::PathBuf::from(manifest_dir);
        return crate_dir.parent().unwrap().join(RT_DIR);
    }

    // fallback to location of the executable being run
    let executable = std::env::current_exe().ok();
    executable
        .as_deref()
        .and_then(std::path::Path::parent)
        .map(|exe_dir| exe_dir.join(RT_DIR))
        .unwrap()
}
/// Returns the `helix` directory inside the base strategy's config directory.
///
/// # Panics
///
/// Panics when no base strategy can be chosen.
pub fn config_dir() -> std::path::PathBuf {
    // TODO: allow env var override
    let base = choose_base_strategy().expect("Unable to find the config directory!");
    base.config_dir().join("helix")
}
/// Returns the `helix` directory inside the base strategy's cache directory.
///
/// # Panics
///
/// Panics when no base strategy can be chosen.
pub fn cache_dir() -> std::path::PathBuf {
    // TODO: allow env var override
    let strategy = choose_base_strategy().expect("Unable to find the cache directory!");
    let mut path = strategy.cache_dir();
    path.push("helix");
    path
}
/// Path of `config.toml` inside the config directory.
pub fn config_file() -> std::path::PathBuf {
    let mut path = config_dir();
    path.push("config.toml");
    path
}
/// Path of the user's `languages.toml` inside the config directory.
pub fn lang_config_file() -> std::path::PathBuf {
    let mut path = config_dir();
    path.push("languages.toml");
    path
}
/// Path of `helix.log` inside the cache directory.
pub fn log_file() -> std::path::PathBuf {
    let mut path = cache_dir();
    path.push("helix.log");
    path
}
/// Parses the built-in `languages.toml` that is embedded into the binary at
/// compile time.
///
/// # Panics
///
/// Panics when the embedded file is not valid TOML.
pub fn default_lang_config() -> toml::Value {
    let embedded: &[u8] = include_bytes!("../../languages.toml");
    toml::from_slice(embedded).expect("Could not parse bultin-in languages.toml to valid toml")
}
/// User configured languages.toml file, merged with the default config.
pub fn user_lang_config() -> Result<toml::Value, toml::de::Error> {
let def_lang_conf = default_lang_config();
let data = std::fs::read(crate::config_dir().join("languages.toml"));
let user_lang_conf = match data {
Ok(raw) => {
let value = toml::from_slice(&raw)?;
merge_toml_values(def_lang_conf, value)
}
Err(_) => def_lang_conf,
};
Ok(user_lang_conf)
}
/// Recursively merges two TOML values; where both sides define something,
/// `right` takes precedence over `left`.
///
/// * Two tables are merged key by key.
/// * Two arrays are merged element-wise: a right element whose `name` equals
///   the `name` of a left element is merged with it, any other right element
///   is appended. A merged element is taken out of its old position and
///   placed at the end of the array.
/// * For every other combination the right value replaces the left one.
pub fn merge_toml_values(left: toml::Value, right: toml::Value) -> toml::Value {
    use toml::Value;

    fn get_name(v: &Value) -> Option<&str> {
        v.get("name").and_then(Value::as_str)
    }

    match (left, right) {
        (Value::Array(mut merged_items), Value::Array(overrides)) => {
            merged_items.reserve(overrides.len());

            for incoming in overrides {
                // Index of the left element sharing this element's name, if any.
                let existing_index = match get_name(&incoming) {
                    Some(name) => merged_items
                        .iter()
                        .position(|item| get_name(item) == Some(name)),
                    None => None,
                };

                let merged = match existing_index {
                    Some(index) => merge_toml_values(merged_items.remove(index), incoming),
                    None => incoming,
                };
                merged_items.push(merged);
            }

            Value::Array(merged_items)
        }
        (Value::Table(mut merged_map), Value::Table(overrides)) => {
            for (key, incoming) in overrides {
                let merged = match merged_map.remove(&key) {
                    Some(existing) => merge_toml_values(existing, incoming),
                    None => incoming,
                };
                merged_map.insert(key, merged);
            }

            Value::Table(merged_map)
        }
        // Catch everything else we didn't handle, and use the right value
        (_, value) => value,
    }
}
// Tests for `merge_toml_values`, run against the built-in `languages.toml`.
#[cfg(test)]
mod merge_toml_tests {
use super::merge_toml_values;
// Merges a user override for the `nix` language into the built-in
// configuration and checks that overridden, added and untouched keys all
// end up with the expected values.
#[test]
fn language_tomls() {
use toml::Value;
// User configuration: overrides parts of `indent` and adds two `test` keys.
const USER: &str = "
[[language]]
name = \"nix\"
test = \"bbb\"
indent = { tab-width = 4, unit = \" \", test = \"aaa\" }
";
let base: Value = toml::from_slice(include_bytes!("../../languages.toml"))
.expect("Couldn't parse built-in languages config");
let user: Value = toml::from_str(USER).unwrap();
let merged = merge_toml_values(base, user);
let languages = merged.get("language").unwrap().as_array().unwrap();
// Locate the merged `nix` entry by name.
let nix = languages
.iter()
.find(|v| v.get("name").unwrap().as_str().unwrap() == "nix")
.unwrap();
let nix_indent = nix.get("indent").unwrap();
// We changed tab-width and unit in indent, so check that they hold the new values
assert_eq!(
nix_indent.get("tab-width").unwrap().as_integer().unwrap(),
4
);
assert_eq!(nix_indent.get("unit").unwrap().as_str().unwrap(), " ");
// We added new keys, so check them
assert_eq!(nix.get("test").unwrap().as_str().unwrap(), "bbb");
assert_eq!(nix_indent.get("test").unwrap().as_str().unwrap(), "aaa");
// We didn't change comment-token, so it should be unchanged
assert_eq!(nix.get("comment-token").unwrap().as_str().unwrap(), "#");
}
}