C grammar for the R treesitter package.
Installation
# Install the development version of treesitter.c from r-universe
install.packages('treesitter.c', repos = c('https://sounkou-bioinfo.r-universe.dev', 'https://cloud.r-project.org'))
# or the cran release
install.packages('treesitter.c', repos = 'https://cloud.r-project.org')
Usage
library(treesitter)
#>
#> Attaching package: 'treesitter'
#> The following object is masked from 'package:base':
#>
#> range
library(treesitter.c)
c_language <- language()
parser <- parser(c_language)
code <- "
struct Point {
int x[MAX_SIZE];
int y;
};
"
tree <- parser_parse(parser, code)
tree
#> <tree_sitter_tree>
#>
#> ── Text ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
#> struct Point {
#> int x[MAX_SIZE];
#> int y;
#> };
#>
#>
#> ── S-Expression ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
#> (translation_unit [(1, 0), (5, 0)]
#> (struct_specifier [(1, 0), (4, 1)]
#> "struct" [(1, 0), (1, 6)]
#> name: (type_identifier [(1, 7), (1, 12)])
#> body: (field_declaration_list [(1, 13), (4, 1)]
#> "{" [(1, 13), (1, 14)]
#> (field_declaration [(2, 2), (2, 18)]
#> type: (primitive_type [(2, 2), (2, 5)])
#> declarator: (array_declarator [(2, 6), (2, 17)]
#> declarator: (field_identifier [(2, 6), (2, 7)])
#> "[" [(2, 7), (2, 8)]
#> size: (identifier [(2, 8), (2, 16)])
#> "]" [(2, 16), (2, 17)]
#> )
#> ";" [(2, 17), (2, 18)]
#> )
#> (field_declaration [(3, 2), (3, 8)]
#> type: (primitive_type [(3, 2), (3, 5)])
#> declarator: (field_identifier [(3, 6), (3, 7)])
#> ";" [(3, 7), (3, 8)]
#> )
#> "}" [(4, 0), (4, 1)]
#> )
#> )
#> ";" [(4, 1), (4, 2)]
#> <truncated>
Preprocessing and header parsing
If you have a C compiler available and want to preprocess macros (recommended for headers that use macros), enable preprocess = TRUE. Prefer the helper r_cc() to detect the compiler automatically.
# Check for a compiler and use include_dirs so the preprocessor can find nested headers
cc <- treesitter.c::r_cc()
hdr_df_pp <- parse_r_include_headers(
dir = R.home("include"),
preprocess = TRUE,
include_dirs = R.home("include")
)
hdr_df_pp[grepl("Rf", x = hdr_df_pp$name), ] |> head(10)
#> name file line kind
#> 1483 Rf_error /usr/share/R/include/R_ext/Callbacks.h 2522 declaration
#> 1486 Rf_warning /usr/share/R/include/R_ext/Callbacks.h 2528 declaration
#> 1495 Rf_revsort /usr/share/R/include/R_ext/Callbacks.h 2567 declaration
#> 1496 Rf_iPsort /usr/share/R/include/R_ext/Callbacks.h 2568 declaration
#> 1497 Rf_rPsort /usr/share/R/include/R_ext/Callbacks.h 2569 declaration
#> 1498 Rf_cPsort /usr/share/R/include/R_ext/Callbacks.h 2570 declaration
#> 1503 Rf_StringFalse /usr/share/R/include/R_ext/Callbacks.h 2586 declaration
#> 1504 Rf_StringTrue /usr/share/R/include/R_ext/Callbacks.h 2587 declaration
#> 1505 Rf_isBlankString /usr/share/R/include/R_ext/Callbacks.h 2588 declaration
#> 1557 Rf_asChar /usr/share/R/include/R_ext/Callbacks.h 2922 declaration
You can use the preprocess_header function with extra compiler options to avoid system includes and use the bundled fake libc headers instead. This avoids the bloat that system includes add to the preprocessed output.
# Path to a header file to preprocess
header_file <- file.path(R.home("include"), "Rinternals.h")
# Get the path to the fake libc headers
fake_libc <- fake_libc_path()
# Preprocess with -nostdinc and -I pointing to fake_libc
preprocessed <- preprocess_header(
file = header_file,
cc = r_cc(),
ccflags = paste0("-I", fake_libc),
"-nostdinc"
)
cat(substr(preprocessed, 1, 500))
#> # 0 "/usr/share/R/include/Rinternals.h"
#> # 0 "<built-in>"
#> # 0 "<command-line>"
#> # 1 "/usr/share/R/include/Rinternals.h"
#> # 38 "/usr/share/R/include/Rinternals.h"
#> # 1 "/usr/local/lib/R/site-library/treesitter.c/fake_libc/stdio.h" 1
#> # 1 "/usr/local/lib/R/site-library/treesitter.c/fake_libc/_fake_defines.h" 1
#> # 2 "/usr/local/lib/R/site-library/treesitter.c/fake_libc/stdio.h" 2
#> # 1 "/usr/local/lib/R/site-library/treesitter.c/fake_libc/_fake_typedefs.h" 1
#>
#>
#>
#> typedef int size_t;
#> typedef int __builtin_va_
This approach ensures only the fake libc headers are used, making preprocessing more predictable and portable.
Parsing examples
The following concise examples demonstrate extracting specific information (functions, parameters, structs, macros) using the package’s simple helpers.
Simple parse and extract functions: parse a small header string and extract functions with parameter types.
txt <- "int foo(int a, const char* s);
static inline int bar(void) { return 1; }"
# extract params and return type
root <- parse_header_text(txt)
get_function_nodes(root, extract_params = TRUE, extract_return = TRUE)
#> capture_name text start_line start_col params return_type
#> 1 decl_name foo 1 5 int, con.... int
#> 2 def_name bar 2 19 void int
get_function_nodes(root, extract_params = TRUE)
#> capture_name text start_line start_col params return_type
#> 1 decl_name foo 1 5 int, con.... <NA>
#> 2 def_name bar 2 19 void <NA>
Extract function parameter and return types while parsing:
txt <- "int foo(int a, const char* s);"
root <- parse_header_text(txt)
get_function_nodes(root, extract_params = TRUE, extract_return = TRUE)
#> capture_name text start_line start_col params return_type
#> 1 decl_name foo 1 5 int, con.... int
Get structs and members:
txt <- "struct T { unsigned int x:1; int y; };"
root <- parse_header_text(txt)
get_struct_nodes(root)
#> capture_name text start_line
#> 1 struct_name T 1
get_struct_members(root)
#> struct_name member_name member_type bitfield nested_members
#> 1 T x <NA> 1 <NA>
#> 2 T y int <NA> <NA>
Collect all kinds from a directory of headers using parse_headers_collect:
res <- parse_headers_collect(dir = R.home("include"), preprocess = FALSE, extract_params = TRUE)
names(res)
#> [1] "functions" "structs" "struct_members" "enums"
#> [5] "unions" "globals" "defines"
head(res$functions)
#> file capture_name start_line start_col
#> 1 /usr/share/R/include/R_ext/Altrep.h decl_name 47 1
#> 2 /usr/share/R/include/R_ext/Altrep.h decl_name 50 1
#> 3 /usr/share/R/include/R_ext/Altrep.h decl_name 52 1
#> 4 /usr/share/R/include/R_ext/Altrep.h decl_name 54 1
#> 5 /usr/share/R/include/R_ext/Altrep.h decl_name 56 1
#> 6 /usr/share/R/include/R_ext/Altrep.h decl_name 58 1
#> params return_type name
#> 1 R_altrep.... <NA> R_new_altrep
#> 2 const ch.... <NA> R_make_altstring_class
#> 3 const ch.... <NA> R_make_altinteger_class
#> 4 const ch.... <NA> R_make_altreal_class
#> 5 const ch.... <NA> R_make_altlogical_class
#> 6 const ch.... <NA> R_make_altraw_class
# Optional: inspect macros from a single header
path <- file.path(R.home("include"), "Rembedded.h")
defs <- get_defines_from_file(path, use_cpp = TRUE, ccflags = paste("-I", dirname(path)))
head(defs)
#> [1] "__DBL_MIN_EXP__" "__UINT_LEAST16_MAX__"
#> [3] "_STDBOOL_H" "__FLT16_HAS_QUIET_NAN__"
#> [5] "__ATOMIC_ACQUIRE" "__FLT128_MAX_10_EXP__"
Details On the Used Grammar
Tree-sitter ABI version 14, compatible with treesitter package version 0.3.0. The C grammar source used for bootstrapping was downloaded from https://github.com/tree-sitter/tree-sitter-c. The pre-generated parser.c from upstream is ~3.7 MB and contains pragma directives that trigger CRAN check warnings.
During bootstrap (bootstrap.R), all #pragma directives are automatically removed from parser.c to ensure CRAN compliance. This includes pragmas for diagnostic control and optimization settings that are not portable across compilers.