RGenomeTester4 provides:
- processx wrappers around bundled
GenomeTester4binaries - wrappers for bundled utility scripts
- stateless
NIPTmer-style scoring utilities - reference-building utilities for both paper-style and population-adapted workflows
The package focuses on reproducible file-based k-mer workflows in R while keeping the heavy k-mer operations in the proven GenomeTester4 tools.
Installation
You can install the development version from GitHub with:
# install.packages("pak")
pak::pak("sounkou-bioinfo/RGenomeTester4")Quick Start
library(RGenomeTester4)
# Locate bundled binaries
gt4_binary_path("glistmaker")
#> [1] "/usr/local/lib/R/site-library/RGenomeTester4/bin/glistmaker"
# ULCWGS-oriented defaults for reference building
niptmer_ref_profile_ulcwgs()
#> $vcf_engine
#> [1] "legacy_compatible"
#>
#> $pool_mode
#> [1] "block"
#>
#> $model
#> [1] "nbinom"
#>
#> $lower_p
#> [1] 0.01
#>
#> $upper_p
#> [1] 0.99
#>
#> $block_size
#> [1] 50
#>
#> $min_block_support
#> [1] 0.7
#>
#> $distribution_max
#> [1] 500Build Reference Panels
There are two main build strategies:
-
paper_plus_population: start from an existing chromosome panel and apply your population filters. -
from_scratch: generate chromosome lists from FASTA and then apply filters.
1) Paper + Population (recommended when you already have a baseline panel)
library(RGenomeTester4)
td <- tempfile("rgenometester4-readme-")
dir.create(td)
ref_dir <- file.path(td, "ref")
dir.create(ref_dir)
chr1_fa <- file.path(ref_dir, "1.fa")
chr2_fa <- file.path(ref_dir, "2.fa")
writeLines(c(
">1",
"ACGTTGCAAGTCCTAGGATCCTTACGATCGTA"
), chr1_fa)
writeLines(c(
">2",
"TTGACCGATTAACCGGTTAGCATGGCATTAAG"
), chr2_fa)
base_dir <- file.path(td, "base_panel")
dir.create(base_dir)
stopifnot(glistmaker(c(chr1_fa, "-w", "5", "-o", file.path(base_dir, "1")))$status == 0L)
stopifnot(glistmaker(c(chr2_fa, "-w", "5", "-o", file.path(base_dir, "2")))$status == 0L)
genome_fa <- file.path(td, "genome.fa")
writeLines(c(
">1",
"ACGTTGCAAGTCCTAGGATCCTTACGATCGTA",
">2",
"TTGACCGATTAACCGGTTAGCATGGCATTAAG"
), genome_fa)
vcf_path <- file.path(td, "1.vcf")
writeLines(c(
"##fileformat=VCFv4.2",
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
"1\t6\trs1\tG\tA\t.\t.\tCAF=0.90,0.10"
), vcf_path)
bed_path <- file.path(td, "regions.bed")
writeLines(c("1\t0\t8", "2\t10\t20"), bed_path)
ctl1_fa <- file.path(td, "ctl1.fa")
ctl2_fa <- file.path(td, "ctl2.fa")
writeLines(c(
">ctl1",
"ACGTTGCAAGTCCTAGGATCCTTACGATCGTAACGTTGCAA"
), ctl1_fa)
writeLines(c(
">ctl2",
"TTGACCGATTAACCGGTTAGCATGGCATTAAGTTGACCGAT"
), ctl2_fa)
stopifnot(glistmaker(c(ctl1_fa, "-w", "5", "-o", file.path(td, "ctl1")))$status == 0L)
stopifnot(glistmaker(c(ctl2_fa, "-w", "5", "-o", file.path(td, "ctl2")))$status == 0L)
control_lists <- c(file.path(td, "ctl1_5.list"), file.path(td, "ctl2_5.list"))
profile <- niptmer_ref_profile_ulcwgs()
profile$block_size <- 2
panel <- niptmer_ref_build(
strategy = "paper_plus_population",
base_panel_dir = base_dir,
reference_fasta_dir = ref_dir,
genome_fasta = genome_fa,
pop_vcf_files = vcf_path,
pop_bed_files = bed_path,
control_lists = control_lists,
k = 5,
chr_ids = c("1", "2"),
profile = profile,
out_dir = file.path(td, "paper_plus")
)
niptmer_ref_qc(panel)
#> chr
#> 1 1
#> 2 2
#> list_path
#> 1 /tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/paper_plus/panel_stable/1_stable_5_intrsec.list
#> 2 /tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/paper_plus/panel_stable/2_stable_5_intrsec.list
#> n_kmers gc
#> 1 15 0.466667
#> 2 20 0.4400002) From Scratch
panel <- niptmer_ref_build(
strategy = "from_scratch",
reference_fasta_dir = ref_dir,
genome_fasta = genome_fa,
pop_vcf_files = vcf_path,
control_lists = control_lists,
k = 5,
chr_ids = c("1", "2"),
vcf_engine = "legacy_compatible",
model = "nbinom",
pool_mode = "block",
block_size = 2,
min_block_support = 0.70,
out_dir = file.path(td, "from_scratch")
)
niptmer_ref_qc(panel)
#> chr
#> 1 1
#> 2 2
#> list_path
#> 1 /tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/from_scratch/panel_stable/1_stable_5_intrsec.list
#> 2 /tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/from_scratch/panel_stable/2_stable_5_intrsec.list
#> n_kmers gc
#> 1 0 NA
#> 2 0 NA
# Note: with strict uniqueness filtering (`-c 2` behavior inherited from
# GenomeTester4 workflow), tiny synthetic examples can yield very small or
# even empty final lists. Use realistic chromosome-scale references in practice.VCF Engine Modes
niptmer_ref_blacklist_from_vcf() supports two expansion modes:
-
legacy_compatible(default): overlap-aware expansion that approximates the legacyhapl_generator.plflow. -
simple: independent per-variant expansion.
bl <- niptmer_ref_blacklist_from_vcf(
vcf_files = vcf_path,
reference_fasta_dir = ref_dir,
k = 5,
min_af = 0.01,
engine = "legacy_compatible",
out_dir = file.path(td, "vcf_mode")
)
bl
#> $path
#> [1] "/tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/vcf_mode/pop_snp_blacklist_5.list"
#>
#> $k
#> [1] 5
#>
#> $engine
#> [1] "legacy_compatible"
#>
#> $variants_seen
#> [1] 1
#>
#> $variant_alleles_used
#> [1] 1
#>
#> $unique_kmers
#> [1] 10Score a K-mer Table
library(RGenomeTester4)
counts_path <- system.file("extdata", "niptmer_counts_example.tsv", package = "RGenomeTester4")
controls_path <- system.file("extdata", "niptmer_controls_example.tsv", package = "RGenomeTester4")
counts <- niptmer_read_table(counts_path)
controls <- readLines(controls_path, warn = FALSE)
scores <- niptmer_score_samples(
count_table = counts,
control = controls,
normalize = TRUE,
reference_mode = "fixed"
)
calls <- niptmer_interpret(scores)
head(calls)
#> SampleName AA SA Distance Gender
#> 1 2NV9 XY Good boyFile-Oriented NIPTmer Steps
The package now exposes the legacy niptmer segment as composable file-oriented utilities.
counts_path <- system.file("extdata", "niptmer_counts_example.tsv", package = "RGenomeTester4")
controls_path <- system.file("extdata", "niptmer_controls_example.tsv", package = "RGenomeTester4")
counts <- niptmer_read_table(counts_path)
reference_counts <- counts[-nrow(counts), , drop = FALSE]
sample_row <- counts[nrow(counts), , drop = FALSE]
artifact_dir <- file.path(td, "niptmer-artifacts")
dir.create(artifact_dir, recursive = TRUE, showWarnings = FALSE)
combined_counts <- niptmer_bind_count_table(
reference_table = reference_counts,
sample_rows = sample_row,
out_file = file.path(artifact_dir, "combined_counts.tsv")
)
zscores <- niptmer_write_zscore(
count_table = combined_counts,
control = controls_path,
normalize = TRUE,
out_file = file.path(artifact_dir, "sample_zscore.txt")
)
xya <- niptmer_write_xya(
zscores,
out_file = file.path(artifact_dir, "sample_XYA_result.tsv")
)
xya
#> SampleName AA SA Distance Gender
#> 1 2NV9 XY Good boyFASTQ -> .list and .list -> count row are available separately when you want to keep orchestration outside the package. If you pass paired FASTQs, the listing helper concatenates them internally before glistmaker:
sample_r1_fastq <- file.path(td, "sample_R1.fastq")
sample_r2_fastq <- file.path(td, "sample_R2.fastq")
writeLines(c("@sample-r1", "ACGTACGT", "+", "FFFFFFFF"), sample_r1_fastq)
writeLines(c("@sample-r2", "GGGGTTTT", "+", "FFFFFFFF"), sample_r2_fastq)
chr_dir <- file.path(td, "chr_lists")
dir.create(chr_dir, recursive = TRUE, showWarnings = FALSE)
chr1_fastq <- file.path(td, "chr1.fastq")
chrX_fastq <- file.path(td, "chrX.fastq")
writeLines(c("@chr1-read", "ACGTAC", "+", "FFFFFF"), chr1_fastq)
writeLines(c("@chrX-read", "TTTTTT", "+", "FFFFFF"), chrX_fastq)
sample_list <- niptmer_make_sample_list(
c(sample_r1_fastq, sample_r2_fastq),
out_prefix = file.path(td, "sample"),
k = 3,
threads = 1
)
niptmer_make_sample_list(chr1_fastq, file.path(chr_dir, "1_cleaned"), k = 3, threads = 1)
#> [1] "/tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/chr_lists/1_cleaned_3.list"
niptmer_make_sample_list(chrX_fastq, file.path(chr_dir, "X_cleaned"), k = 3, threads = 1)
#> [1] "/tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/chr_lists/X_cleaned_3.list"
niptmer_count_sample_list(
sample_list = sample_list,
chr_list_dir = chr_dir,
chromosomes = c("1", "X"),
chr_suffix = "_cleaned_3.list"
)
#> CHR TOTAL UNIQUE GC 1 X
#> 1 sample 12 6 0.527778 2 1For a thin end-to-end replacement of the legacy niptmer block, use niptmer_run_sample():
niptmer_run_sample(
fastq_files = c("sample_R1.fastq.gz", "sample_R2.fastq.gz"),
chr_list_dir = "panel_lists",
reference_table = "reference_counts.tsv",
control = "reference_controls.tsv",
out_dir = "sample_niptmer",
sample_name = "Sample123",
k = 25,
threads = 8,
chr_suffix = "_cleaned_25.list",
normalize = TRUE,
interpret_args = list(
sex_call_spec = list(boy = list(euploid_abs = 3, call = 3.5))
)
)Processx Output Safety
For heavy commands, avoid collecting large output in memory unless you need to parse it.
- Default wrappers work well for normal runs.
- If you expect large output, send logs to files via
stdout = "path.log",stderr = "path.err". - When parsing machine-readable output (
--distribution,--count_only,--gc), use pipe capture (stdout = "|") intentionally.
tiny_fa <- file.path(td, "tiny.fa")
writeLines(c(
">tiny",
"ACGTTGCAAGTCCTAGGATCCTTACGATCGTA"
), tiny_fa)
res <- glistmaker(
c(tiny_fa, "-w", "5", "-o", file.path(td, "tiny")),
stdout = file.path(td, "glistmaker.out.log"),
stderr = file.path(td, "glistmaker.err.log"),
error_on_status = FALSE
)
res$status
#> [1] 0Reproducibility Notes
- Keep input reference/VCF/BED/control metadata versioned with your output manifests.
- Prefer fixed
k, fixed profile settings, and fixed reference builds per study. - Re-run
niptmer_ref_qc()whenever panel filters are updated.