Skip to contents

RGenomeTester4 provides:

  • processx wrappers around bundled GenomeTester4 binaries
  • wrappers for bundled utility scripts
  • stateless NIPTmer-style scoring utilities
  • reference-building utilities for both paper-style and population-adapted workflows

The package focuses on reproducible file-based k-mer workflows in R while keeping the heavy k-mer operations in the proven GenomeTester4 tools.

Installation

You can install the development version from GitHub with:

# install.packages("pak")
pak::pak("sounkou-bioinfo/RGenomeTester4")

Quick Start

library(RGenomeTester4)

# Locate bundled binaries
gt4_binary_path("glistmaker")
#> [1] "/usr/local/lib/R/site-library/RGenomeTester4/bin/glistmaker"

# ULCWGS-oriented defaults for reference building
niptmer_ref_profile_ulcwgs()
#> $vcf_engine
#> [1] "legacy_compatible"
#> 
#> $pool_mode
#> [1] "block"
#> 
#> $model
#> [1] "nbinom"
#> 
#> $lower_p
#> [1] 0.01
#> 
#> $upper_p
#> [1] 0.99
#> 
#> $block_size
#> [1] 50
#> 
#> $min_block_support
#> [1] 0.7
#> 
#> $distribution_max
#> [1] 500

Build Reference Panels

There are two main build strategies:

  1. paper_plus_population: start from an existing chromosome panel and apply your population filters.
  2. from_scratch: generate chromosome lists from FASTA and then apply filters.
library(RGenomeTester4)

td <- tempfile("rgenometester4-readme-")
dir.create(td)

ref_dir <- file.path(td, "ref")
dir.create(ref_dir)
chr1_fa <- file.path(ref_dir, "1.fa")
chr2_fa <- file.path(ref_dir, "2.fa")
writeLines(c(
  ">1",
  "ACGTTGCAAGTCCTAGGATCCTTACGATCGTA"
), chr1_fa)
writeLines(c(
  ">2",
  "TTGACCGATTAACCGGTTAGCATGGCATTAAG"
), chr2_fa)

base_dir <- file.path(td, "base_panel")
dir.create(base_dir)
stopifnot(glistmaker(c(chr1_fa, "-w", "5", "-o", file.path(base_dir, "1")))$status == 0L)
stopifnot(glistmaker(c(chr2_fa, "-w", "5", "-o", file.path(base_dir, "2")))$status == 0L)

genome_fa <- file.path(td, "genome.fa")
writeLines(c(
  ">1",
  "ACGTTGCAAGTCCTAGGATCCTTACGATCGTA",
  ">2",
  "TTGACCGATTAACCGGTTAGCATGGCATTAAG"
), genome_fa)

vcf_path <- file.path(td, "1.vcf")
writeLines(c(
  "##fileformat=VCFv4.2",
  "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
  "1\t6\trs1\tG\tA\t.\t.\tCAF=0.90,0.10"
), vcf_path)

bed_path <- file.path(td, "regions.bed")
writeLines(c("1\t0\t8", "2\t10\t20"), bed_path)

ctl1_fa <- file.path(td, "ctl1.fa")
ctl2_fa <- file.path(td, "ctl2.fa")
writeLines(c(
  ">ctl1",
  "ACGTTGCAAGTCCTAGGATCCTTACGATCGTAACGTTGCAA"
), ctl1_fa)
writeLines(c(
  ">ctl2",
  "TTGACCGATTAACCGGTTAGCATGGCATTAAGTTGACCGAT"
), ctl2_fa)
stopifnot(glistmaker(c(ctl1_fa, "-w", "5", "-o", file.path(td, "ctl1")))$status == 0L)
stopifnot(glistmaker(c(ctl2_fa, "-w", "5", "-o", file.path(td, "ctl2")))$status == 0L)
control_lists <- c(file.path(td, "ctl1_5.list"), file.path(td, "ctl2_5.list"))

profile <- niptmer_ref_profile_ulcwgs()
profile$block_size <- 2

panel <- niptmer_ref_build(
  strategy = "paper_plus_population",
  base_panel_dir = base_dir,
  reference_fasta_dir = ref_dir,
  genome_fasta = genome_fa,
  pop_vcf_files = vcf_path,
  pop_bed_files = bed_path,
  control_lists = control_lists,
  k = 5,
  chr_ids = c("1", "2"),
  profile = profile,
  out_dir = file.path(td, "paper_plus")
)

niptmer_ref_qc(panel)
#>   chr
#> 1   1
#> 2   2
#>                                                                                        list_path
#> 1 /tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/paper_plus/panel_stable/1_stable_5_intrsec.list
#> 2 /tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/paper_plus/panel_stable/2_stable_5_intrsec.list
#>   n_kmers       gc
#> 1      15 0.466667
#> 2      20 0.440000

2) From Scratch

panel <- niptmer_ref_build(
  strategy = "from_scratch",
  reference_fasta_dir = ref_dir,
  genome_fasta = genome_fa,
  pop_vcf_files = vcf_path,
  control_lists = control_lists,
  k = 5,
  chr_ids = c("1", "2"),
  vcf_engine = "legacy_compatible",
  model = "nbinom",
  pool_mode = "block",
  block_size = 2,
  min_block_support = 0.70,
  out_dir = file.path(td, "from_scratch")
)

niptmer_ref_qc(panel)
#>   chr
#> 1   1
#> 2   2
#>                                                                                          list_path
#> 1 /tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/from_scratch/panel_stable/1_stable_5_intrsec.list
#> 2 /tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/from_scratch/panel_stable/2_stable_5_intrsec.list
#>   n_kmers gc
#> 1       0 NA
#> 2       0 NA

# Note: with strict uniqueness filtering (`-c 2` behavior inherited from
# GenomeTester4 workflow), tiny synthetic examples can yield very small or
# even empty final lists. Use realistic chromosome-scale references in practice.

VCF Engine Modes

niptmer_ref_blacklist_from_vcf() supports two expansion modes:

  • legacy_compatible (default): overlap-aware expansion that approximates the legacy hapl_generator.pl flow.
  • simple: independent per-variant expansion.
bl <- niptmer_ref_blacklist_from_vcf(
  vcf_files = vcf_path,
  reference_fasta_dir = ref_dir,
  k = 5,
  min_af = 0.01,
  engine = "legacy_compatible",
  out_dir = file.path(td, "vcf_mode")
)

bl
#> $path
#> [1] "/tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/vcf_mode/pop_snp_blacklist_5.list"
#> 
#> $k
#> [1] 5
#> 
#> $engine
#> [1] "legacy_compatible"
#> 
#> $variants_seen
#> [1] 1
#> 
#> $variant_alleles_used
#> [1] 1
#> 
#> $unique_kmers
#> [1] 10

Score a K-mer Table

library(RGenomeTester4)

counts_path <- system.file("extdata", "niptmer_counts_example.tsv", package = "RGenomeTester4")
controls_path <- system.file("extdata", "niptmer_controls_example.tsv", package = "RGenomeTester4")

counts <- niptmer_read_table(counts_path)
controls <- readLines(controls_path, warn = FALSE)

scores <- niptmer_score_samples(
  count_table = counts,
  control = controls,
  normalize = TRUE,
  reference_mode = "fixed"
)

calls <- niptmer_interpret(scores)
head(calls)
#>   SampleName AA SA Distance Gender
#> 1       2NV9    XY     Good    boy

File-Oriented NIPTmer Steps

The package now exposes the legacy niptmer segment as composable file-oriented utilities.

counts_path <- system.file("extdata", "niptmer_counts_example.tsv", package = "RGenomeTester4")
controls_path <- system.file("extdata", "niptmer_controls_example.tsv", package = "RGenomeTester4")

counts <- niptmer_read_table(counts_path)
reference_counts <- counts[-nrow(counts), , drop = FALSE]
sample_row <- counts[nrow(counts), , drop = FALSE]

artifact_dir <- file.path(td, "niptmer-artifacts")
dir.create(artifact_dir, recursive = TRUE, showWarnings = FALSE)

combined_counts <- niptmer_bind_count_table(
  reference_table = reference_counts,
  sample_rows = sample_row,
  out_file = file.path(artifact_dir, "combined_counts.tsv")
)

zscores <- niptmer_write_zscore(
  count_table = combined_counts,
  control = controls_path,
  normalize = TRUE,
  out_file = file.path(artifact_dir, "sample_zscore.txt")
)

xya <- niptmer_write_xya(
  zscores,
  out_file = file.path(artifact_dir, "sample_XYA_result.tsv")
)

xya
#>   SampleName AA SA Distance Gender
#> 1       2NV9    XY     Good    boy

FASTQ -> .list and .list -> count row are available separately when you want to keep orchestration outside the package. If you pass paired FASTQs, the listing helper concatenates them internally before glistmaker:

sample_r1_fastq <- file.path(td, "sample_R1.fastq")
sample_r2_fastq <- file.path(td, "sample_R2.fastq")
writeLines(c("@sample-r1", "ACGTACGT", "+", "FFFFFFFF"), sample_r1_fastq)
writeLines(c("@sample-r2", "GGGGTTTT", "+", "FFFFFFFF"), sample_r2_fastq)

chr_dir <- file.path(td, "chr_lists")
dir.create(chr_dir, recursive = TRUE, showWarnings = FALSE)

chr1_fastq <- file.path(td, "chr1.fastq")
chrX_fastq <- file.path(td, "chrX.fastq")
writeLines(c("@chr1-read", "ACGTAC", "+", "FFFFFF"), chr1_fastq)
writeLines(c("@chrX-read", "TTTTTT", "+", "FFFFFF"), chrX_fastq)

sample_list <- niptmer_make_sample_list(
  c(sample_r1_fastq, sample_r2_fastq),
  out_prefix = file.path(td, "sample"),
  k = 3,
  threads = 1
)
niptmer_make_sample_list(chr1_fastq, file.path(chr_dir, "1_cleaned"), k = 3, threads = 1)
#> [1] "/tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/chr_lists/1_cleaned_3.list"
niptmer_make_sample_list(chrX_fastq, file.path(chr_dir, "X_cleaned"), k = 3, threads = 1)
#> [1] "/tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/chr_lists/X_cleaned_3.list"

niptmer_count_sample_list(
  sample_list = sample_list,
  chr_list_dir = chr_dir,
  chromosomes = c("1", "X"),
  chr_suffix = "_cleaned_3.list"
)
#>      CHR TOTAL UNIQUE       GC 1 X
#> 1 sample    12      6 0.527778 2 1

For a thin end-to-end replacement of the legacy niptmer block, use niptmer_run_sample():

niptmer_run_sample(
  fastq_files = c("sample_R1.fastq.gz", "sample_R2.fastq.gz"),
  chr_list_dir = "panel_lists",
  reference_table = "reference_counts.tsv",
  control = "reference_controls.tsv",
  out_dir = "sample_niptmer",
  sample_name = "Sample123",
  k = 25,
  threads = 8,
  chr_suffix = "_cleaned_25.list",
  normalize = TRUE,
  interpret_args = list(
    sex_call_spec = list(boy = list(euploid_abs = 3, call = 3.5))
  )
)

Processx Output Safety

For heavy commands, avoid collecting large output in memory unless you need to parse it.

  • Default wrappers work well for normal runs.
  • If you expect large output, send logs to files via stdout = "path.log", stderr = "path.err".
  • When parsing machine-readable output (--distribution, --count_only, --gc), use pipe capture (stdout = "|") intentionally.
tiny_fa <- file.path(td, "tiny.fa")
writeLines(c(
  ">tiny",
  "ACGTTGCAAGTCCTAGGATCCTTACGATCGTA"
), tiny_fa)

res <- glistmaker(
  c(tiny_fa, "-w", "5", "-o", file.path(td, "tiny")),
  stdout = file.path(td, "glistmaker.out.log"),
  stderr = file.path(td, "glistmaker.err.log"),
  error_on_status = FALSE
)

res$status
#> [1] 0

Reproducibility Notes

  • Keep input reference/VCF/BED/control metadata versioned with your output manifests.
  • Prefer fixed k, fixed profile settings, and fixed reference builds per study.
  • Re-run niptmer_ref_qc() whenever panel filters are updated.