K-Mer Analysis Wrappers and Reference Utilities for R • RGenomeTester4

RGenomeTester4 provides:

processx wrappers around bundled GenomeTester4 binaries
wrappers for bundled utility scripts
stateless NIPTmer-style scoring utilities
reference-building utilities for both paper-style and population-adapted workflows

The package focuses on reproducible file-based k-mer workflows in R while keeping the heavy k-mer operations in the proven GenomeTester4 tools.

Installation

You can install the development version from GitHub with:

# install.packages("pak")
pak::pak("sounkou-bioinfo/RGenomeTester4")

Quick Start

library(RGenomeTester4)

# Locate bundled binaries
gt4_binary_path("glistmaker")
#> [1] "/usr/local/lib/R/site-library/RGenomeTester4/bin/glistmaker"

# ULCWGS-oriented defaults for reference building
niptmer_ref_profile_ulcwgs()
#> $vcf_engine
#> [1] "legacy_compatible"
#> 
#> $pool_mode
#> [1] "block"
#> 
#> $model
#> [1] "nbinom"
#> 
#> $lower_p
#> [1] 0.01
#> 
#> $upper_p
#> [1] 0.99
#> 
#> $block_size
#> [1] 50
#> 
#> $min_block_support
#> [1] 0.7
#> 
#> $distribution_max
#> [1] 500

Build Reference Panels

There are two main build strategies:

paper_plus_population: start from an existing chromosome panel and apply your population filters.
from_scratch: generate chromosome lists from FASTA and then apply filters.

1) Paper + Population (recommended when you already have a baseline panel)

library(RGenomeTester4)

td <- tempfile("rgenometester4-readme-")
dir.create(td)

ref_dir <- file.path(td, "ref")
dir.create(ref_dir)
chr1_fa <- file.path(ref_dir, "1.fa")
chr2_fa <- file.path(ref_dir, "2.fa")
writeLines(c(
  ">1",
  "ACGTTGCAAGTCCTAGGATCCTTACGATCGTA"
), chr1_fa)
writeLines(c(
  ">2",
  "TTGACCGATTAACCGGTTAGCATGGCATTAAG"
), chr2_fa)

base_dir <- file.path(td, "base_panel")
dir.create(base_dir)
stopifnot(glistmaker(c(chr1_fa, "-w", "5", "-o", file.path(base_dir, "1")))$status == 0L)
stopifnot(glistmaker(c(chr2_fa, "-w", "5", "-o", file.path(base_dir, "2")))$status == 0L)

genome_fa <- file.path(td, "genome.fa")
writeLines(c(
  ">1",
  "ACGTTGCAAGTCCTAGGATCCTTACGATCGTA",
  ">2",
  "TTGACCGATTAACCGGTTAGCATGGCATTAAG"
), genome_fa)

vcf_path <- file.path(td, "1.vcf")
writeLines(c(
  "##fileformat=VCFv4.2",
  "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
  "1\t6\trs1\tG\tA\t.\t.\tCAF=0.90,0.10"
), vcf_path)

bed_path <- file.path(td, "regions.bed")
writeLines(c("1\t0\t8", "2\t10\t20"), bed_path)

ctl1_fa <- file.path(td, "ctl1.fa")
ctl2_fa <- file.path(td, "ctl2.fa")
writeLines(c(
  ">ctl1",
  "ACGTTGCAAGTCCTAGGATCCTTACGATCGTAACGTTGCAA"
), ctl1_fa)
writeLines(c(
  ">ctl2",
  "TTGACCGATTAACCGGTTAGCATGGCATTAAGTTGACCGAT"
), ctl2_fa)
stopifnot(glistmaker(c(ctl1_fa, "-w", "5", "-o", file.path(td, "ctl1")))$status == 0L)
stopifnot(glistmaker(c(ctl2_fa, "-w", "5", "-o", file.path(td, "ctl2")))$status == 0L)
control_lists <- c(file.path(td, "ctl1_5.list"), file.path(td, "ctl2_5.list"))

profile <- niptmer_ref_profile_ulcwgs()
profile$block_size <- 2

panel <- niptmer_ref_build(
  strategy = "paper_plus_population",
  base_panel_dir = base_dir,
  reference_fasta_dir = ref_dir,
  genome_fasta = genome_fa,
  pop_vcf_files = vcf_path,
  pop_bed_files = bed_path,
  control_lists = control_lists,
  k = 5,
  chr_ids = c("1", "2"),
  profile = profile,
  out_dir = file.path(td, "paper_plus")
)

niptmer_ref_qc(panel)
#>   chr
#> 1   1
#> 2   2
#>                                                                                        list_path
#> 1 /tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/paper_plus/panel_stable/1_stable_5_intrsec.list
#> 2 /tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/paper_plus/panel_stable/2_stable_5_intrsec.list
#>   n_kmers       gc
#> 1      15 0.466667
#> 2      20 0.440000

2) From Scratch

panel <- niptmer_ref_build(
  strategy = "from_scratch",
  reference_fasta_dir = ref_dir,
  genome_fasta = genome_fa,
  pop_vcf_files = vcf_path,
  control_lists = control_lists,
  k = 5,
  chr_ids = c("1", "2"),
  vcf_engine = "legacy_compatible",
  model = "nbinom",
  pool_mode = "block",
  block_size = 2,
  min_block_support = 0.70,
  out_dir = file.path(td, "from_scratch")
)

niptmer_ref_qc(panel)
#>   chr
#> 1   1
#> 2   2
#>                                                                                          list_path
#> 1 /tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/from_scratch/panel_stable/1_stable_5_intrsec.list
#> 2 /tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/from_scratch/panel_stable/2_stable_5_intrsec.list
#>   n_kmers gc
#> 1       0 NA
#> 2       0 NA

# Note: with strict uniqueness filtering (`-c 2` behavior inherited from
# GenomeTester4 workflow), tiny synthetic examples can yield very small or
# even empty final lists. Use realistic chromosome-scale references in practice.

VCF Engine Modes

niptmer_ref_blacklist_from_vcf() supports two expansion modes:

legacy_compatible (default): overlap-aware expansion that approximates the legacy hapl_generator.pl flow.
simple: independent per-variant expansion.

bl <- niptmer_ref_blacklist_from_vcf(
  vcf_files = vcf_path,
  reference_fasta_dir = ref_dir,
  k = 5,
  min_af = 0.01,
  engine = "legacy_compatible",
  out_dir = file.path(td, "vcf_mode")
)

bl
#> $path
#> [1] "/tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/vcf_mode/pop_snp_blacklist_5.list"
#> 
#> $k
#> [1] 5
#> 
#> $engine
#> [1] "legacy_compatible"
#> 
#> $variants_seen
#> [1] 1
#> 
#> $variant_alleles_used
#> [1] 1
#> 
#> $unique_kmers
#> [1] 10

Score a K-mer Table

library(RGenomeTester4)

counts_path <- system.file("extdata", "niptmer_counts_example.tsv", package = "RGenomeTester4")
controls_path <- system.file("extdata", "niptmer_controls_example.tsv", package = "RGenomeTester4")

counts <- niptmer_read_table(counts_path)
controls <- readLines(controls_path, warn = FALSE)

scores <- niptmer_score_samples(
  count_table = counts,
  control = controls,
  normalize = TRUE,
  reference_mode = "fixed"
)

calls <- niptmer_interpret(scores)
head(calls)
#>   SampleName AA SA Distance Gender
#> 1       2NV9    XY     Good    boy

File-Oriented NIPTmer Steps

The package now exposes the legacy niptmer segment as composable file-oriented utilities.

counts_path <- system.file("extdata", "niptmer_counts_example.tsv", package = "RGenomeTester4")
controls_path <- system.file("extdata", "niptmer_controls_example.tsv", package = "RGenomeTester4")

counts <- niptmer_read_table(counts_path)
reference_counts <- counts[-nrow(counts), , drop = FALSE]
sample_row <- counts[nrow(counts), , drop = FALSE]

artifact_dir <- file.path(td, "niptmer-artifacts")
dir.create(artifact_dir, recursive = TRUE, showWarnings = FALSE)

combined_counts <- niptmer_bind_count_table(
  reference_table = reference_counts,
  sample_rows = sample_row,
  out_file = file.path(artifact_dir, "combined_counts.tsv")
)

zscores <- niptmer_write_zscore(
  count_table = combined_counts,
  control = controls_path,
  normalize = TRUE,
  out_file = file.path(artifact_dir, "sample_zscore.txt")
)

xya <- niptmer_write_xya(
  zscores,
  out_file = file.path(artifact_dir, "sample_XYA_result.tsv")
)

xya
#>   SampleName AA SA Distance Gender
#> 1       2NV9    XY     Good    boy

FASTQ -> .list and .list -> count row are available separately when you want to keep orchestration outside the package. If you pass paired FASTQs, the listing helper concatenates them internally before glistmaker:

sample_r1_fastq <- file.path(td, "sample_R1.fastq")
sample_r2_fastq <- file.path(td, "sample_R2.fastq")
writeLines(c("@sample-r1", "ACGTACGT", "+", "FFFFFFFF"), sample_r1_fastq)
writeLines(c("@sample-r2", "GGGGTTTT", "+", "FFFFFFFF"), sample_r2_fastq)

chr_dir <- file.path(td, "chr_lists")
dir.create(chr_dir, recursive = TRUE, showWarnings = FALSE)

chr1_fastq <- file.path(td, "chr1.fastq")
chrX_fastq <- file.path(td, "chrX.fastq")
writeLines(c("@chr1-read", "ACGTAC", "+", "FFFFFF"), chr1_fastq)
writeLines(c("@chrX-read", "TTTTTT", "+", "FFFFFF"), chrX_fastq)

sample_list <- niptmer_make_sample_list(
  c(sample_r1_fastq, sample_r2_fastq),
  out_prefix = file.path(td, "sample"),
  k = 3,
  threads = 1
)
niptmer_make_sample_list(chr1_fastq, file.path(chr_dir, "1_cleaned"), k = 3, threads = 1)
#> [1] "/tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/chr_lists/1_cleaned_3.list"
niptmer_make_sample_list(chrX_fastq, file.path(chr_dir, "X_cleaned"), k = 3, threads = 1)
#> [1] "/tmp/Rtmpqy4GnY/rgenometester4-readme-29dee0d0/chr_lists/X_cleaned_3.list"

niptmer_count_sample_list(
  sample_list = sample_list,
  chr_list_dir = chr_dir,
  chromosomes = c("1", "X"),
  chr_suffix = "_cleaned_3.list"
)
#>      CHR TOTAL UNIQUE       GC 1 X
#> 1 sample    12      6 0.527778 2 1

For a thin end-to-end replacement of the legacy niptmer block, use niptmer_run_sample():

niptmer_run_sample(
  fastq_files = c("sample_R1.fastq.gz", "sample_R2.fastq.gz"),
  chr_list_dir = "panel_lists",
  reference_table = "reference_counts.tsv",
  control = "reference_controls.tsv",
  out_dir = "sample_niptmer",
  sample_name = "Sample123",
  k = 25,
  threads = 8,
  chr_suffix = "_cleaned_25.list",
  normalize = TRUE,
  interpret_args = list(
    sex_call_spec = list(boy = list(euploid_abs = 3, call = 3.5))
  )
)

Processx Output Safety

For heavy commands, avoid collecting large output in memory unless you need to parse it.

Default wrappers work well for normal runs.
If you expect large output, send logs to files via stdout = "path.log", stderr = "path.err".
When parsing machine-readable output (--distribution, --count_only, --gc), use pipe capture (stdout = "|") intentionally.

tiny_fa <- file.path(td, "tiny.fa")
writeLines(c(
  ">tiny",
  "ACGTTGCAAGTCCTAGGATCCTTACGATCGTA"
), tiny_fa)

res <- glistmaker(
  c(tiny_fa, "-w", "5", "-o", file.path(td, "tiny")),
  stdout = file.path(td, "glistmaker.out.log"),
  stderr = file.path(td, "glistmaker.err.log"),
  error_on_status = FALSE
)

res$status
#> [1] 0

Reproducibility Notes

Keep input reference/VCF/BED/control metadata versioned with your output manifests.
Prefer fixed k, fixed profile settings, and fixed reference builds per study.
Re-run niptmer_ref_qc() whenever panel filters are updated.