Author

Miguel Camacho Sánchez

Code
# Attach libraries and functions:

library(tidyGenR)
library(tidyverse)
library(ShortRead)
source("code/gen2hierfstat.R")

# conditionals to run chunks
p_subseqs <- "output/variants_subseqs.rds"
eval_subsample <- !file.exists(p_subseqs)
if(file.exists(p_subseqs))
  variants_subseqs <- readRDS(p_subseqs)

Effect of coverage on genotypes called

The read depth for each locus can have an effect on the reliability of the genotypes. Simulated lower coverage can be obtained by re-sampling lower number of reads from the FASTQ files. Lower coverage FASTQ datasets were created and variants were called with tidyGenR. Some statistics relevant to the performance of the genotype calls were tracked along decreasing coverage.

Locus selection

Select loci whose average number of reads across samples is above 500.

# compute reads per locus
tr_reads <-
  reads_loci_samples(trunc_dir, pattern_fq = "F_filt.fastq.gz" ) |>
  pivot_longer(-sample,
               names_to = "locus",
               values_to = "reads") |>
  group_by(locus) |>
  reframe(av_reads = mean(reads),
          sd_reads = sd(reads)) |>
  arrange(av_reads) |>
  filter(locus != "fetub") # likely amplifying multiple loci (see Camacho-Sanchez et al. 2026)
# print loci to keep
loci2keep <-
  filter(tr_reads, 
         av_reads > thr_keep) |>
  pull(locus)
print(loci2keep)
 [1] "ms4a2"   "pipox"   "abcg8"   "npr2"    "ssfa2"   "cd27"    "fancg"  
 [8] "gadd45g" "rabac1"  "mycbpap" "ptgs2"   "rogdi"   "irf5"    "fn3krp" 
[15] "apeh17"  "usp20"   "il34"    "dhrs3"   "p2rx1"   "klc2"   

For the selected loci, a given number of reads will be randomly sub-sampled from the FASTQ.

Code
# input fastq
fq <-
  list.files(trunc_dir, pattern = "fastq.gz", full.names = T)
# sample sizes
sample_size <-
  c(500, 250, 125, 70, 50, 35, 20)
# fix omega as in Camacho-Sanchez et al. 2026
omega_a <- 10^-2

Sample read subsets and call variants

For each category of read, 500, 250, 125, 70, 50, 35, 20, the following steps were performed:

  • For each FASTQ file:
    • Determine the number of reads in the FASTQ.
    • If the number of reads is below sampling_size, copy the FASTQ to a temporary directory.
    • If the number of reads exceeds sampling_size, randomly sample sampling_size reads and write the FASTQ to a temporary directory. A seed is used to maintain F/R pairing.
Code
variants_subseqs <-
    sample_size |>
    lapply(function(n_thr) {
    # create temp dir specific to sample_size
    sub_reads <-
      file.path(tempdir(), paste0("subsample", n_thr))
    if (dir.exists(sub_reads)) unlink(sub_reads, recursive = TRUE, force = TRUE)
    dir.create(sub_reads)
    # write subsampled fastq to temp dir
    fq |>
    lapply(function(fq_i) {
      n_reads <- countFastq(fq_i)
      out_fq <- file.path(sub_reads, basename(fq_i))
      if(n_reads$records < n_thr) {
        file.copy(fq_i, out_fq)
      } else if (n_reads$records >= n_thr){
        set.seed(1) # critical for not loosing paired-read match
        sidx <- FastqSampler(fq_i, n_thr)
        sub_fq <- yield(sidx)
        writeFastq(sub_fq, 
                   out_fq,
                   compress = TRUE)
      #try(close(sidx), silent = TRUE) # remove warnings
      }
    invisible(NULL)
    })
    # clean close conections open by ShortRead
    gc()
    
    # call variants
    variant_call(
      in_dir = sub_reads,
      loci = loci2keep,
      rv_pattern = "R_filt.fastq.gz",
      c_unmerged = TRUE,
      multithread = TRUE,
      pool = FALSE,
      omega_a_f = omega_a,
      omega_a_r = omega_a,
      band_size = 0,
      ad = 2,
      maf = 0.1)
    }) |>
setNames(paste0("sub_sample", str_pad(sample_size, width = 3, side = "left", pad = "0")))

saveRDS(variants_subseqs, "output/variants_subseqs.rds")

A short script is run to verify the pairing between F/R reads is kept.

Code
temp_dir_eg <-
  file.path(tempdir(), paste0("subsample", sample_size[length(sample_size)])) |>
  list.files(full.names = T)

fq_ids <-
  lapply(c(1, 2), function(x) {
    readFastq(temp_dir_eg[x]) |>
      id() |>
      as.character()
  })

if(identical(fq_ids[1], fq_ids[2])) {
  message("The pairing between F and R reads has been kept")
} else {
  stop("The pairing between F/R paired reads has been lost.")
}

Evaluate the effect on the calls

A first quick look at Figure 1 shows that genotype calls are consistent from 500 reads down to around 50. Below 50, but mostly at 20x, many alleles are not detected:

Code
# get genotypes
gen_subseqs <-
  variants_subseqs |>
  lapply(function(x) genotype(x, ploidy = 2, ADt = 5))

# compare calls
comp_gen <-
  compare_calls(gen_subseqs)
comp_vars <- 
  compare_calls(variants_subseqs)
# compare genotype vs variants
comp_gen_vars <-
  names(variants_subseqs) |>
  lapply(function(set_i) {
    compare_calls(list(var = variants_subseqs[[set_i]],
                       gen = gen_subseqs[[set_i]]))
  }) |>
  setNames(names(variants_subseqs))
Code
comp_gen$plot2

Figure 1: Comparisson of genotype calls with decreasing coverage.

The genotypes from the subsets with larger number or reads are similar to each other.

To summarise the results, the following statistics were computed:

  • Observed heterozygosity (Ho), since lower coverage could hinder the determination of both alleles, leading to an increase of homozygosity.
  • Missingness, measured as missing genotype calls (i.e. cells in the sample x locus matrix with no data).
  • Allele similarity, quantified as the average proportion alleles in each call that are identical to the best set (with more alleles genotyped).
  • Distinct haplotypes, or total number of distinct alleles considering all loci (expressed as a proportion respect the the maximum).
  • Reads sampled, reads sampled 500, 250, 125, 70, 50, 35, 20, expressed as proportions to max sample_size.
Code
require(hierfstat); require(scales)

# total number of different haplotypes (sum across loci)
no_variants <-
  gen_subseqs |>
  vapply(function(var_i) length(unique(var_i$md5)), FUN.VALUE = numeric(1L)) |>
  as.data.frame() |>
  setNames("distinct_hapl") |>
  tibble::rownames_to_column("subsample")

# observed heterozygosity
ho <-
  gen_subseqs |>
  sapply(function(gen_i) {
    hf_i <-
      gen_i |>
      mutate(population = "trusmadi") |>
      gen_tidy2hierfstat() |>
      select(-sample, -population) |>
      basic.stats()
    ho_i <- 
      round(hf_i$overall[which(names(hf_i$overall) == "Ho")], 3)
    # 
  }) |>
  as.data.frame() |>
  setNames("Ho") |>
  tibble::rownames_to_column("subsample") |>
  mutate(subsample = str_remove(subsample, ".Ho$"))

# calls with at least one haplotype
calls_plus_hap <-
  gen_subseqs |>
  vapply(function(gen_i) {
    gen_i_wide <-
      gen_i |>
      gen_tidy2wide() |>
      select(-sample)
    round(
      sum(is.na(gen_i_wide)) / (ncol(gen_i_wide) * nrow(gen_i_wide)),
        3)
  },
  numeric(1)) |>
  as.data.frame() |>
  setNames("missingness") |>
  tibble::rownames_to_column("subsample")

# completness
completness <-
  comp_gen$tidy_dat |>
  select(-c(sample, locus, md5)) |>
  apply(2, sum) |>
  as.data.frame() |>
  setNames("allele_similarity") |>
  tibble::rownames_to_column("subsample")

# create df for plotting how some stats change with coverage
df_fortified_abs <-
  left_join(no_variants, ho, by = "subsample") |>
  left_join(completness) |>
  left_join(calls_plus_hap) |>
  mutate(sample_size =
           str_extract(subsample, "[0-9]+") |>
           as.numeric())
df_fortified_rel <-
  df_fortified_abs |>
  mutate(sample_size = sample_size / max(sample_size),
         distinct_hapl = distinct_hapl / max(distinct_hapl),
         allele_similarity = allele_similarity / max(allele_similarity)
  ) |>
  pivot_longer(-subsample, names_to = "parameter", values_to = "value") |>
  mutate(sample_size =
           str_extract(subsample, "[0-9]+") |>
           as.numeric())


# plot stats 
p_stats <-
  ggplot(df_fortified_rel, aes(x = sample_size, y = value, colour = parameter)) +
  geom_line(linetype = "dashed") +
  geom_point() +
  scale_x_continuous(
    trans = scales::trans_new(
      name = "revlog2",
      transform = function(x) -log2(x),
      inverse = function(x) 2^(-x)
    ),
    breaks = sort(unique(df_fortified_rel$sample_size)),
    labels = sort(unique(df_fortified_rel$sample_size))
  ) +
  scale_color_discrete(labels = c("Ho" = "Ho",
                                  "distinct_hapl" = "Distinct haplotypes",
                                  "allele_similarity" = "Allele similarity",
                                  "sample_size" = "Reads sampled",
                                  "missingness" = "Missingness"
  )) +
  xlab("Number of reads sampled") +
  ylab("") +
  theme_minimal()
knitr::kable(df_fortified_abs)
subsample distinct_hapl Ho allele_similarity missingness sample_size
sub_sample500 32 0.155 1015 0.001 500
sub_sample250 32 0.155 1015 0.001 250
sub_sample125 32 0.155 1015 0.001 125
sub_sample070 32 0.155 1015 0.001 70
sub_sample050 32 0.155 1015 0.001 50
sub_sample035 33 0.156 1016 0.001 35
sub_sample020 34 0.150 1010 0.001 20
Code
print(p_stats)

Figure 2: Assessment of quality parameters decreasing coverage.

Genotype calls were stable from 500x to around 50x (Figure 2). Below 50x, but mostly at 20x the missing calls increase from nearly 0 to 4%. The number of distinct haplotypes detected was little affected even in the lowest coverage (20x) tested. At 20x, some alleles from heterozygous genotypes were lost (see ‘Allele similarity’), causing the heterozygosity to sink from 35x to 20x.

Code
# save plots
saveRDS(
  list(p_stats = p_stats,
       p_comp = comp_gen$plot2),
  "output/plots-assessment-coverage.rds")

Session Info

sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: x86_64-apple-darwin20
Running under: macOS 15.7.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] es_ES.UTF-8/es_ES.UTF-8/es_ES.UTF-8/C/es_ES.UTF-8/es_ES.UTF-8

time zone: Europe/Berlin
tzcode source: internal

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] scales_1.4.0                hierfstat_0.5-11           
 [3] ShortRead_1.62.0            GenomicAlignments_1.40.0   
 [5] SummarizedExperiment_1.34.0 Biobase_2.64.0             
 [7] MatrixGenerics_1.16.0       matrixStats_1.5.0          
 [9] Rsamtools_2.20.0            GenomicRanges_1.56.2       
[11] Biostrings_2.72.1           GenomeInfoDb_1.40.1        
[13] XVector_0.44.0              IRanges_2.38.1             
[15] S4Vectors_0.42.1            BiocParallel_1.38.0        
[17] BiocGenerics_0.50.0         lubridate_1.9.4            
[19] forcats_1.0.1               stringr_1.6.0              
[21] dplyr_1.2.0                 purrr_1.2.1                
[23] readr_2.1.6                 tidyr_1.3.2                
[25] tibble_3.3.1                ggplot2_4.0.2              
[27] tidyverse_2.0.0             tidyGenR_0.1.7             

loaded via a namespace (and not attached):
 [1] writexl_1.5.4           tidyselect_1.2.1        farver_2.1.2           
 [4] S7_0.2.1                bitops_1.0-9            fastmap_1.2.0          
 [7] dada2_1.32.0            digest_0.6.39           timechange_0.3.0       
[10] lifecycle_1.0.5         pwalign_1.0.0           magrittr_2.0.4         
[13] compiler_4.4.0          rlang_1.1.7             tools_4.4.0            
[16] yaml_2.3.12             knitr_1.51              labeling_0.4.3         
[19] S4Arrays_1.4.1          htmlwidgets_1.6.4       interp_1.1-6           
[22] DelayedArray_0.30.1     plyr_1.8.9              RColorBrewer_1.1-3     
[25] abind_1.4-8             withr_3.0.2             hwriter_1.3.2.1        
[28] grid_4.4.0              latticeExtra_0.6-31     colorspace_2.1-2       
[31] dichromat_2.0-0.1       cli_3.6.5               rmarkdown_2.30         
[34] crayon_1.5.3            generics_0.1.4          otel_0.2.0             
[37] RcppParallel_5.1.11-1   httr_1.4.7              reshape2_1.4.5         
[40] tzdb_0.5.0              DBI_1.2.3               zlibbioc_1.50.0        
[43] parallel_4.4.0          vctrs_0.7.1             Matrix_1.7-4           
[46] jsonlite_2.0.0          hms_1.1.4               patchwork_1.3.2        
[49] jpeg_0.1-11             glue_1.8.0              codetools_0.2-20       
[52] stringi_1.8.7           gtable_0.3.6            deldir_2.0-4           
[55] UCSC.utils_1.0.0        pillar_1.11.1           htmltools_0.5.9        
[58] GenomeInfoDbData_1.2.12 R6_2.6.1                evaluate_1.0.5         
[61] lattice_0.22-7          png_0.1-8               DECIPHER_3.0.0         
[64] Rcpp_1.1.1              SparseArray_1.4.8       xfun_0.56              
[67] pkgconfig_2.0.3