Biological plausibility

Author

Miguel Camacho Sánchez

Code
# Attach libraries and functions:

library(tidyverse)
library(hierfstat)
library(tidyGenR)
source("code/gen2hierfstat.R")

Reliability of the inferred genotypes

Several approaches were used to assess the biological plausibility of the genotypes inferred by tidyGenR.

Allele recurrence

The recurrence of each inferred allele across samples was evaluated, as alleles observed in multiple individuals are less likely to represent spurious PCR artifacts (Sommer et al., 2013). For each allele, the number of samples in which it was detected was computed.

gen <- readRDS("output/genotypes_x.rds")$ind_bs0

allele_rec <-
  gen |>
  dplyr::select(sample, locus, allele) |>
  group_by(locus, allele) |>
  reframe(frequency = n(),
          recurrence = length(unique(sample))
          ) |>
  arrange(frequency)

knitr::kable(head(allele_rec))
locus allele frequency recurrence
tmem87a 03 1 1
tmem87a 04 1 1
il34 03 2 2
gadd45g 02 3 3
klc2 02 3 3
mmp9 02 4 4

Only 2 allele/s were present in an unique sample, whereas the remaining 48 alleles were present in more than one sample.

Haplotype concordance

Inferred haplotypes were compared against previously published sequence data for the same species but other populations (Camacho-Sanchez et al., 2018, to assess concordance with independently generated datasets.

# genotype data from Camacho-Sanchez et al. 2026, including other populations of R. baluensis.
# trusmadi would correspond to the the data from here.
# reformat data
gen26 <-
  readRDS("data/raw/genotypes_camacho-2026.rds") |>
  filter(locus != "fetub") |>
  rowwise() |>
  mutate(loc_al = paste0(locus, allele)) |>
  select(population, loc_al) |>
  distinct()
# alleles for trusmadi
al_tm <-
  gen26 |>
  filter(population == "trusmadi") |>
  pull(loc_al)
# alleles for non-trusmadi
al_others <-
  gen26 |>
  filter(population != "trusmadi") |>
  pull(loc_al)
# shared
al_shared <-
  intersect(al_tm, al_others)
# unique to trusmadi
al_unique <-
  setdiff(al_tm, al_others)

A total of 32 haplotypes were shared with previously published sequences, whereas 16 were unique to this dataset.

Heterozygosity

Finally, departures from Hardy–Weinberg equilibrium (HWE) were evaluated as a downstream diagnostic of genotype reliability. As all samples analysed here originate from a single genetic population (Camacho-Sanchez et al., 2026), systematic locus-wise deviations from equilibrium expectations may indicate genotyping artefacts.

Observed (Ho) and expected (He) heterozygosities were estimated for each locus, and the inbreeding coefficient (Fis).

# Ho, He and Fis per locus
#  convert to hierfstat
genhf <-
  gen |>
  mutate(population = "trusmadi") |> # population col is mandatory for gen_tidy2hierfstat
  gen_tidy2hierfstat()
# compute stats
gen_div <-
  genhf |>
    dplyr::select(-sample, -population) |> 
  basic.stats()
# clean table
gen_div_clean <-
  gen_div$perloc |>
  select(Ho, Hs, Fis) |>
  mutate_all(~round(.x, 3)) |>
  rownames_to_column("locus")
  
knitr::kable(head(gen_div_clean))
locus Ho Hs Fis
abcg8 0.651 0.598 -0.088
alkbh7 0.000 0.000 NaN
apeh17 0.000 0.000 NaN
cd27 0.182 0.167 -0.089
chrna9 0.386 0.412 0.062
dhrs3 0.000 0.000 NaN

HWE

Departures from HWE were further assessed using chi-squared tests on the observed and expected genotypes:

#   reformat genotypes for genetics package.
hwdf <-
  gen |>
  gen_tidy2integers() |>
  gen_tidy2compact()

loci <-
  unique(hwdf$locus)

#   loop to compute hwe chisq per locus
hw_all_loci <-
  loci |>
  lapply(function(loc_i) {
    locus_i <-
      filter(hwdf, locus == loc_i) |>
      pull(genotype)
    lgen <-
      genetics::genotype(locus_i)
    if (length(attr(lgen, "allele.names"))  > 1) {
      genetics::HWE.chisq(lgen) 
    } else {
      NULL
    }
  }) |>
  setNames(loci)

# remove monomorphic
hw_all_loci <- 
  hw_all_loci[!sapply(hw_all_loci, is.null)]

# reformat hw table
hw_res <-
  hw_all_loci |>
  plyr::ldply(.id = "locus", function(x) {
    data.frame(chi_sq = x$statistic,
               p_val = x$p.value)
  })

knitr::kable(head(hw_res))
locus chi_sq p_val
abcg8 4.2216592 0.3735626
cd27 0.4400000 1.0000000
chrna9 0.1106995 0.7162284
fetub 30.5555556 0.0001000
fn3krp 0.0478203 1.0000000
gadd45g 0.0548097 1.0000000

Merge results from heterozygosity and HWE:

merged_rs <-
  gen_div_clean |>
  dplyr::select(locus, Ho, Hs, Fis) |>
  left_join(hw_res, by = "locus")
knitr::kable(merged_rs, digits = 2)
locus Ho Hs Fis chi_sq p_val
abcg8 0.65 0.60 -0.09 4.22 0.37
alkbh7 0.00 0.00 NaN NA NA
apeh17 0.00 0.00 NaN NA NA
cd27 0.18 0.17 -0.09 0.44 1.00
chrna9 0.39 0.41 0.06 0.11 0.72
dhrs3 0.00 0.00 NaN NA NA
fancg 0.00 0.00 NaN NA NA
fetub 0.91 0.50 -0.83 30.56 0.00
fn3krp 0.48 0.50 0.04 0.05 1.00
gadd45g 0.07 0.07 -0.02 0.05 1.00
il34 0.34 0.38 0.10 1.20 0.80
irf5 0.00 0.00 NaN NA NA
klc2 0.07 0.07 -0.02 0.05 1.00
mmp9 0.45 0.40 -0.14 2.21 0.66
ms4a2 0.00 0.00 NaN NA NA
mycbpap 0.00 0.00 NaN NA NA
nfkbia 0.52 0.54 0.03 0.72 0.89
npr2 0.30 0.34 0.14 0.71 0.37
p2rx1 0.41 0.42 0.03 0.01 1.00
pipox 0.00 0.00 NaN NA NA
ptgs2 0.00 0.00 NaN NA NA
rabac1 0.32 0.33 0.03 0.02 1.00
rgd735029 0.34 0.32 -0.08 1.01 1.00
rogdi 0.00 0.00 NaN NA NA
ssfa2 0.00 0.00 NaN NA NA
tmem87a 0.42 0.38 -0.09 4.60 0.17
usp20 0.30 0.25 -0.16 1.32 0.19

Results:

  • All loci had similar values of expected and observed heterozygosity.

  • All polymorphic loci had genotype frequencies under HWE expectations (all p-values > 0.05).

The only locus with an abnormal behaviour is fetub, which as described previously (Camacho-Sanchez et al. 2018), its high Ho seems to correspond to multiple loci.

Conclusion: there is no evidence of deviations in the genotype frequencies from null expectations, supporting the reliability on the genotype calls from tidyGenR.

Session Info

sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: x86_64-apple-darwin20
Running under: macOS 15.7.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] es_ES.UTF-8/es_ES.UTF-8/es_ES.UTF-8/C/es_ES.UTF-8/es_ES.UTF-8

time zone: Europe/Berlin
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] tidyGenR_0.1.7   hierfstat_0.5-11 lubridate_1.9.4  forcats_1.0.1   
 [5] stringr_1.6.0    dplyr_1.2.0      purrr_1.2.1      readr_2.1.6     
 [9] tidyr_1.3.2      tibble_3.3.1     ggplot2_4.0.2    tidyverse_2.0.0 

loaded via a namespace (and not attached):
 [1] writexl_1.5.4               tidyselect_1.2.1           
 [3] farver_2.1.2                Biostrings_2.72.1          
 [5] S7_0.2.1                    bitops_1.0-9               
 [7] combinat_0.0-8              fastmap_1.2.0              
 [9] dada2_1.32.0                GenomicAlignments_1.40.0   
[11] digest_0.6.39               timechange_0.3.0           
[13] lifecycle_1.0.5             pwalign_1.0.0              
[15] gdata_3.0.1                 magrittr_2.0.4             
[17] compiler_4.4.0              rlang_1.1.7                
[19] tools_4.4.0                 yaml_2.3.12                
[21] knitr_1.51                  S4Arrays_1.4.1             
[23] htmlwidgets_1.6.4           interp_1.1-6               
[25] DelayedArray_0.30.1         plyr_1.8.9                 
[27] RColorBrewer_1.1-3          ShortRead_1.62.0           
[29] abind_1.4-8                 BiocParallel_1.38.0        
[31] withr_3.0.2                 hwriter_1.3.2.1            
[33] BiocGenerics_0.50.0         grid_4.4.0                 
[35] stats4_4.4.0                latticeExtra_0.6-31        
[37] colorspace_2.1-2            gtools_3.9.5               
[39] MASS_7.3-65                 scales_1.4.0               
[41] dichromat_2.0-0.1           SummarizedExperiment_1.34.0
[43] mvtnorm_1.3-3               cli_3.6.5                  
[45] rmarkdown_2.30              crayon_1.5.3               
[47] generics_0.1.4              RcppParallel_5.1.11-1      
[49] otel_0.2.0                  reshape2_1.4.5             
[51] httr_1.4.7                  tzdb_0.5.0                 
[53] genetics_1.3.8.1.3          DBI_1.2.3                  
[55] zlibbioc_1.50.0             parallel_4.4.0             
[57] XVector_0.44.0              matrixStats_1.5.0          
[59] vctrs_0.7.1                 Matrix_1.7-4               
[61] jsonlite_2.0.0              IRanges_2.38.1             
[63] hms_1.1.4                   patchwork_1.3.2            
[65] S4Vectors_0.42.1            jpeg_0.1-11                
[67] glue_1.8.0                  codetools_0.2-20           
[69] stringi_1.8.7               gtable_0.3.6               
[71] GenomeInfoDb_1.40.1         deldir_2.0-4               
[73] GenomicRanges_1.56.2        UCSC.utils_1.0.0           
[75] pillar_1.11.1               htmltools_0.5.9            
[77] GenomeInfoDbData_1.2.12     R6_2.6.1                   
[79] evaluate_1.0.5              Biobase_2.64.0             
[81] lattice_0.22-7              png_0.1-8                  
[83] Rsamtools_2.20.0            DECIPHER_3.0.0             
[85] Rcpp_1.1.1                  SparseArray_1.4.8          
[87] xfun_0.56                   MatrixGenerics_1.16.0      
[89] pkgconfig_2.0.3