Biological plausibility

Author

Miguel Camacho Sánchez

Code

# Attach libraries and functions:

library(tidyverse)
library(hierfstat)
library(tidyGenR)
source("code/gen2hierfstat.R")

Reliability of the inferred genotypes

Several approaches were used to assess the biological plausibility of the genotypes inferred by tidyGenR.

Allele recurrence

The recurrence of each inferred allele across samples was evaluated, as alleles observed in multiple individuals are less likely to represent spurious PCR artifacts (Sommer et al., 2013). For each allele, the number of samples in which it was detected was computed.

gen <- readRDS("output/genotypes_x.rds")$ind_bs0

allele_rec <-
  gen |>
  dplyr::select(sample, locus, allele) |>
  group_by(locus, allele) |>
  reframe(frequency = n(),
          recurrence = length(unique(sample))
          ) |>
  arrange(frequency)

knitr::kable(head(allele_rec))

locus	allele	frequency	recurrence
tmem87a	03	1	1
tmem87a	04	1	1
il34	03	2	2
gadd45g	02	3	3
klc2	02	3	3
mmp9	02	4	4

Only 2 allele/s were present in an unique sample, whereas the remaining 48 alleles were present in more than one sample.

Haplotype concordance

Inferred haplotypes were compared against previously published sequence data for the same species but other populations (Camacho-Sanchez et al., 2018, to assess concordance with independently generated datasets.

# genotype data from Camacho-Sanchez et al. 2026, including other populations of R. baluensis.
# trusmadi would correspond to the the data from here.
# reformat data
gen26 <-
  readRDS("data/raw/genotypes_camacho-2026.rds") |>
  filter(locus != "fetub") |>
  rowwise() |>
  mutate(loc_al = paste0(locus, allele)) |>
  select(population, loc_al) |>
  distinct()
# alleles for trusmadi
al_tm <-
  gen26 |>
  filter(population == "trusmadi") |>
  pull(loc_al)
# alleles for non-trusmadi
al_others <-
  gen26 |>
  filter(population != "trusmadi") |>
  pull(loc_al)
# shared
al_shared <-
  intersect(al_tm, al_others)
# unique to trusmadi
al_unique <-
  setdiff(al_tm, al_others)

A total of 32 haplotypes were shared with previously published sequences, whereas 16 were unique to this dataset.

Heterozygosity

Finally, departures from Hardy–Weinberg equilibrium (HWE) were evaluated as a downstream diagnostic of genotype reliability. As all samples analysed here originate from a single genetic population (Camacho-Sanchez et al., 2026), systematic locus-wise deviations from equilibrium expectations may indicate genotyping artefacts.

Observed (Ho) and expected (He) heterozygosities were estimated for each locus, and the inbreeding coefficient (Fis).

# Ho, He and Fis per locus
#  convert to hierfstat
genhf <-
  gen |>
  mutate(population = "trusmadi") |> # population col is mandatory for gen_tidy2hierfstat
  gen_tidy2hierfstat()
# compute stats
gen_div <-
  genhf |>
    dplyr::select(-sample, -population) |> 
  basic.stats()
# clean table
gen_div_clean <-
  gen_div$perloc |>
  select(Ho, Hs, Fis) |>
  mutate_all(~round(.x, 3)) |>
  rownames_to_column("locus")
  
knitr::kable(head(gen_div_clean))

locus	Ho	Hs	Fis
abcg8	0.651	0.598	-0.088
alkbh7	0.000	0.000	NaN
apeh17	0.000	0.000	NaN
cd27	0.182	0.167	-0.089
chrna9	0.386	0.412	0.062
dhrs3	0.000	0.000	NaN

HWE

Departures from HWE were further assessed using chi-squared tests on the observed and expected genotypes:

#   reformat genotypes for genetics package.
hwdf <-
  gen |>
  gen_tidy2integers() |>
  gen_tidy2compact()

loci <-
  unique(hwdf$locus)

#   loop to compute hwe chisq per locus
hw_all_loci <-
  loci |>
  lapply(function(loc_i) {
    locus_i <-
      filter(hwdf, locus == loc_i) |>
      pull(genotype)
    lgen <-
      genetics::genotype(locus_i)
    if (length(attr(lgen, "allele.names"))  > 1) {
      genetics::HWE.chisq(lgen) 
    } else {
      NULL
    }
  }) |>
  setNames(loci)

# remove monomorphic
hw_all_loci <- 
  hw_all_loci[!sapply(hw_all_loci, is.null)]

# reformat hw table
hw_res <-
  hw_all_loci |>
  plyr::ldply(.id = "locus", function(x) {
    data.frame(chi_sq = x$statistic,
               p_val = x$p.value)
  })

knitr::kable(head(hw_res))

locus	chi_sq	p_val
abcg8	4.2216592	0.3735626
cd27	0.4400000	1.0000000
chrna9	0.1106995	0.7162284
fetub	30.5555556	0.0001000
fn3krp	0.0478203	1.0000000
gadd45g	0.0548097	1.0000000

Merge results from heterozygosity and HWE:

merged_rs <-
  gen_div_clean |>
  dplyr::select(locus, Ho, Hs, Fis) |>
  left_join(hw_res, by = "locus")
knitr::kable(merged_rs, digits = 2)

locus	Ho	Hs	Fis	chi_sq	p_val
abcg8	0.65	0.60	-0.09	4.22	0.37
alkbh7	0.00	0.00	NaN	NA	NA
apeh17	0.00	0.00	NaN	NA	NA
cd27	0.18	0.17	-0.09	0.44	1.00
chrna9	0.39	0.41	0.06	0.11	0.72
dhrs3	0.00	0.00	NaN	NA	NA
fancg	0.00	0.00	NaN	NA	NA
fetub	0.91	0.50	-0.83	30.56	0.00
fn3krp	0.48	0.50	0.04	0.05	1.00
gadd45g	0.07	0.07	-0.02	0.05	1.00
il34	0.34	0.38	0.10	1.20	0.80
irf5	0.00	0.00	NaN	NA	NA
klc2	0.07	0.07	-0.02	0.05	1.00
mmp9	0.45	0.40	-0.14	2.21	0.66
ms4a2	0.00	0.00	NaN	NA	NA
mycbpap	0.00	0.00	NaN	NA	NA
nfkbia	0.52	0.54	0.03	0.72	0.89
npr2	0.30	0.34	0.14	0.71	0.37
p2rx1	0.41	0.42	0.03	0.01	1.00
pipox	0.00	0.00	NaN	NA	NA
ptgs2	0.00	0.00	NaN	NA	NA
rabac1	0.32	0.33	0.03	0.02	1.00
rgd735029	0.34	0.32	-0.08	1.01	1.00
rogdi	0.00	0.00	NaN	NA	NA
ssfa2	0.00	0.00	NaN	NA	NA
tmem87a	0.42	0.38	-0.09	4.60	0.17
usp20	0.30	0.25	-0.16	1.32	0.19

Results:

All loci had similar values of expected and observed heterozygosity.
All polymorphic loci had genotype frequencies under HWE expectations (all p-values > 0.05).

The only locus with an abnormal behaviour is fetub, which as described previously (Camacho-Sanchez et al. 2018), its high Ho seems to correspond to multiple loci.

Conclusion: there is no evidence of deviations in the genotype frequencies from null expectations, supporting the reliability on the genotype calls from tidyGenR.

Session Info

sessionInfo()

R version 4.4.0 (2024-04-24)
Platform: x86_64-apple-darwin20
Running under: macOS 15.7.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] es_ES.UTF-8/es_ES.UTF-8/es_ES.UTF-8/C/es_ES.UTF-8/es_ES.UTF-8

time zone: Europe/Berlin
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] tidyGenR_0.1.7   hierfstat_0.5-11 lubridate_1.9.4  forcats_1.0.1   
 [5] stringr_1.6.0    dplyr_1.2.0      purrr_1.2.1      readr_2.1.6     
 [9] tidyr_1.3.2      tibble_3.3.1     ggplot2_4.0.2    tidyverse_2.0.0 

loaded via a namespace (and not attached):
 [1] writexl_1.5.4               tidyselect_1.2.1           
 [3] farver_2.1.2                Biostrings_2.72.1          
 [5] S7_0.2.1                    bitops_1.0-9               
 [7] combinat_0.0-8              fastmap_1.2.0              
 [9] dada2_1.32.0                GenomicAlignments_1.40.0   
[11] digest_0.6.39               timechange_0.3.0           
[13] lifecycle_1.0.5             pwalign_1.0.0              
[15] gdata_3.0.1                 magrittr_2.0.4             
[17] compiler_4.4.0              rlang_1.1.7                
[19] tools_4.4.0                 yaml_2.3.12                
[21] knitr_1.51                  S4Arrays_1.4.1             
[23] htmlwidgets_1.6.4           interp_1.1-6               
[25] DelayedArray_0.30.1         plyr_1.8.9                 
[27] RColorBrewer_1.1-3          ShortRead_1.62.0           
[29] abind_1.4-8                 BiocParallel_1.38.0        
[31] withr_3.0.2                 hwriter_1.3.2.1            
[33] BiocGenerics_0.50.0         grid_4.4.0                 
[35] stats4_4.4.0                latticeExtra_0.6-31        
[37] colorspace_2.1-2            gtools_3.9.5               
[39] MASS_7.3-65                 scales_1.4.0               
[41] dichromat_2.0-0.1           SummarizedExperiment_1.34.0
[43] mvtnorm_1.3-3               cli_3.6.5                  
[45] rmarkdown_2.30              crayon_1.5.3               
[47] generics_0.1.4              RcppParallel_5.1.11-1      
[49] otel_0.2.0                  reshape2_1.4.5             
[51] httr_1.4.7                  tzdb_0.5.0                 
[53] genetics_1.3.8.1.3          DBI_1.2.3                  
[55] zlibbioc_1.50.0             parallel_4.4.0             
[57] XVector_0.44.0              matrixStats_1.5.0          
[59] vctrs_0.7.1                 Matrix_1.7-4               
[61] jsonlite_2.0.0              IRanges_2.38.1             
[63] hms_1.1.4                   patchwork_1.3.2            
[65] S4Vectors_0.42.1            jpeg_0.1-11                
[67] glue_1.8.0                  codetools_0.2-20           
[69] stringi_1.8.7               gtable_0.3.6               
[71] GenomeInfoDb_1.40.1         deldir_2.0-4               
[73] GenomicRanges_1.56.2        UCSC.utils_1.0.0           
[75] pillar_1.11.1               htmltools_0.5.9            
[77] GenomeInfoDbData_1.2.12     R6_2.6.1                   
[79] evaluate_1.0.5              Biobase_2.64.0             
[81] lattice_0.22-7              png_0.1-8                  
[83] Rsamtools_2.20.0            DECIPHER_3.0.0             
[85] Rcpp_1.1.1                  SparseArray_1.4.8          
[87] xfun_0.56                   MatrixGenerics_1.16.0      
[89] pkgconfig_2.0.3