Code
# Attach libraries and functions:
library(tidyverse)
library(hierfstat)
library(tidyGenR)
source("code/gen2hierfstat.R")Miguel Camacho Sánchez
Several approaches were used to assess the biological plausibility of the genotypes inferred by tidyGenR.
The recurrence of each inferred allele across samples was evaluated, as alleles observed in multiple individuals are less likely to represent spurious PCR artifacts (Sommer et al., 2013). For each allele, the number of samples in which it was detected was computed.
gen <- readRDS("output/genotypes_x.rds")$ind_bs0
allele_rec <-
gen |>
dplyr::select(sample, locus, allele) |>
group_by(locus, allele) |>
reframe(frequency = n(),
recurrence = length(unique(sample))
) |>
arrange(frequency)
knitr::kable(head(allele_rec))| locus | allele | frequency | recurrence |
|---|---|---|---|
| tmem87a | 03 | 1 | 1 |
| tmem87a | 04 | 1 | 1 |
| il34 | 03 | 2 | 2 |
| gadd45g | 02 | 3 | 3 |
| klc2 | 02 | 3 | 3 |
| mmp9 | 02 | 4 | 4 |
Only 2 allele/s were present in an unique sample, whereas the remaining 48 alleles were present in more than one sample.
Inferred haplotypes were compared against previously published sequence data for the same species but other populations (Camacho-Sanchez et al., 2018, to assess concordance with independently generated datasets.
# genotype data from Camacho-Sanchez et al. 2026, including other populations of R. baluensis.
# trusmadi would correspond to the the data from here.
# reformat data
gen26 <-
readRDS("data/raw/genotypes_camacho-2026.rds") |>
filter(locus != "fetub") |>
rowwise() |>
mutate(loc_al = paste0(locus, allele)) |>
select(population, loc_al) |>
distinct()
# alleles for trusmadi
al_tm <-
gen26 |>
filter(population == "trusmadi") |>
pull(loc_al)
# alleles for non-trusmadi
al_others <-
gen26 |>
filter(population != "trusmadi") |>
pull(loc_al)
# shared
al_shared <-
intersect(al_tm, al_others)
# unique to trusmadi
al_unique <-
setdiff(al_tm, al_others)A total of 32 haplotypes were shared with previously published sequences, whereas 16 were unique to this dataset.
Finally, departures from Hardy–Weinberg equilibrium (HWE) were evaluated as a downstream diagnostic of genotype reliability. As all samples analysed here originate from a single genetic population (Camacho-Sanchez et al., 2026), systematic locus-wise deviations from equilibrium expectations may indicate genotyping artefacts.
Observed (Ho) and expected (He) heterozygosities were estimated for each locus, and the inbreeding coefficient (Fis).
# Ho, He and Fis per locus
# convert to hierfstat
genhf <-
gen |>
mutate(population = "trusmadi") |> # population col is mandatory for gen_tidy2hierfstat
gen_tidy2hierfstat()
# compute stats
gen_div <-
genhf |>
dplyr::select(-sample, -population) |>
basic.stats()
# clean table
gen_div_clean <-
gen_div$perloc |>
select(Ho, Hs, Fis) |>
mutate_all(~round(.x, 3)) |>
rownames_to_column("locus")
knitr::kable(head(gen_div_clean))| locus | Ho | Hs | Fis |
|---|---|---|---|
| abcg8 | 0.651 | 0.598 | -0.088 |
| alkbh7 | 0.000 | 0.000 | NaN |
| apeh17 | 0.000 | 0.000 | NaN |
| cd27 | 0.182 | 0.167 | -0.089 |
| chrna9 | 0.386 | 0.412 | 0.062 |
| dhrs3 | 0.000 | 0.000 | NaN |
Departures from HWE were further assessed using chi-squared tests on the observed and expected genotypes:
# reformat genotypes for genetics package.
hwdf <-
gen |>
gen_tidy2integers() |>
gen_tidy2compact()
loci <-
unique(hwdf$locus)
# loop to compute hwe chisq per locus
hw_all_loci <-
loci |>
lapply(function(loc_i) {
locus_i <-
filter(hwdf, locus == loc_i) |>
pull(genotype)
lgen <-
genetics::genotype(locus_i)
if (length(attr(lgen, "allele.names")) > 1) {
genetics::HWE.chisq(lgen)
} else {
NULL
}
}) |>
setNames(loci)
# remove monomorphic
hw_all_loci <-
hw_all_loci[!sapply(hw_all_loci, is.null)]
# reformat hw table
hw_res <-
hw_all_loci |>
plyr::ldply(.id = "locus", function(x) {
data.frame(chi_sq = x$statistic,
p_val = x$p.value)
})
knitr::kable(head(hw_res))| locus | chi_sq | p_val |
|---|---|---|
| abcg8 | 4.2216592 | 0.3735626 |
| cd27 | 0.4400000 | 1.0000000 |
| chrna9 | 0.1106995 | 0.7162284 |
| fetub | 30.5555556 | 0.0001000 |
| fn3krp | 0.0478203 | 1.0000000 |
| gadd45g | 0.0548097 | 1.0000000 |
Merge results from heterozygosity and HWE:
merged_rs <-
gen_div_clean |>
dplyr::select(locus, Ho, Hs, Fis) |>
left_join(hw_res, by = "locus")
knitr::kable(merged_rs, digits = 2)| locus | Ho | Hs | Fis | chi_sq | p_val |
|---|---|---|---|---|---|
| abcg8 | 0.65 | 0.60 | -0.09 | 4.22 | 0.37 |
| alkbh7 | 0.00 | 0.00 | NaN | NA | NA |
| apeh17 | 0.00 | 0.00 | NaN | NA | NA |
| cd27 | 0.18 | 0.17 | -0.09 | 0.44 | 1.00 |
| chrna9 | 0.39 | 0.41 | 0.06 | 0.11 | 0.72 |
| dhrs3 | 0.00 | 0.00 | NaN | NA | NA |
| fancg | 0.00 | 0.00 | NaN | NA | NA |
| fetub | 0.91 | 0.50 | -0.83 | 30.56 | 0.00 |
| fn3krp | 0.48 | 0.50 | 0.04 | 0.05 | 1.00 |
| gadd45g | 0.07 | 0.07 | -0.02 | 0.05 | 1.00 |
| il34 | 0.34 | 0.38 | 0.10 | 1.20 | 0.80 |
| irf5 | 0.00 | 0.00 | NaN | NA | NA |
| klc2 | 0.07 | 0.07 | -0.02 | 0.05 | 1.00 |
| mmp9 | 0.45 | 0.40 | -0.14 | 2.21 | 0.66 |
| ms4a2 | 0.00 | 0.00 | NaN | NA | NA |
| mycbpap | 0.00 | 0.00 | NaN | NA | NA |
| nfkbia | 0.52 | 0.54 | 0.03 | 0.72 | 0.89 |
| npr2 | 0.30 | 0.34 | 0.14 | 0.71 | 0.37 |
| p2rx1 | 0.41 | 0.42 | 0.03 | 0.01 | 1.00 |
| pipox | 0.00 | 0.00 | NaN | NA | NA |
| ptgs2 | 0.00 | 0.00 | NaN | NA | NA |
| rabac1 | 0.32 | 0.33 | 0.03 | 0.02 | 1.00 |
| rgd735029 | 0.34 | 0.32 | -0.08 | 1.01 | 1.00 |
| rogdi | 0.00 | 0.00 | NaN | NA | NA |
| ssfa2 | 0.00 | 0.00 | NaN | NA | NA |
| tmem87a | 0.42 | 0.38 | -0.09 | 4.60 | 0.17 |
| usp20 | 0.30 | 0.25 | -0.16 | 1.32 | 0.19 |
Results:
All loci had similar values of expected and observed heterozygosity.
All polymorphic loci had genotype frequencies under HWE expectations (all p-values > 0.05).
The only locus with an abnormal behaviour is fetub, which as described previously (Camacho-Sanchez et al. 2018), its high Ho seems to correspond to multiple loci.
Conclusion: there is no evidence of deviations in the genotype frequencies from null expectations, supporting the reliability on the genotype calls from tidyGenR.
R version 4.4.0 (2024-04-24)
Platform: x86_64-apple-darwin20
Running under: macOS 15.7.2
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] es_ES.UTF-8/es_ES.UTF-8/es_ES.UTF-8/C/es_ES.UTF-8/es_ES.UTF-8
time zone: Europe/Berlin
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] tidyGenR_0.1.7 hierfstat_0.5-11 lubridate_1.9.4 forcats_1.0.1
[5] stringr_1.6.0 dplyr_1.2.0 purrr_1.2.1 readr_2.1.6
[9] tidyr_1.3.2 tibble_3.3.1 ggplot2_4.0.2 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] writexl_1.5.4 tidyselect_1.2.1
[3] farver_2.1.2 Biostrings_2.72.1
[5] S7_0.2.1 bitops_1.0-9
[7] combinat_0.0-8 fastmap_1.2.0
[9] dada2_1.32.0 GenomicAlignments_1.40.0
[11] digest_0.6.39 timechange_0.3.0
[13] lifecycle_1.0.5 pwalign_1.0.0
[15] gdata_3.0.1 magrittr_2.0.4
[17] compiler_4.4.0 rlang_1.1.7
[19] tools_4.4.0 yaml_2.3.12
[21] knitr_1.51 S4Arrays_1.4.1
[23] htmlwidgets_1.6.4 interp_1.1-6
[25] DelayedArray_0.30.1 plyr_1.8.9
[27] RColorBrewer_1.1-3 ShortRead_1.62.0
[29] abind_1.4-8 BiocParallel_1.38.0
[31] withr_3.0.2 hwriter_1.3.2.1
[33] BiocGenerics_0.50.0 grid_4.4.0
[35] stats4_4.4.0 latticeExtra_0.6-31
[37] colorspace_2.1-2 gtools_3.9.5
[39] MASS_7.3-65 scales_1.4.0
[41] dichromat_2.0-0.1 SummarizedExperiment_1.34.0
[43] mvtnorm_1.3-3 cli_3.6.5
[45] rmarkdown_2.30 crayon_1.5.3
[47] generics_0.1.4 RcppParallel_5.1.11-1
[49] otel_0.2.0 reshape2_1.4.5
[51] httr_1.4.7 tzdb_0.5.0
[53] genetics_1.3.8.1.3 DBI_1.2.3
[55] zlibbioc_1.50.0 parallel_4.4.0
[57] XVector_0.44.0 matrixStats_1.5.0
[59] vctrs_0.7.1 Matrix_1.7-4
[61] jsonlite_2.0.0 IRanges_2.38.1
[63] hms_1.1.4 patchwork_1.3.2
[65] S4Vectors_0.42.1 jpeg_0.1-11
[67] glue_1.8.0 codetools_0.2-20
[69] stringi_1.8.7 gtable_0.3.6
[71] GenomeInfoDb_1.40.1 deldir_2.0-4
[73] GenomicRanges_1.56.2 UCSC.utils_1.0.0
[75] pillar_1.11.1 htmltools_0.5.9
[77] GenomeInfoDbData_1.2.12 R6_2.6.1
[79] evaluate_1.0.5 Biobase_2.64.0
[81] lattice_0.22-7 png_0.1-8
[83] Rsamtools_2.20.0 DECIPHER_3.0.0
[85] Rcpp_1.1.1 SparseArray_1.4.8
[87] xfun_0.56 MatrixGenerics_1.16.0
[89] pkgconfig_2.0.3
---
author: Miguel Camacho Sánchez
title: "Biological plausibility"
---
```{r}
#| echo: false
#| include: false
#knitr::opts_knit$set(root.dir = "../")
```
```{r}
#| message: false
#| warning: false
#| eval: true
#| code-fold: true
# Attach libraries and functions:
library(tidyverse)
library(hierfstat)
library(tidyGenR)
source("code/gen2hierfstat.R")
```
## Reliability of the inferred genotypes
Several approaches were used to assess the biological plausibility of the genotypes inferred by tidyGenR.
### Allele recurrence
The recurrence of each inferred allele across samples was evaluated, as alleles observed in multiple individuals are less likely to represent spurious PCR artifacts ([Sommer et al., 2013](https://doi.org/10.1186/1471-2164-14-542)). For each allele, the number of samples in which it was detected was computed.
```{r}
#| label: allele-recurrence
#| warning: false
#| message: false
gen <- readRDS("output/genotypes_x.rds")$ind_bs0
allele_rec <-
gen |>
dplyr::select(sample, locus, allele) |>
group_by(locus, allele) |>
reframe(frequency = n(),
recurrence = length(unique(sample))
) |>
arrange(frequency)
knitr::kable(head(allele_rec))
```
Only `r sum(allele_rec$recurrence == 1 )` allele/s were present in an unique sample, whereas the remaining `r sum(allele_rec$recurrence > 1 )` alleles were present in more than one sample.
### Haplotype concordance
Inferred haplotypes were compared against previously published sequence data for the same species but other populations ([Camacho-Sanchez et al., 2018](https://doi.org/10.1111/ddi.12761Digital%20Object%20Identifier%20(DOI)), to assess concordance with independently generated datasets.
```{r}
#| label: haplotype-concordance
#| warning: false
#| message: false
# genotype data from Camacho-Sanchez et al. 2026, including other populations of R. baluensis.
# trusmadi would correspond to the the data from here.
# reformat data
gen26 <-
readRDS("data/raw/genotypes_camacho-2026.rds") |>
filter(locus != "fetub") |>
rowwise() |>
mutate(loc_al = paste0(locus, allele)) |>
select(population, loc_al) |>
distinct()
# alleles for trusmadi
al_tm <-
gen26 |>
filter(population == "trusmadi") |>
pull(loc_al)
# alleles for non-trusmadi
al_others <-
gen26 |>
filter(population != "trusmadi") |>
pull(loc_al)
# shared
al_shared <-
intersect(al_tm, al_others)
# unique to trusmadi
al_unique <-
setdiff(al_tm, al_others)
```
A total of `r length(al_shared)` haplotypes were shared with previously published sequences, whereas `r length(al_unique)` were unique to this dataset.
### Heterozygosity
Finally, departures from Hardy–Weinberg equilibrium (HWE) were evaluated as a downstream diagnostic of genotype reliability. As all samples analysed here originate from a single genetic population ([Camacho-Sanchez et al., 2026](https://doi.org/10.1093/jhered/esaf106)), systematic locus-wise deviations from equilibrium expectations may indicate genotyping artefacts.
Observed (*Ho*) and expected (*He*) heterozygosities were estimated for each locus, and the inbreeding coefficient (*Fis*).
```{r}
#| label: heterozygosity
#| warning: false
#| message: false
# Ho, He and Fis per locus
# convert to hierfstat
genhf <-
gen |>
mutate(population = "trusmadi") |> # population col is mandatory for gen_tidy2hierfstat
gen_tidy2hierfstat()
# compute stats
gen_div <-
genhf |>
dplyr::select(-sample, -population) |>
basic.stats()
# clean table
gen_div_clean <-
gen_div$perloc |>
select(Ho, Hs, Fis) |>
mutate_all(~round(.x, 3)) |>
rownames_to_column("locus")
knitr::kable(head(gen_div_clean))
```
### HWE
Departures from HWE were further assessed using chi-squared tests on the observed and expected genotypes:
```{r}
#| label: hwe-tests
#| warning: false
#| message: false
# reformat genotypes for genetics package.
hwdf <-
gen |>
gen_tidy2integers() |>
gen_tidy2compact()
loci <-
unique(hwdf$locus)
# loop to compute hwe chisq per locus
hw_all_loci <-
loci |>
lapply(function(loc_i) {
locus_i <-
filter(hwdf, locus == loc_i) |>
pull(genotype)
lgen <-
genetics::genotype(locus_i)
if (length(attr(lgen, "allele.names")) > 1) {
genetics::HWE.chisq(lgen)
} else {
NULL
}
}) |>
setNames(loci)
# remove monomorphic
hw_all_loci <-
hw_all_loci[!sapply(hw_all_loci, is.null)]
# reformat hw table
hw_res <-
hw_all_loci |>
plyr::ldply(.id = "locus", function(x) {
data.frame(chi_sq = x$statistic,
p_val = x$p.value)
})
knitr::kable(head(hw_res))
```
Merge results from heterozygosity and HWE:
```{r}
#| message: false
merged_rs <-
gen_div_clean |>
dplyr::select(locus, Ho, Hs, Fis) |>
left_join(hw_res, by = "locus")
knitr::kable(merged_rs, digits = 2)
```
```{r}
#| eval: true
#| echo: false
writexl::write_xlsx(merged_rs, "output/heterozygosity.xlsx")
```
Results:
- All loci had similar values of expected and observed heterozygosity.
- All polymorphic loci had genotype frequencies under HWE expectations (all p-values \> 0.05).
The only locus with an abnormal behaviour is *fetub*, which as described previously (Camacho-Sanchez et al. 2018), its high *Ho* seems to correspond to multiple loci.
**Conclusion**: there is no evidence of deviations in the genotype frequencies from null expectations, supporting the reliability on the genotype calls from *tidyGenR*.
# Session Info
```{r}
sessionInfo()
```