# attach packages
library(tidyGenR)
library(dplyr)
library(writexl)
# load tidy variants dataset
tidy_variants <- readRDS("output/variants_x.rds")$ind_bs0Summary tables
tidyGenR includes several functions to explore the sequencing data and create summary tables from the sequencing data created along the workflow: reads_track() and reads_loci_samples(), dereplicate().
Track reads throught the workflow
The sequencing reads went through different steps in the workflow: raw -> demultiplexed -> truncated -> variant_calling At each step some data can be lost, edited, or filtered. This leads to a decrease of the number of reads from the initial step to the last. It is useful to track the reads through the pipeline to detect potential steps that might have gone wrong or could be optimized. We tracked the reads from the initial 44 F/R raw FASTQ to the filtered variants:
# create list with ordered elements:
l <-
list(
"raw" = c("data/raw",
"1.fastq"), # raw reads
"demultiplexed" = c("data/intermediate/demultiplexed",
"1.fastq.gz"),
"truncated" = c("data/intermediate/truncated",
"F_filt.fastq.gz"),
"supp_variants" = tidy_variants
)
# track reads
reads_tracked <- reads_track(l)
# glimpse
knitr::kable(head(reads_tracked, 3))| sample | raw | demultiplexed | truncated | supp_variants |
|---|---|---|---|---|
| BOR1061 | 86716 | 76207 | 45939 | 45459 |
| BOR1063 | 83553 | 73917 | 44798 | 44345 |
| BOR1069 | 84444 | 72497 | 43782 | 43379 |
Distribution of reads across samples and loci
The distribution of reads across samples and loci is not homogeneous. Indeed, it can be highly skewed towards some loci. Their distribution provides important information about potential biases and help to interpret the data.
We used reads_loci_samples() to get the depth of each locus and sample for demultiplexed reads, an was Demultiplexed reads can
# retrieve depth of demultiplexed reads per locus
reads_demultiplx <-
reads_loci_samples("data/intermediate/demultiplexed",
pattern_fq = "1.fastq.gz")'all.variants' and 'var_id' are ignored when setting 'path' to a folder.
# retrieve filtered variants reads per locus and sample.
# loci are split with one variant per column when 'all.variants = T'.
reads_variants <-
reads_loci_samples(tidy_variants,
all.variants = TRUE,
var_id = "variant")Dereplicated
De-replicated sequences are unique sequences in the read pool. Their count and distribution across FASTQ files can be retrieved and written to tables with dereplicate(). This step was performed with truncated FASTQ previous to variant calling with the aim to have tables to de-replicated frequencies to check for consistency against the variants called.
# files to dereplicate
fp <- "data/intermediate/truncated"
fs <- list.files(fp, pattern = "fastq.gz", full.names = T)
drep_xls <- "output/dereplicated-truncated.xlsx"
# run dereplicate
drep_tab <-
dereplicate(fs,
min_sam_fr = 5, # a 5x is needed for a read count to be output in the table.
min_loc_fr = 0.002, # min fr of de-replicated sequence across all samples in the group.
by = "_([a-zA-Z0-9]*_[F|R])", # group by locus + F/R
out_xlsx = drep_xls)De-replicated sequences written to output/dereplicated-truncated.xlsx.
If the number of sheets in too large EXCEL might crash at opening.
# glimpse
knitr::kable(head(drep_tab[[1]], 3))| sequence | md5 | BOR1061_abcg8_F_filt.fastq.gz | BOR1063_abcg8_F_filt.fastq.gz | BOR1069_abcg8_F_filt.fastq.gz | BOR1070_abcg8_F_filt.fastq.gz | BOR1071_abcg8_F_filt.fastq.gz | BOR1074_abcg8_F_filt.fastq.gz | BOR1075_abcg8_F_filt.fastq.gz | BOR1076_abcg8_F_filt.fastq.gz | BOR1077_abcg8_F_filt.fastq.gz | BOR1079_abcg8_F_filt.fastq.gz | BOR1080_abcg8_F_filt.fastq.gz | BOR1082_abcg8_F_filt.fastq.gz | BOR1086_abcg8_F_filt.fastq.gz | BOR1088_abcg8_F_filt.fastq.gz | BOR1090_abcg8_F_filt.fastq.gz | BOR1091_abcg8_F_filt.fastq.gz | BOR1097_abcg8_F_filt.fastq.gz | BOR1098_abcg8_F_filt.fastq.gz | BOR1101_abcg8_F_filt.fastq.gz | BOR1107_abcg8_F_filt.fastq.gz | BOR1111_abcg8_F_filt.fastq.gz | BOR1113_abcg8_F_filt.fastq.gz | BOR1115_abcg8_F_filt.fastq.gz | BOR1125_abcg8_F_filt.fastq.gz | BOR1127_abcg8_F_filt.fastq.gz | BOR1129_abcg8_F_filt.fastq.gz | BOR1131_abcg8_F_filt.fastq.gz | BOR1133_abcg8_F_filt.fastq.gz | BOR1135_abcg8_F_filt.fastq.gz | BOR1137_abcg8_F_filt.fastq.gz | BOR1139_abcg8_F_filt.fastq.gz | BOR1141_abcg8_F_filt.fastq.gz | BOR1143_abcg8_F_filt.fastq.gz | BOR1147_abcg8_F_filt.fastq.gz | BOR1149_abcg8_F_filt.fastq.gz | BOR1151_abcg8_F_filt.fastq.gz | BOR1155_abcg8_F_filt.fastq.gz | BOR614_abcg8_F_filt.fastq.gz | BOR615_abcg8_F_filt.fastq.gz | BOR616_abcg8_F_filt.fastq.gz | BOR618_abcg8_F_filt.fastq.gz | BOR619_abcg8_F_filt.fastq.gz | BOR621_abcg8_F_filt.fastq.gz | BOR626_abcg8_F_filt.fastq.gz |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TTGCCCACCCTGTTCATCCATGGAGCAGAAGCCTGCCTGATGTCTCTCATCATTGGCTTCCTTTACTACGGCCACGCAGACAAGCCGCTCTCCTTCGTGGACATGGCAGCCCTCCTGTTCATGATAGGAGCGCTCATTCCTTTTAATGTCATTCTGGATGTCGTCTCCAAATGTGAGTGTCACCCGCCCTCCTCACCAGACATCGGGACAGTGGGACAGCCTCCCTGGGCACTGCACTGAGGCCAAGCTCTGTGCTTCCGCTGGTACCCA | 79cdeae19796f127012f5a6d9d8fe5ba | 0 | 0 | 332 | 261 | 281 | 261 | 461 | 329 | 197 | 441 | 222 | 555 | 196 | 374 | 0 | 227 | 490 | 633 | 347 | 556 | 0 | 0 | 0 | 0 | 0 | 388 | 341 | 618 | 318 | 665 | 323 | 0 | 330 | 238 | 0 | 577 | 264 | 454 | 494 | 569 | 1123 | 328 | 311 | 205 |
| TTGCCCACCCTGTTCATCCATGGAGCAGAAGCCTGCCTGATGTCTCTCATCATTGGCTTCCTTTACTACGGCCACGCAGATAAGCCGCTCTCCTTCGTGGACATGGCAGCCCTCCTGTTCATGATAGGAGCGCTCATTCCTTTTAATGTCATTCTGGATGTCGTCTCCAAATGTGAGTGTCACCCGCCCTCCTCACCAGACATCGGGACAGTGGGACAGCCTCCCTGGGCACTGCACTGAGGCCAAGCTCTGTGCTTCCGCTGGTACCCA | 559fd72c1b84037c883fdd6053482736 | 680 | 657 | 306 | 264 | 229 | 273 | 0 | 0 | 198 | 0 | 0 | 0 | 0 | 0 | 251 | 251 | 0 | 0 | 333 | 0 | 605 | 264 | 0 | 405 | 388 | 390 | 365 | 0 | 298 | 0 | 336 | 332 | 132 | 236 | 204 | 0 | 207 | 484 | 493 | 489 | 0 | 325 | 259 | 197 |
| TTGCCCACCCTGTTCATCCATGGAGCAGAAGCCTGCCTGATGTCTCTCATCATTGGCTTCCTTTACTACGGCCACGCAGACAAGCCGCTCTCCTTCGTGGACATGGCAGCCCTCCTGTTCATGATAGGAGCTCTCATTCCTTTTAATGTCATTCTGGATGTCGTCTCCAAATGTGAGTGTCACCCGCCCTCCTCACCAGACATCGGGACAGTGGGACAGCCTCCCTGGGCACTGCACTGAGGCCAAGCTCTGTGCTTCCGCTGGTACCCA | f9c9a80d4e1fa27cd4ce16e0cb9f6da9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 334 | 0 | 0 | 227 | 0 | 210 | 0 | 244 | 0 | 0 | 0 | 0 | 0 | 0 | 305 | 601 | 390 | 368 | 0 | 0 | 0 | 0 | 0 | 0 | 332 | 202 | 0 | 223 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Agglomerate tables in an EXCEL
Tables above were combined in a unique EXCEL file that will be included in the manuscript as supplementary material.
# ST1. tracked reads.
# ST2. demultiplexed reads.
# ST3. reads supporting variants.
suppl_tables <-
list(
"ST1. Tracked reads" = reads_tracked,
"ST2. Demultiplexed- per locus" = reads_demultiplx,
"ST3. Filtered variants" = reads_variants
)
write_xlsx(suppl_tables,
path = "output/supplementary_tables.xlsx")Session Info
sessionInfo()R version 4.4.0 (2024-04-24)
Platform: x86_64-apple-darwin20
Running under: macOS 15.7.2
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] es_ES.UTF-8/es_ES.UTF-8/es_ES.UTF-8/C/es_ES.UTF-8/es_ES.UTF-8
time zone: Europe/Berlin
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] writexl_1.5.4 dplyr_1.2.0 tidyGenR_0.1.7
loaded via a namespace (and not attached):
[1] SummarizedExperiment_1.34.0 gtable_0.3.6
[3] xfun_0.56 hwriter_1.3.2.1
[5] ggplot2_4.0.2 htmlwidgets_1.6.4
[7] latticeExtra_0.6-31 Biobase_2.64.0
[9] lattice_0.22-7 tzdb_0.5.0
[11] vctrs_0.7.1 tools_4.4.0
[13] bitops_1.0-9 generics_0.1.4
[15] stats4_4.4.0 parallel_4.4.0
[17] tibble_3.3.1 pkgconfig_2.0.3
[19] Matrix_1.7-4 RColorBrewer_1.1-3
[21] S7_0.2.1 S4Vectors_0.42.1
[23] RcppParallel_5.1.11-1 lifecycle_1.0.5
[25] GenomeInfoDbData_1.2.12 stringr_1.6.0
[27] deldir_2.0-4 compiler_4.4.0
[29] farver_2.1.2 Rsamtools_2.20.0
[31] Biostrings_2.72.1 dada2_1.32.0
[33] codetools_0.2-20 DECIPHER_3.0.0
[35] GenomeInfoDb_1.40.1 htmltools_0.5.9
[37] yaml_2.3.12 tidyr_1.3.2
[39] pillar_1.11.1 crayon_1.5.3
[41] BiocParallel_1.38.0 DelayedArray_0.30.1
[43] ShortRead_1.62.0 abind_1.4-8
[45] tidyselect_1.2.1 digest_0.6.39
[47] stringi_1.8.7 purrr_1.2.1
[49] reshape2_1.4.5 fastmap_1.2.0
[51] grid_4.4.0 colorspace_2.1-2
[53] cli_3.6.5 SparseArray_1.4.8
[55] magrittr_2.0.4 patchwork_1.3.2
[57] S4Arrays_1.4.1 dichromat_2.0-0.1
[59] withr_3.0.2 readr_2.1.6
[61] scales_1.4.0 UCSC.utils_1.0.0
[63] rmarkdown_2.30 pwalign_1.0.0
[65] XVector_0.44.0 httr_1.4.7
[67] jpeg_0.1-11 matrixStats_1.5.0
[69] interp_1.1-6 otel_0.2.0
[71] hms_1.1.4 png_0.1-8
[73] evaluate_1.0.5 knitr_1.51
[75] GenomicRanges_1.56.2 IRanges_2.38.1
[77] rlang_1.1.7 Rcpp_1.1.1
[79] glue_1.8.0 DBI_1.2.3
[81] BiocGenerics_0.50.0 jsonlite_2.0.0
[83] plyr_1.8.9 R6_2.6.1
[85] MatrixGenerics_1.16.0 GenomicAlignments_1.40.0
[87] zlibbioc_1.50.0