Summary tables

Author

Miguel Camacho Sánchez

tidyGenR includes several functions to explore the sequencing data and create summary tables from the sequencing data created along the workflow: reads_track() and reads_loci_samples(), dereplicate().

# attach packages
library(tidyGenR)
library(dplyr)
library(writexl)

# load tidy variants dataset
tidy_variants <- readRDS("output/variants_x.rds")$ind_bs0

Track reads throught the workflow

The sequencing reads went through different steps in the workflow: raw -> demultiplexed -> truncated -> variant_calling At each step some data can be lost, edited, or filtered. This leads to a decrease of the number of reads from the initial step to the last. It is useful to track the reads through the pipeline to detect potential steps that might have gone wrong or could be optimized. We tracked the reads from the initial 44 F/R raw FASTQ to the filtered variants:

# create list with ordered elements:
l <-
  list(
  "raw"           = c("data/raw",
                      "1.fastq"), # raw reads
  "demultiplexed" = c("data/intermediate/demultiplexed",
                      "1.fastq.gz"),
  "truncated"     = c("data/intermediate/truncated",
                      "F_filt.fastq.gz"),
  "supp_variants" = tidy_variants
  )
# track reads
reads_tracked <- reads_track(l)

# glimpse
knitr::kable(head(reads_tracked, 3))

sample	raw	demultiplexed	truncated	supp_variants
BOR1061	86716	76207	45939	45459
BOR1063	83553	73917	44798	44345
BOR1069	84444	72497	43782	43379

Distribution of reads across samples and loci

The distribution of reads across samples and loci is not homogeneous. Indeed, it can be highly skewed towards some loci. Their distribution provides important information about potential biases and help to interpret the data.

We used reads_loci_samples() to get the depth of each locus and sample for demultiplexed reads, an was Demultiplexed reads can

# retrieve depth of demultiplexed reads per locus
reads_demultiplx <-
  reads_loci_samples("data/intermediate/demultiplexed",
                     pattern_fq = "1.fastq.gz")

'all.variants' and 'var_id' are ignored when setting 'path' to a folder.

# retrieve filtered variants reads per locus and sample.
#   loci are split with one variant per column when 'all.variants = T'.
reads_variants <-
  reads_loci_samples(tidy_variants,
                     all.variants = TRUE,
                     var_id = "variant")

Dereplicated

De-replicated sequences are unique sequences in the read pool. Their count and distribution across FASTQ files can be retrieved and written to tables with dereplicate(). This step was performed with truncated FASTQ previous to variant calling with the aim to have tables to de-replicated frequencies to check for consistency against the variants called.

# files to dereplicate
fp <- "data/intermediate/truncated"
fs <- list.files(fp, pattern = "fastq.gz", full.names = T)
drep_xls <- "output/dereplicated-truncated.xlsx"
# run dereplicate
drep_tab <-
  dereplicate(fs,
            min_sam_fr = 5, # a 5x is needed for a read count to be output in the table.
            min_loc_fr = 0.002, # min fr of de-replicated sequence across all samples in the group.
            by = "_([a-zA-Z0-9]*_[F|R])", # group by locus + F/R
            out_xlsx = drep_xls)

De-replicated sequences written to output/dereplicated-truncated.xlsx.
If the number of sheets in too large EXCEL might crash at opening.

# glimpse
knitr::kable(head(drep_tab[[1]], 3))

sequence	md5	BOR1061_abcg8_F_filt.fastq.gz	BOR1063_abcg8_F_filt.fastq.gz	BOR1069_abcg8_F_filt.fastq.gz	BOR1070_abcg8_F_filt.fastq.gz	BOR1071_abcg8_F_filt.fastq.gz	BOR1074_abcg8_F_filt.fastq.gz	BOR1075_abcg8_F_filt.fastq.gz	BOR1076_abcg8_F_filt.fastq.gz	BOR1077_abcg8_F_filt.fastq.gz	BOR1079_abcg8_F_filt.fastq.gz	BOR1080_abcg8_F_filt.fastq.gz	BOR1082_abcg8_F_filt.fastq.gz	BOR1086_abcg8_F_filt.fastq.gz	BOR1088_abcg8_F_filt.fastq.gz	BOR1090_abcg8_F_filt.fastq.gz	BOR1091_abcg8_F_filt.fastq.gz	BOR1097_abcg8_F_filt.fastq.gz	BOR1098_abcg8_F_filt.fastq.gz	BOR1101_abcg8_F_filt.fastq.gz	BOR1107_abcg8_F_filt.fastq.gz	BOR1111_abcg8_F_filt.fastq.gz	BOR1113_abcg8_F_filt.fastq.gz	BOR1115_abcg8_F_filt.fastq.gz	BOR1125_abcg8_F_filt.fastq.gz	BOR1127_abcg8_F_filt.fastq.gz	BOR1129_abcg8_F_filt.fastq.gz	BOR1131_abcg8_F_filt.fastq.gz	BOR1133_abcg8_F_filt.fastq.gz	BOR1135_abcg8_F_filt.fastq.gz	BOR1137_abcg8_F_filt.fastq.gz	BOR1139_abcg8_F_filt.fastq.gz	BOR1141_abcg8_F_filt.fastq.gz	BOR1143_abcg8_F_filt.fastq.gz	BOR1147_abcg8_F_filt.fastq.gz	BOR1149_abcg8_F_filt.fastq.gz	BOR1151_abcg8_F_filt.fastq.gz	BOR1155_abcg8_F_filt.fastq.gz	BOR614_abcg8_F_filt.fastq.gz	BOR615_abcg8_F_filt.fastq.gz	BOR616_abcg8_F_filt.fastq.gz	BOR618_abcg8_F_filt.fastq.gz	BOR619_abcg8_F_filt.fastq.gz	BOR621_abcg8_F_filt.fastq.gz	BOR626_abcg8_F_filt.fastq.gz
TTGCCCACCCTGTTCATCCATGGAGCAGAAGCCTGCCTGATGTCTCTCATCATTGGCTTCCTTTACTACGGCCACGCAGACAAGCCGCTCTCCTTCGTGGACATGGCAGCCCTCCTGTTCATGATAGGAGCGCTCATTCCTTTTAATGTCATTCTGGATGTCGTCTCCAAATGTGAGTGTCACCCGCCCTCCTCACCAGACATCGGGACAGTGGGACAGCCTCCCTGGGCACTGCACTGAGGCCAAGCTCTGTGCTTCCGCTGGTACCCA	79cdeae19796f127012f5a6d9d8fe5ba	0	0	332	261	281	261	461	329	197	441	222	555	196	374	0	227	490	633	347	556	0	0	0	0	0	388	341	618	318	665	323	0	330	238	0	577	264	454	494	569	1123	328	311	205
TTGCCCACCCTGTTCATCCATGGAGCAGAAGCCTGCCTGATGTCTCTCATCATTGGCTTCCTTTACTACGGCCACGCAGATAAGCCGCTCTCCTTCGTGGACATGGCAGCCCTCCTGTTCATGATAGGAGCGCTCATTCCTTTTAATGTCATTCTGGATGTCGTCTCCAAATGTGAGTGTCACCCGCCCTCCTCACCAGACATCGGGACAGTGGGACAGCCTCCCTGGGCACTGCACTGAGGCCAAGCTCTGTGCTTCCGCTGGTACCCA	559fd72c1b84037c883fdd6053482736	680	657	306	264	229	273	0	0	198	0	0	0	0	0	251	251	0	0	333	0	605	264	0	405	388	390	365	0	298	0	336	332	132	236	204	0	207	484	493	489	0	325	259	197
TTGCCCACCCTGTTCATCCATGGAGCAGAAGCCTGCCTGATGTCTCTCATCATTGGCTTCCTTTACTACGGCCACGCAGACAAGCCGCTCTCCTTCGTGGACATGGCAGCCCTCCTGTTCATGATAGGAGCTCTCATTCCTTTTAATGTCATTCTGGATGTCGTCTCCAAATGTGAGTGTCACCCGCCCTCCTCACCAGACATCGGGACAGTGGGACAGCCTCCCTGGGCACTGCACTGAGGCCAAGCTCTGTGCTTCCGCTGGTACCCA	f9c9a80d4e1fa27cd4ce16e0cb9f6da9	0	0	0	0	0	0	0	334	0	0	227	0	210	0	244	0	0	0	0	0	0	305	601	390	368	0	0	0	0	0	0	332	202	0	223	0	0	0	0	0	0	0	0	0

Agglomerate tables in an EXCEL

Tables above were combined in a unique EXCEL file that will be included in the manuscript as supplementary material.

# ST1. tracked reads.
# ST2. demultiplexed reads.
# ST3. reads supporting variants.
suppl_tables <-
  list(
    "ST1. Tracked reads" = reads_tracked,
    "ST2. Demultiplexed- per locus" = reads_demultiplx,
    "ST3. Filtered variants" = reads_variants
  )
write_xlsx(suppl_tables,
           path = "output/supplementary_tables.xlsx")

Session Info

sessionInfo()

R version 4.4.0 (2024-04-24)
Platform: x86_64-apple-darwin20
Running under: macOS 15.7.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] es_ES.UTF-8/es_ES.UTF-8/es_ES.UTF-8/C/es_ES.UTF-8/es_ES.UTF-8

time zone: Europe/Berlin
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] writexl_1.5.4  dplyr_1.2.0    tidyGenR_0.1.7

loaded via a namespace (and not attached):
 [1] SummarizedExperiment_1.34.0 gtable_0.3.6               
 [3] xfun_0.56                   hwriter_1.3.2.1            
 [5] ggplot2_4.0.2               htmlwidgets_1.6.4          
 [7] latticeExtra_0.6-31         Biobase_2.64.0             
 [9] lattice_0.22-7              tzdb_0.5.0                 
[11] vctrs_0.7.1                 tools_4.4.0                
[13] bitops_1.0-9                generics_0.1.4             
[15] stats4_4.4.0                parallel_4.4.0             
[17] tibble_3.3.1                pkgconfig_2.0.3            
[19] Matrix_1.7-4                RColorBrewer_1.1-3         
[21] S7_0.2.1                    S4Vectors_0.42.1           
[23] RcppParallel_5.1.11-1       lifecycle_1.0.5            
[25] GenomeInfoDbData_1.2.12     stringr_1.6.0              
[27] deldir_2.0-4                compiler_4.4.0             
[29] farver_2.1.2                Rsamtools_2.20.0           
[31] Biostrings_2.72.1           dada2_1.32.0               
[33] codetools_0.2-20            DECIPHER_3.0.0             
[35] GenomeInfoDb_1.40.1         htmltools_0.5.9            
[37] yaml_2.3.12                 tidyr_1.3.2                
[39] pillar_1.11.1               crayon_1.5.3               
[41] BiocParallel_1.38.0         DelayedArray_0.30.1        
[43] ShortRead_1.62.0            abind_1.4-8                
[45] tidyselect_1.2.1            digest_0.6.39              
[47] stringi_1.8.7               purrr_1.2.1                
[49] reshape2_1.4.5              fastmap_1.2.0              
[51] grid_4.4.0                  colorspace_2.1-2           
[53] cli_3.6.5                   SparseArray_1.4.8          
[55] magrittr_2.0.4              patchwork_1.3.2            
[57] S4Arrays_1.4.1              dichromat_2.0-0.1          
[59] withr_3.0.2                 readr_2.1.6                
[61] scales_1.4.0                UCSC.utils_1.0.0           
[63] rmarkdown_2.30              pwalign_1.0.0              
[65] XVector_0.44.0              httr_1.4.7                 
[67] jpeg_0.1-11                 matrixStats_1.5.0          
[69] interp_1.1-6                otel_0.2.0                 
[71] hms_1.1.4                   png_0.1-8                  
[73] evaluate_1.0.5              knitr_1.51                 
[75] GenomicRanges_1.56.2        IRanges_2.38.1             
[77] rlang_1.1.7                 Rcpp_1.1.1                 
[79] glue_1.8.0                  DBI_1.2.3                  
[81] BiocGenerics_0.50.0         jsonlite_2.0.0             
[83] plyr_1.8.9                  R6_2.6.1                   
[85] MatrixGenerics_1.16.0       GenomicAlignments_1.40.0   
[87] zlibbioc_1.50.0